# Explore here

In [2]:
# Your code here
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import chi2, SelectKBest

# Load the campaign table from the remote CSV; fields in this file are separated by ';'.
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/logistic-regression-project-tutorial/main/bank-marketing-campaign-data.csv'

df = pd.read_csv(DATA_URL, sep=';')



In [3]:
# (rows, columns) of the table as loaded.
df.shape


(41188, 21)

In [4]:

# Remove fully duplicated rows and renumber the remaining rows 0..n-1,
# then show the new (rows, columns) so the effect is visible.
df = df.drop_duplicates(ignore_index=True)

df.shape

(41176, 21)

In [5]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [6]:
# Count missing values per column (isna is the same check as isnull).

df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [7]:
# Count the number of distinct values in each column.

df.nunique()


age                 78
job                 12
marital              4
education            8
default              3
housing              3
loan                 3
contact              2
month               10
day_of_week          5
duration          1544
campaign            42
pdays               27
previous             8
poutcome             3
emp.var.rate        10
cons.price.idx      26
cons.conf.idx       26
euribor3m          316
nr.employed         11
y                    2
dtype: int64

In [8]:
# Data type of each column; the 'object' columns are the ones that need encoding.
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [9]:
# Convert the non-numeric columns into integer codes.

encoder = LabelEncoder()

# New column name -> source column. Insertion order is the order in which the
# new columns are appended to df, so it must not be rearranged.
label_encoded = {
    'job_int': 'job',
    'marital_int': 'marital',
    'educiation_in': 'education',  # spelling kept as-is so the resulting column name is unchanged
    'default_int': 'default',
    'housing_int': 'housing',
    'loan_int': 'loan',
    'contact_int': 'contact',
    'poutcome_int': 'poutcome',
    'y_int': 'y',
}

# The single encoder is refitted for every column, so once the loop finishes
# it only remembers the classes of the last column ('y').
for target, source in label_encoded.items():
    df[target] = encoder.fit_transform(df[source])


# Months and weekdays are mapped by hand so that the integer codes follow
# calendar order (not strictly required, but the author wanted to try it).

month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
months = {name: position for position, name in enumerate(month_order, start=1)}

df['month_int'] = df['month'].map(months)

day_order = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
days = {name: position for position, name in enumerate(day_order, start=1)}

df['day_of_week_int'] = df['day_of_week'].map(days)

In [10]:
# Build the all-numeric frame: discard the original text columns, keeping
# the numeric columns and the integer-encoded ones added to df.

text_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                'contact', 'month', 'day_of_week', 'poutcome', 'y']
df_int = df.drop(columns=text_columns)

df_int.dtypes



age                  int64
duration             int64
campaign             int64
pdays                int64
previous             int64
emp.var.rate       float64
cons.price.idx     float64
cons.conf.idx      float64
euribor3m          float64
nr.employed        float64
job_int              int64
marital_int          int64
educiation_in        int64
default_int          int64
housing_int          int64
loan_int             int64
contact_int          int64
poutcome_int         int64
y_int                int64
month_int            int64
day_of_week_int      int64
dtype: object

In [11]:
# Sanity check: confirm that 'no' was encoded as 0 and 'yes' as 1.
df_int.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,marital_int,educiation_in,default_int,housing_int,loan_int,contact_int,poutcome_int,y_int,month_int,day_of_week_int
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,0,1,1,0,5,1
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,3,1,0,0,1,1,0,5,1
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,3,0,2,0,1,1,0,5,1
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,1,0,0,0,1,1,0,5,1
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,3,0,0,2,1,1,0,5,1


In [12]:
# Apply min-max normalisation to every column of df_int.
# NOTE(review): the scaler is fitted on the whole of df_int, which still
# contains 'y_int' and has not yet been split into train/test, so the test
# rows contribute to the minima/maxima. Consider fitting on the training
# split only.

escalador = MinMaxScaler()

arr_norm = escalador.fit_transform(df_int)

arr_norm

array([[0.48148148, 0.05307035, 0.        , ..., 0.        , 0.22222222,
        0.        ],
       [0.49382716, 0.03029687, 0.        , ..., 0.        , 0.22222222,
        0.        ],
       [0.24691358, 0.04595364, 0.        , ..., 0.        , 0.22222222,
        0.        ],
       ...,
       [0.48148148, 0.03843026, 0.01818182, ..., 0.        , 0.88888889,
        1.        ],
       [0.33333333, 0.08987393, 0.        , ..., 1.        , 0.88888889,
        1.        ],
       [0.7037037 , 0.04859699, 0.03636364, ..., 0.        , 0.88888889,
        1.        ]], shape=(41176, 21))

In [13]:
# The scaler's transform produced a plain array; wrap it in a DataFrame that
# reuses df_int's row index and column labels.

df_norm = pd.DataFrame(data=arr_norm, columns=df_int.columns, index=df_int.index)

df_norm

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,marital_int,educiation_in,default_int,housing_int,loan_int,contact_int,poutcome_int,y_int,month_int,day_of_week_int
0,0.481481,0.053070,0.000000,1.0,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.333333,0.000000,0.0,0.0,0.0,1.0,0.5,0.0,0.222222,0.0
1,0.493827,0.030297,0.000000,1.0,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.333333,0.428571,0.5,0.0,0.0,1.0,0.5,0.0,0.222222,0.0
2,0.246914,0.045954,0.000000,1.0,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.333333,0.428571,0.0,1.0,0.0,1.0,0.5,0.0,0.222222,0.0
3,0.283951,0.030704,0.000000,1.0,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.333333,0.142857,0.0,0.0,0.0,1.0,0.5,0.0,0.222222,0.0
4,0.481481,0.062424,0.000000,1.0,0.000000,0.937500,0.698753,0.60251,0.957379,0.859735,...,0.333333,0.428571,0.0,0.0,1.0,1.0,0.5,0.0,0.222222,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41171,0.691358,0.067914,0.000000,1.0,0.000000,0.479167,1.000000,0.00000,0.089322,0.000000,...,0.333333,0.714286,0.0,1.0,0.0,0.0,0.5,1.0,0.888889,1.0
41172,0.358025,0.077877,0.000000,1.0,0.000000,0.479167,1.000000,0.00000,0.089322,0.000000,...,0.333333,0.714286,0.0,0.0,0.0,0.0,0.5,0.0,0.888889,1.0
41173,0.481481,0.038430,0.018182,1.0,0.000000,0.479167,1.000000,0.00000,0.089322,0.000000,...,0.333333,0.857143,0.0,1.0,0.0,0.0,0.5,0.0,0.888889,1.0
41174,0.333333,0.089874,0.000000,1.0,0.000000,0.479167,1.000000,0.00000,0.089322,0.000000,...,0.333333,0.714286,0.0,0.0,0.0,0.0,0.5,1.0,0.888889,1.0


In [14]:
# Separate the target ('y_int') from the predictors, then hold out 20 % of
# the rows as a test set (fixed seed so the split is reproducible).

y = df_norm['y_int']
X = df_norm.drop(columns='y_int')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [15]:
# Inspect the training predictors (the row labels are the shuffled original indices).
X_train

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_int,marital_int,educiation_in,default_int,housing_int,loan_int,contact_int,poutcome_int,month_int,day_of_week_int
12346,0.148148,0.058154,0.036364,1.0,0.000000,1.000000,0.669135,0.338912,0.980503,1.000000,0.181818,0.333333,0.142857,0.0,1.0,1.0,1.0,0.5,0.444444,1.00
8564,0.407407,0.036194,0.090909,1.0,0.000000,1.000000,0.882307,0.376569,0.958966,1.000000,0.000000,0.666667,0.857143,0.0,0.0,0.0,1.0,0.5,0.333333,0.50
21417,0.148148,0.020943,0.054545,1.0,0.000000,1.000000,0.484412,0.615063,0.981410,1.000000,0.000000,0.666667,0.857143,0.0,0.0,0.0,0.0,0.5,0.555556,0.25
3315,0.222222,0.027450,0.000000,1.0,0.000000,0.937500,0.698753,0.602510,0.958059,0.859735,0.000000,0.666667,0.857143,0.0,0.0,0.0,1.0,0.5,0.222222,0.75
33236,0.283951,0.084994,0.000000,1.0,0.000000,0.333333,0.269680,0.192469,0.148946,0.512287,0.090909,0.666667,0.428571,0.5,0.0,0.0,0.0,0.5,0.222222,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.209877,0.020333,0.018182,1.0,0.000000,0.937500,0.698753,0.602510,0.957379,0.859735,0.090909,0.333333,0.285714,0.0,0.0,1.0,1.0,0.5,0.222222,0.25
11284,0.148148,0.024400,0.000000,1.0,0.000000,1.000000,0.882307,0.376569,0.980957,1.000000,0.000000,0.333333,0.285714,0.0,1.0,0.0,1.0,0.5,0.333333,0.75
38158,0.666667,0.050834,0.018182,1.0,0.285714,0.000000,0.089634,1.000000,0.027205,0.203781,0.454545,0.333333,0.000000,0.0,0.0,0.0,0.0,0.0,0.777778,0.75
860,0.283951,0.059984,0.018182,1.0,0.000000,0.937500,0.698753,0.602510,0.957153,0.859735,0.363636,0.333333,0.857143,0.0,1.0,0.0,1.0,0.5,0.222222,0.50


In [16]:

# Screen the predictors: keep the 5 features with the highest chi-squared
# score against the target. The selector is fitted on the training split
# only and then applied to both splits.

modelo_cribado = SelectKBest(chi2, k=5)
modelo_cribado.fit(X_train, y_train)
bool_x = modelo_cribado.get_support()
selected_columns = X_train.columns[bool_x]

# The transformed data is wrapped back into DataFrames. The original row
# labels are passed in explicitly: without them the new frames would get a
# fresh 0..n-1 index, and the label assignments below (which align on index)
# would attach labels to the wrong rows or fill them with NaN.
X_train_sel = pd.DataFrame(
    modelo_cribado.transform(X_train),
    index=X_train.index,
    columns=selected_columns,
)
X_test_sel = pd.DataFrame(
    modelo_cribado.transform(X_test),
    index=X_test.index,
    columns=selected_columns,
)


# Not strictly needed later, but build the full post-selection train and test
# frames (selected features plus target) for inspection.

train_data = X_train_sel.copy()
train_data['y_int'] = y_train
test_data = X_test_sel.copy()
test_data['y_int'] = y_test

# Every row must have received its own label; a NaN here would mean the
# feature frame and the target series are no longer index-aligned.
assert train_data['y_int'].notna().all(), "train labels misaligned with features"
assert test_data['y_int'].notna().all(), "test labels misaligned with features"

# Aliases only, so that the naming matches X_train_sel / X_test_sel.

y_train_sel = y_train
y_test_sel = y_test

In [17]:
# Finally, create the model object from the LogisticRegression class
# (default settings) and fit it on the selected training features.

model = LogisticRegression()
model.fit(X_train_sel,y_train_sel)


In [18]:
# Generate the array of predicted labels for the test features.

y_pred = model.predict(X_test_sel)
y_pred

array([0., 0., 1., ..., 0., 0., 0.], shape=(8236,))

In [20]:
# Measure how closely the predictions match the true test labels.
# Original author's note: in their view the result is not bad at all.
# NOTE(review): accuracy on its own can flatter a model when one class
# dominates. The class balance of y_test is not shown here, so compare this
# figure against the majority-class share before drawing conclusions.

accuracy_score(y_test,y_pred)

0.8844099077221952