In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from IPython.display import display

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [6]:
data = pd.read_csv('../data/bank-full.csv', sep=';')

In [7]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
df = data[[
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y'
]]

In [9]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [11]:
# Check if the missing values are presented in the features.


In [12]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Question 1
What is the most frequent observation (mode) for the column education?

* unknown
* primary
* secondary
* tertiary

In [13]:
df.education.mode()

0    secondary
Name: education, dtype: object

In [14]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

R/ secondary

## Q2 Create the correlation matrix for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

* age and balance
* day and campaign
* day and pdays
* pdays and previous

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [18]:
numerical_variables = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous'
    ]

In [19]:
df_numerical = df[numerical_variables]
df_numerical.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0


In [26]:
corr_matrix = round(df_numerical.corr(), 2)
print(corr_matrix)

           age  balance   day  duration  campaign  pdays  previous
age       1.00     0.10 -0.01     -0.00      0.00  -0.02      0.00
balance   0.10     1.00  0.00      0.02     -0.01   0.00      0.02
day      -0.01     0.00  1.00     -0.03      0.16  -0.09     -0.05
duration -0.00     0.02 -0.03      1.00     -0.08  -0.00      0.00
campaign  0.00    -0.01  0.16     -0.08      1.00  -0.09     -0.03
pdays    -0.02     0.00 -0.09     -0.00     -0.09   1.00      0.45
previous  0.00     0.02 -0.05      0.00     -0.03   0.45      1.00


In [29]:
def max_correlation(df):
    # Calculamos la matriz de correlación
    corr_matrix = df.corr().abs()
    
    # Creamos un DataFrame del triángulo superior de la matriz, excluyendo la diagonal
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Encontramos el valor máximo
    max_corr = upper_tri.max().max()
    
    # Encontramos las variables correspondientes
    max_corr_vars = upper_tri.stack().idxmax()
    
    return max_corr, max_corr_vars

In [30]:
# Uso de la función
max_corr, (var1, var2) = max_correlation(df_numerical)

print(f"La correlación más alta es {max_corr:.2f} entre las variables {var1} y {var2}")

La correlación más alta es 0.45 entre las variables pdays y previous


**Target encoding**

* Now we want to encode the y variable.
* Let's replace the values yes/no with 1/0.

**Split the data**
* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value y is not in your dataframe.

In [31]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [32]:
df.y = (df.y == 'yes').astype(int)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.y = (df.y == 'yes').astype(int)


Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [34]:
df.y.value_counts(normalize=True)

y
0    0.883015
1    0.116985
Name: proportion, dtype: float64

In [35]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)


In [36]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [37]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [38]:
del df_train['y']
del df_val['y']
del df_test['y']

In [39]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

## Question 3

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

* contact
* education
* housing
* poutcome

In [44]:
df_full_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36168 entries, 3344 to 15795
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        36168 non-null  int64 
 1   job        36168 non-null  object
 2   marital    36168 non-null  object
 3   education  36168 non-null  object
 4   balance    36168 non-null  int64 
 5   housing    36168 non-null  object
 6   contact    36168 non-null  object
 7   day        36168 non-null  int64 
 8   month      36168 non-null  object
 9   duration   36168 non-null  int64 
 10  campaign   36168 non-null  int64 
 11  pdays      36168 non-null  int64 
 12  previous   36168 non-null  int64 
 13  poutcome   36168 non-null  object
 14  y          36168 non-null  int64 
dtypes: int64(8), object(7)
memory usage: 4.4+ MB


In [45]:
categorical_variables = df_full_train.select_dtypes(include=['object']).columns.tolist()
categorical_variables

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [47]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.y)


mutual_information_scores = df_full_train[categorical_variables].apply(mutual_info_churn_score)
round(mutual_information_scores.sort_values(ascending=False),2)

poutcome     0.03
month        0.02
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

R/ poutcome

## Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    * To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    * model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

* 0.6
* 0.7
* 0.8
* 0.9

In [48]:
train_dicts = df_train[categorical_variables + numerical_variables].to_dict(orient='records')

In [49]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

In [50]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [51]:
dv.transform(train_dicts)[0]

array([ 3.2e+01,  1.1e+03,  1.0e+00,  1.0e+00,  0.0e+00,  0.0e+00,
        1.1e+01,  6.7e+01,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00,
        0.0e+00,  1.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00,
        1.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00, -1.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00])

In [52]:
X_train = dv.fit_transform(train_dicts)
X_train[0]

array([ 3.2e+01,  1.1e+03,  1.0e+00,  1.0e+00,  0.0e+00,  0.0e+00,
        1.1e+01,  6.7e+01,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00,
        0.0e+00,  1.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00,
        1.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00,  0.0e+00, -1.0e+00,
        0.0e+00,  0.0e+00,  0.0e+00,  1.0e+00,  0.0e+00])

In [53]:
val_dicts = df_val[categorical_variables + numerical_variables].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [54]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [55]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.0123981 , 0.01011697, 0.15469084, ..., 0.05592089, 0.00905149,
       0.28450953])

In [56]:
churn_decision = (y_pred >= 0.5)

In [57]:
round((y_val == churn_decision).mean(),2)

0.9

R/ 0.9

## Question 5
* Let's find the least useful feature using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

* age
* balance
* marital
* previous

**Note: The difference doesn't have to be positive.**

In [65]:
def train_and_evaluate(X_train, X_val, y_train, y_val):
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    churn_decision = (y_pred >= 0.5)
    return round((y_val == churn_decision).mean(), 4)


In [66]:
# Lista completa de características
all_features = categorical_variables + numerical_variables

In [67]:
# Entrenar el modelo con todas las características
dv = DictVectorizer(sparse=False)
X_train_full = dv.fit_transform(train_dicts)
X_val_full = dv.transform(val_dicts)
accuracy_full = train_and_evaluate(X_train_full, X_val_full, y_train, y_val)

In [68]:
# Diccionario para almacenar las diferencias de precisión
accuracy_diff = {}

In [69]:
# Iterar sobre cada característica
for feature in all_features:
    # Crear una nueva lista de características excluyendo la característica actual
    features_subset = [f for f in all_features if f != feature]
    
    # Preparar los datos de entrenamiento y validación sin la característica
    train_dicts_subset = df_train[features_subset].to_dict(orient='records')
    val_dicts_subset = df_val[features_subset].to_dict(orient='records')
    
    # Vectorizar los datos
    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(train_dicts_subset)
    X_val_subset = dv_subset.transform(val_dicts_subset)
    
    # Entrenar y evaluar el modelo
    accuracy_subset = train_and_evaluate(X_train_subset, X_val_subset, y_train, y_val)
    
    # Calcular la diferencia de precisión
    accuracy_diff[feature] = accuracy_full - accuracy_subset

In [70]:
# Encontrar la característica con la menor diferencia absoluta
least_important_feature = min(accuracy_diff, key=lambda x: abs(accuracy_diff[x]))

print("Diferencias de precisión para cada característica:")
for feature, diff in accuracy_diff.items():
    print(f"{feature}: {diff:.4f}")

print(f"\nLa característica menos útil es: {least_important_feature}")
print(f"Con una diferencia de precisión de: {accuracy_diff[least_important_feature]:.4f}")

Diferencias de precisión para cada característica:
job: -0.0004
marital: -0.0007
education: -0.0002
housing: -0.0003
contact: 0.0007
month: 0.0011
poutcome: 0.0072
age: -0.0002
balance: 0.0000
day: -0.0004
duration: 0.0121
campaign: 0.0002
pdays: -0.0005
previous: -0.0007

La característica menos útil es: balance
Con una diferencia de precisión de: 0.0000


## Question 6
* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

* 0.01
* 0.1
* 1
* 10
* 100

**Note: If there are multiple options, select the smallest C.**

In [71]:
def train_and_evaluate(X_train, X_val, y_train, y_val, C):
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    churn_decision = (y_pred >= 0.5)
    return round((y_val == churn_decision).mean(), 3)

In [72]:
# Valores de C a probar
C_values = [0.01, 0.1, 1, 10, 100]

In [73]:
# Diccionario para almacenar las precisiones
accuracies = {}

In [74]:
# Entrenar y evaluar modelos para cada valor de C
for C in C_values:
    accuracy = train_and_evaluate(X_train, X_val, y_train, y_val, C)
    accuracies[C] = accuracy
    print(f"C = {C}: Accuracy = {accuracy}")

C = 0.01: Accuracy = 0.899
C = 0.1: Accuracy = 0.9
C = 1: Accuracy = 0.901
C = 10: Accuracy = 0.901
C = 100: Accuracy = 0.901


In [75]:
# Encontrar el mejor C
best_C = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_C]

print(f"\nEl mejor valor de C es {best_C} con una precisión de {best_accuracy}")

# Verificar si hay múltiples C con la misma precisión máxima
best_C_values = [C for C, acc in accuracies.items() if acc == best_accuracy]
if len(best_C_values) > 1:
    best_C = min(best_C_values)
    print(f"Múltiples valores de C tienen la misma precisión máxima. El C más pequeño es {best_C}")


El mejor valor de C es 1 con una precisión de 0.901
Múltiples valores de C tienen la misma precisión máxima. El C más pequeño es 1
