In [271]:
import pandas as pd
import numpy as np

In [42]:
data = 'bank-full.csv'

df = pd.read_csv(data, delimiter=';')

In [48]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


# Data preparation

In [64]:
features = df.drop('y', axis=1)
features.isnull().sum().any()

False

## Question 1
#### What is the most frequent observation (mode) for the column education?

In [76]:
education = df['education']

most_frequent_education = education.mode()[0]
most_frequent_education

'secondary'

In [86]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


## Question 2

### Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

In [242]:
categorials_columns = list(df.dtypes[df.dtypes == 'object'].index)
categorials_columns

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [114]:
features_columns = df.drop(categorials_columns, axis=1)
features_columns

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [126]:
correlation_matrix = features_columns.corr()
correlation_pairs = correlation_matrix.unstack().sort_values(ascending=False)
top_correlation = correlation_pairs[correlation_pairs < 1].head(2)
top_correlation

previous  pdays       0.45482
pdays     previous    0.45482
dtype: float64

In [244]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression



## Target encoding

In [246]:
le = LabelEncoder()
df['y'] = le.fit_transform(df['y'])

In [148]:
X = df.drop(columns=['y'])
y = df['y']

## Split the data
#### <li>Split your data in train/val/test sets with 60%/20%/20% distribution.</li>#### <li>
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42</li>.#### <li>
Make sure that the target value y is not in your datafram</li>e.

In [154]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

In [156]:
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

## Question 3

### <li> Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.</li>
### <li> 
Round the scores to 2 decimals using round(score, 2)</li>.


In [168]:
categorical_cols = X_train.select_dtypes(include=['object']).columns
categorical_cols

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [174]:
X_train_encoded = X_train.copy()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le

In [180]:
mi_scores = mutual_info_classif(X_train_encoded[categorical_cols], y_train, discrete_features=True)

In [182]:
mi_df = pd.DataFrame({'Feature': categorical_cols, 'Mutual Information Score': mi_scores})


In [184]:
mi_df

Unnamed: 0,Feature,Mutual Information Score
0,job,0.00791
1,marital,0.002201
2,education,0.00257
3,default,0.000263
4,housing,0.009464
5,loan,0.002492
6,contact,0.014214
7,month,0.02478
8,poutcome,0.029403


## Question 4

In [196]:
encoder = OneHotEncoder(drop='first', sparse_output=False)  

In [198]:
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_val_encoded = encoder.transform(X_val[categorical_cols])

In [200]:
X_train_ohe = pd.DataFrame(X_train_encoded, index=X_train.index, columns=encoder.get_feature_names_out(categorical_cols))
X_val_ohe = pd.DataFrame(X_val_encoded, index=X_val.index, columns=encoder.get_feature_names_out(categorical_cols))

In [202]:
X_train_final = pd.concat([X_train.drop(columns=categorical_cols).reset_index(drop=True), X_train_ohe.reset_index(drop=True)], axis=1)
X_val_final = pd.concat([X_val.drop(columns=categorical_cols).reset_index(drop=True), X_val_ohe.reset_index(drop=True)], axis=1)

In [208]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_final, y_train)

In [212]:
y_pred_val = model.predict(X_val_final)
accuracy_val = accuracy_score(y_val, y_pred_val)

accuracy_val_rounded = round(accuracy_val, 2)

print(f'Validation accuracy: {accuracy_val_rounded}')

Validation accuracy: 0.9


In [216]:
all_features = list(X_train_final.columns) + list(X_train_ohe.columns)

## Question 5

In [257]:
accuracy_diffs = {}

for feature in all_features:
    X_train_reduced = X_train_final.drop(columns=[feature])
    X_val_reduced = X_val_final.drop(columns=[feature])

    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    y_pred_val_reduced = model_reduced.predict(X_val_reduced)
    accuracy_reduced = accuracy_score(y_val, y_pred_val_reduced)

    accuracy_diff = accuracy_val - accuracy_reduced
    accuracy_diffs[feature] = accuracy_diff

    print(f'Accuracy without {feature}: {accuracy_reduced} (Difference: {accuracy_diff})')
    


Accuracy without age: 0.9006856890068569 (Difference: -0.00033178500331787486)
Accuracy without balance: 0.9002433090024331 (Difference: 0.00011059500110588427)
Accuracy without day: 0.900353904003539 (Difference: 0.0)
Accuracy without duration: 0.893607608936076 (Difference: 0.006746295067462937)
Accuracy without campaign: 0.9007962840079629 (Difference: -0.00044238000442387015)
Accuracy without pdays: 0.9001327140013271 (Difference: 0.00022119000221187957)
Accuracy without previous: 0.9002433090024331 (Difference: 0.00011059500110588427)
Accuracy without job_blue-collar: 0.900353904003539 (Difference: 0.0)
Accuracy without job_entrepreneur: 0.900353904003539 (Difference: 0.0)
Accuracy without job_housemaid: 0.9002433090024331 (Difference: 0.00011059500110588427)
Accuracy without job_management: 0.9001327140013271 (Difference: 0.00022119000221187957)
Accuracy without job_retired: 0.9000221190002212 (Difference: 0.00033178500331776384)
Accuracy without job_self-employed: 0.900575094005

In [224]:
least_useful_feature = min(accuracy_diffs, key=accuracy_diffs.get)
print(f'Least useful feature: {least_useful_feature}, Accuracy difference: {accuracy_diffs[least_useful_feature]}')

Least useful feature: month_oct, Accuracy difference: -0.0012165450121655041


In [226]:
smallest_difference_feature = min(accuracy_diffs, key=lambda k: abs(accuracy_diffs[k]))
smallest_difference = accuracy_diffs[smallest_difference_feature]

print(f"Feature with the smallest difference: {smallest_difference_feature} (Difference: {smallest_difference})")


Feature with the smallest difference: day (Difference: 0.0)


## Question 6

In [261]:
C_values = [0.01, 0.1, 1, 10, 100]

accuracy_results = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_final, y_train)

    y_pred_val = model.predict(X_val_final)
    accuracy = accuracy_score(y_val, y_pred_val)

    accuracy_rounded = round(accuracy, 3)
    
    accuracy_results[C] = accuracy_rounded

In [263]:
for C, accuracy in accuracy_results.items():
    print(f'Accuracy with C={C}: {accuracy}')

Accuracy with C=0.01: 0.897
Accuracy with C=0.1: 0.899
Accuracy with C=1: 0.9
Accuracy with C=10: 0.9
Accuracy with C=100: 0.901


In [265]:
best_C = min(accuracy_results, key=accuracy_results.get)
best_accuracy = accuracy_results[best_C]

In [269]:
print(f'The best C is: {best_C} with an accuracy of: {best_accuracy}')

The best C is: 0.01 with an accuracy of: 0.897
