In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [2]:
data = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'
!wget $data
!unzip bank+marketing.zip

--2024-10-14 19:10:08--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [        <=>         ] 999.85K   538KB/s    in 1.9s    

2024-10-14 19:10:11 (538 KB/s) - ‘bank+marketing.zip’ saved [1023843]

Archive:  bank+marketing.zip
 extracting: bank.zip                
 extracting: bank-additional.zip     


In [3]:
!unzip bank.zip
# !unzip bank-additional.zip

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                


In [4]:
df = pd.read_csv('bank-full.csv', sep=';')
df.head(2)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no


In [5]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [6]:
base = [
    'age', 'job', 'marital', 'education', 'balance', 'housing','contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

numerical = [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'
]

categorical = [
    'job', 'marital', 'education', 'housing','contact', 'month', 'poutcome'
]

In [7]:
df = df[base]
df.columns[df.isna().any()]

Index([], dtype='object')

1. Mode for 'education'

In [8]:
df['education'].mode()

0    secondary
Name: education, dtype: object

2. Correlation matrix for the numerical features of your dataset

In [9]:
"""
for feature1 in df[numerical]:
    for feature2 in df[numerical]:
        if feature1 != feature2:
            print(feature1, feature2)
            print(df[[feature1]].corrwith(df[[feature2]]))
"""

corr_matrix = df[numerical].corr()
corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


Two features with the biggest correlation

In [10]:
corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
corr_pairs = corr_pairs[corr_pairs < 1]
print(corr_pairs[:2])

previous  pdays       0.45482
pdays     previous    0.45482
dtype: float64


Encode the y variable

In [11]:
df['y'] = (df['y'] == 'yes').astype(int)
df.head(2)

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0


Split the data

In [12]:
print(len(df))

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

print(len(df_train), len(df_val), len(df_test))

45211
28934 7234 9043


In [13]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['y'].values
y_val = df_val['y'].values
y_test = df_test['y'].values

del df_train['y']
del df_val['y']
del df_test['y']

3. Mutual information score between y and other categorical variables

In [14]:
def mutual_info_cat_score(series):
    return round(mutual_info_score(series, y_train), 2)

mi_score = df_train[categorical].apply(mutual_info_cat_score)
print(mi_score.sort_values(ascending=False))

poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64


4. 

One-hot encoding

In [15]:
df_train[categorical].iloc[:2]

Unnamed: 0,job,marital,education,housing,contact,month,poutcome
0,management,married,tertiary,yes,cellular,nov,unknown
1,retired,married,primary,yes,telephone,aug,unknown


In [16]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

Accuracy on the validation dataset

In [17]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [18]:
y_val_pred = model.predict(X_val)
print(round((y_val_pred == y_val).mean(), 2))

0.9


5. The least useful feature using the feature elimination technique

In [19]:
rfe = RFE(estimator=model, n_features_to_select=X_train.shape[1]-1)

rfe.fit(X_train, y_train)
ranking = rfe.ranking_

least_useful_feature_index = np.argmax(ranking)
least_useful_feature_name = dv.get_feature_names_out()[least_useful_feature_index]

print(least_useful_feature_name)

balance


6. C with the best accuracy on the validation set

In [20]:
print("C\tAccuracy")

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    
    train_dict = df_train.to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    val_dict = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model.fit(X_train, y_train)
    
    y_val_pred = model.predict(X_val)
    print(c,"\t", round((y_val_pred == y_val).mean(), 3))

C	Accuracy
0.01 	 0.898
0.1 	 0.902
1 	 0.901
10 	 0.902
100 	 0.901
