In [105]:
import pandas as pd
import numpy as np
import urllib.request
import zipfile
import urllib.request
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score

In [106]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-16 22:15:36--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: 'bank+marketing.zip.4'

     0K .......... .......... .......... .......... ..........  294K
    50K .......... .......... .......... .......... ..........  592K
   100K .......... .......... .......... .......... .......... 6.72M
   150K .......... .......... .......... .......... .......... 1.14M
   200K .......... .......... .......... .......... .......... 1.19M
   250K .......... .......... .......... .......... .......... 5.34M
   300K .......... .......... .......... .......... .......... 3.95M
   350K .......... .......... .......... .......... .......... 4.72M
   400K .......... .......... .......... .......... ..........  929K
   450K .......... ........

In [107]:
zip_file_path = 'bank+marketing.zip'

In [108]:
with zipfile.ZipFile('bank+marketing.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# List the extracted files and directories
print(os.listdir('.'))

['.idea', '.ipynb_checkpoints', 'bank+marketing.zip', 'bank+marketing.zip.1', 'bank+marketing.zip.2', 'bank+marketing.zip.3', 'bank+marketing.zip.4', 'bank-additional.zip', 'bank-full.csv', 'bank-names.txt', 'bank.csv', 'bank.zip', 'bank_marketing.zip', 'laptops.csv', 'S01.ipynb', 'S02-1.ipynb', 'S02.ipynb', 'S03.ipynb', 'S04.ipynb']


In [109]:
df = pd.read_csv('bank-full.csv', delimiter=';')

In [110]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [111]:
columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 
           'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 
           'previous', 'poutcome', 'y']

In [112]:
df_selected = df[columns]

In [113]:
df_selected.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [114]:
missing_values = df_selected.isnull().sum()

In [115]:
print(missing_values)

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [116]:
education_mode = df_selected['education'].mode()[0]
education_mode

'secondary'

In [117]:
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [118]:
correlation_matrix = df[numerical_features].corr()

In [119]:
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [120]:
max_corr = correlation_matrix.abs().unstack().sort_values(ascending=False)
max_corr = max_corr[max_corr < 1]  # Exclude self-correlation
top_pair = max_corr.idxmax(), max_corr.max()
top_pair

(('pdays', 'previous'), np.float64(0.4548196354805043))

In [121]:
df['y'] = df['y'].map({'yes': 1, 'no': 0})

In [122]:
X = df.drop('y', axis=1)
y = df['y']    

In [123]:
from sklearn.model_selection import train_test_split

In [124]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((27126, 16), (9042, 16), (9043, 16), (27126,), (9042,), (9043,))

### Q3

In [125]:
from sklearn.feature_selection import mutual_info_classif

In [126]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day', 'poutcome']

In [127]:
X_train_categorical = X_train[categorical_columns]

X_train_encoded = pd.get_dummies(X_train_categorical)

mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features=True)

In [128]:
mi_scores_df = pd.DataFrame({
    'Variable': X_train_encoded.columns,
    'Mutual Information Score': mi_scores
})

In [129]:
mi_scores_grouped = mi_scores_df.groupby(mi_scores_df['Variable'].str.split('_').str[0]).max()

mi_scores_grouped['Mutual Information Score'] = mi_scores_grouped['Mutual Information Score'].round(2)

mi_scores_grouped.loc[['contact', 'education', 'housing', 'poutcome']].sort_values(by='Mutual Information Score', ascending=False)

Unnamed: 0_level_0,Variable,Mutual Information Score
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1
poutcome,poutcome_unknown,0.03
contact,contact_unknown,0.01
housing,housing_yes,0.01
education,education_unknown,0.0


### Q4

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [131]:
X_train_encoded = pd.get_dummies(X_train)
X_val_encoded = pd.get_dummies(X_val)

In [132]:
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

In [133]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [134]:
model.fit(X_train_encoded, y_train)

In [135]:
y_pred = model.predict(X_val_encoded)

In [136]:
accuracy = accuracy_score(y_val, y_pred)

In [137]:
accuracy_rounded = round(accuracy, 2)
accuracy_rounded

0.9

### Q5

In [138]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [139]:
# Perform one-hot encoding on the categorical variables in both training and validation datasets
X_train_encoded = pd.get_dummies(X_train)
X_val_encoded = pd.get_dummies(X_val)

# Ensure that the validation set has the same columns as the training set (align columns)
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Define the Logistic Regression model with the specified parameters
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Fit the model on the training data
model.fit(X_train_encoded, y_train)

# Predict on the validation set
y_pred = model.predict(X_val_encoded)

# Calculate the accuracy score on the validation set
accuracy = accuracy_score(y_val, y_pred)

# Round the accuracy to 2 decimal places
original_accuracy = round(accuracy, 2)
print("Original Accuracy: ", original_accuracy)

Original Accuracy:  0.9


In [140]:
features_to_test = ['age', 'balance', 'marital', 'previous']
accuracy_differences = {}

In [141]:
for feature in features_to_test:
    # Identify all columns related to the feature (those that start with the feature name)
    cols_to_drop = [col for col in X_train_encoded.columns if col.startswith(feature)]
    
    # Remove the feature-related columns from the training and validation sets
    X_train_reduced = X_train_encoded.drop(cols_to_drop, axis=1)
    X_val_reduced = X_val_encoded.drop(cols_to_drop, axis=1)
    
    # Train a new model without the feature
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)
    
    # Predict on the validation set without the feature
    y_pred_reduced = model_reduced.predict(X_val_reduced)
    
    # Calculate the accuracy without the feature
    accuracy_reduced = accuracy_score(y_val, y_pred_reduced)
    
    # Calculate the difference in accuracy
    accuracy_diff = round(original_accuracy - accuracy_reduced, 2)
    
    # Store the difference for this feature
    accuracy_differences[feature] = accuracy_diff

In [142]:
accuracy_differences

{'age': -0.0, 'balance': -0.0, 'marital': -0.0, 'previous': -0.0}

### Q6

In [143]:
# List of regularization parameters to try
C_values = [0.01, 0.1, 1, 10, 100]

# Initialize a dictionary to store the accuracy for each value of C
C_accuracy = {}

In [144]:
# Loop over the C values and train a model for each one
for C in C_values:
    # Define the Logistic Regression model with the specified C value
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    
    # Fit the model on the training data
    model.fit(X_train_encoded, y_train)
    
    # Predict on the validation set
    y_pred = model.predict(X_val_encoded)
    
    # Calculate the accuracy score on the validation set
    accuracy = accuracy_score(y_val, y_pred)
    
    # Round the accuracy to 3 decimal places
    accuracy_rounded = round(accuracy, 3)
    
    # Store the accuracy for this value of C
    C_accuracy[C] = accuracy_rounded


In [145]:
# Find the best value of C (smallest C with the highest accuracy)
best_C = sorted(C_accuracy.items(), key=lambda x: (-x[1], x[0]))[0]

best_C

(0.1, 0.9)