In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('C:\\Users\\Muhammad Elbaklishy\\Downloads\\Bank.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
df = df[['age', 'job', 'marital', 'education', 'balance', 'housing', 
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 
    'previous', 'poutcome', 'y']]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
missing_values = df.isnull().sum()
print(missing_values)

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [5]:
print( df['education'].mode())

0    secondary
Name: education, dtype: object


In [6]:
# Select the numerical features
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Compute the correlation matrix
correlation_matrix = df[numerical_features].corr()

# Display the correlation matrix
print(correlation_matrix)

# Find the two features with the highest correlation
correlated_features = correlation_matrix.unstack().sort_values(ascending=False)
# Remove self-correlation
correlated_features = correlated_features[correlated_features != 1].reset_index()

print(correlated_features.head(1))  # Top 2 correlated features

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
    level_0 level_1        0
0  previous   pdays  0.45482


In [7]:
(df.y == 'yes').astype(int).head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int32

In [8]:
from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [9]:
df_train, df_val = train_test_split(df_train_full, test_size = 0.25, random_state = 42)

len(df_train) , len(df_val) , len(df_test)

(27126, 9042, 9043)

In [10]:
y_train = df_train.y.values
y_val = df_val.y.values

del df_train['y']
del df_val['y']

In [11]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# One-hot encode categorical variables in the training set
df_train_encoded = pd.get_dummies(df_train, drop_first=True)

# Compute mutual information between target 'y' and categorical variables
mi_scores = mutual_info_classif(df_train_encoded, y_train)

# Create a DataFrame with scores
mi_df = pd.DataFrame({'feature': df_train_encoded.columns, 'mi_score': mi_scores})

# Sort and display the features by mutual information score
mi_df_sorted = mi_df.sort_values(by='mi_score', ascending=False)
print(mi_df_sorted.head())

             feature  mi_score
3           duration  0.068588
38  poutcome_success  0.029842
5              pdays  0.024239
39  poutcome_unknown  0.018604
1            balance  0.016557


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic regression model using one-hot encoding for categorical variables
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(df_train_encoded, y_train)

# One-hot encode validation data
X_val_encoded = pd.get_dummies(df_val, drop_first=True)

# Predict on the validation set and calculate accuracy
y_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {round(accuracy, 2)}")

Validation Accuracy: 0.9


In [17]:
# Features to evaluate
features_to_evaluate = ['age', 'balance', 'marital', 'previous']

# Store the original accuracy
original_accuracy = accuracy

# Dictionary to store accuracy differences
feature_importances = {}

for feature in features_to_evaluate:
    # If the feature was one-hot encoded, drop all its related columns
    if feature in ['marital']:  # Add any other categorical features that were one-hot encoded if necessary
        columns_to_drop = [col for col in df_train_encoded.columns if feature in col]
    else:
        columns_to_drop = [feature]

    # Drop the selected feature(s) from both train and validation sets
    X_train_temp = df_train_encoded.drop(columns_to_drop, axis=1)
    X_val_temp = X_val_encoded.drop(columns_to_drop, axis=1)

    # Train the model without the feature(s)
    model.fit(X_train_temp, y_train)
    
    # Predict and calculate accuracy
    y_pred_temp = model.predict(X_val_temp)
    accuracy_temp = accuracy_score(y_val, y_pred_temp)
    
    # Calculate the difference in accuracy
    feature_importances[feature] = original_accuracy - accuracy_temp

# Find the feature with the smallest difference in accuracy
least_useful_feature = min(feature_importances, key=feature_importances.get)
print(f"The least useful feature is: {least_useful_feature}")
print(f" ")

# Print the differences for better visibility
for feature, diff in feature_importances.items():
    print(f"Feature: {feature}, Accuracy difference: {diff}")

The least useful feature is: age
 
Feature: age, Accuracy difference: -0.0013271400132713884
Feature: balance, Accuracy difference: -0.0009953550099535136
Feature: marital, Accuracy difference: -0.0008847600088475183
Feature: previous, Accuracy difference: -0.000774165007741634


In [15]:
# Try different values for C
C_values = [0.01, 0.1, 1, 10, 100]
best_accuracy = 0
best_C = None

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(df_train_encoded, y_train)
    
    y_pred = model.predict(X_val_encoded)
    accuracy = accuracy_score(y_val, y_pred)
    
    print(f"C = {C}, Validation Accuracy: {round(accuracy, 3)}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_C = C

print(f"Best C value: {best_C} with accuracy {round(best_accuracy, 3)}")

C = 0.01, Validation Accuracy: 0.898
C = 0.1, Validation Accuracy: 0.901
C = 1, Validation Accuracy: 0.901
C = 10, Validation Accuracy: 0.901
C = 100, Validation Accuracy: 0.9
Best C value: 10 with accuracy 0.901
