In [47]:
import pandas as pd
import numpy as np

In [48]:
df = pd.read_csv('/kaggle/input/classification-ml-tasks-3/bank/bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# > **Q1**

In [49]:
# Checking for missing values in the selected features
missing_values = df.isnull().sum()

# Finding the most frequent observation for the column 'education'
education_mode = df['education'].mode()[0]

# Display the missing values check and the mode of the 'education' column
print(missing_values)
print(education_mode)

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
secondary


# Q2

In [50]:
# Selecting only the numerical columns from the dataset
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Creating the correlation matrix
correlation_matrix = df[numerical_columns].corr()

# Finding the pair of features with the highest correlation (excluding diagonal)
max_corr_pair = correlation_matrix.unstack().sort_values(ascending=False)

# Remove self-correlations (correlation of a feature with itself)
max_corr_pair = max_corr_pair[max_corr_pair < 1].reset_index()

# Get the two features with the highest correlation
max_corr_features = max_corr_pair.iloc[0]

print(correlation_matrix)
print(max_corr_features)


               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000
level_0    previous
level_1       pdays
0           0.45482
Name: 0, dtype: object


# *Target encoding*

In [64]:
from sklearn.model_selection import train_test_split

# Split the dataset

In [65]:
# Prepare the features and target variable
X = df.drop(columns=['y'])
y = df['y'].apply(lambda x: 1 if x == 'yes' else 0)  # Convert target to binary (0/1)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=[np.number]).columns

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [66]:
print((X_train.shape, X_val.shape, X_test.shape))
print((y_train.shape, y_val.shape, y_test.shape))

((36168, 16), (9043, 16), (9043, 16))
((36168,), (9043,), (9043,))


# Q3

In [67]:
from sklearn.feature_selection import mutual_info_classif

In [68]:
# Define a function to calculate mutual information score between y and a categorical feature
def mutual_info_churn_score(series):
    return mutual_info_score(series, y_train)

# Apply the function to each categorical column in the training set
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
mi_scores = X_train[categorical_columns].apply(mutual_info_churn_score)

# Round the scores to 2 decimal places and sort them
mi_scores_rounded = mi_scores.round(2).sort_values(ascending=False)

# Display the mutual information scores for the selected features
selected_features = ['contact', 'education', 'housing', 'poutcome']
print(mi_scores_rounded[selected_features])

contact      0.01
education    0.00
housing      0.01
poutcome     0.03
dtype: float64


# Q4

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [71]:
# Preprocessing pipeline: OneHotEncode categorical columns, pass numerical columns through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Define the logistic regression model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Create a pipeline with preprocessing and logistic regression
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = pipeline.predict(X_val)

# Calculate the accuracy and round it to 2 decimal places
accuracy = round(accuracy_score(y_val, y_pred), 2)

In [72]:
print(f'Validation Accuracy: {accuracy}')

Validation Accuracy: 0.9


# Q5

In [84]:
# Initialize a dictionary to store the accuracy without each feature
feature_differences = {}

# Iterate over each feature to exclude one at a time
for feature in ['age', 'balance', 'marital', 'previous']:
    # Drop the feature from X
    X_train_dropped = X_train.drop(columns=[feature])
    X_val_dropped = X_val.drop(columns=[feature])

    # Identify new categorical and numerical columns after dropping the feature
    categorical_cols_dropped = X_train_dropped.select_dtypes(include=['object']).columns
    numerical_cols_dropped = X_train_dropped.select_dtypes(include=[np.number]).columns

    # Update the preprocessing pipeline to exclude the dropped feature
    preprocessor_dropped = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numerical_cols_dropped),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_dropped)
        ]
    )

In [85]:
# Create a new pipeline with the updated preprocessing
pipeline_dropped = Pipeline(steps=[('preprocessor', preprocessor_dropped), ('model', model)])

In [86]:
# Train the model without the current feature
pipeline_dropped.fit(X_train_dropped, y_train)

In [87]:
# Make predictions and calculate accuracy without the feature

y_pred_dropped = pipeline_dropped.predict(X_val_dropped)
dropped_accuracy = accuracy_score(y_val, y_pred_dropped)

In [88]:
# Calculate the difference in accuracy
accuracy_difference = original_accuracy - dropped_accuracy
feature_differences[feature] = accuracy_difference

In [89]:
for feature, difference in feature_differences.items():
    print(f"Difference in accuracy without {feature}: {difference}")

Difference in accuracy without previous: -0.00033174831361271195


In [91]:
# Print the differences in accuracy for each feature
print("Accuracy differences for each feature:")
for feature, difference in feature_differences.items():
    print(f"Difference in accuracy without {feature}: {difference}")

# Find the feature with the smallest difference is accuracy
least_useful_feature = min(feature_differences, key=feature_differences.get)
smallest_difference = feature_differences[least_useful_feature]

print(f"\nThe least useful feature is '{least_useful_feature}' with a difference of {smallest_difference}.")

Accuracy differences for each feature:
Difference in accuracy without previous: -0.00033174831361271195

The least useful feature is 'previous' with a difference of -0.00033174831361271195.


# Q6

In [92]:
from sklearn.compose import ColumnTransformer

In [93]:
C_values = [0.01, 0.1, 1, 10, 100]
accuracy_dict = {}

In [94]:
# Iterate over each value of C, train the model, and calculate accuracy
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred), 3)
    accuracy_dict[C] = accuracy

In [95]:
best_C = max(accuracy_dict, key=lambda x: (accuracy_dict[x], -x))
best_accuracy = accuracy_dict[best_C]

In [96]:
print(f"Best value of C: {best_C}")
print(f"Accuracy with best C: {best_accuracy}")

Best value of C: 0.1
Accuracy with best C: 0.899
