## 3.15 Homework 3: Machine Learning for Classification

In [671]:
# Import libraries
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression

from IPython.display import display # displaying certain variables on purpose

# Disable scientific notation
np.set_printoptions(suppress=True, precision=2)

# Allow plotting of all subplots in a jupyter notebook
%matplotlib inline

### Dataset

In this homework, we will use the Bank Marketing dataset.
Download it from https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

We need to take bank/bank-full.csv file from the downloaded zip-file.
In this dataset our desired target for classification task will be y variable - has the client subscribed a term deposit or not.


In [674]:
# Download and read the dataset
df_banks = pd.read_csv('bank-full.csv', delimiter=';')

### Features

For the rest of the homework, you'll need to use only these columns:

- age,
- job,
- marital,
- education,
- balance,
- housing,
- contact,
- day,
- month,
- duration,
- campaign,
- pdays,
- previous,
- poutcome,
- y

### Data preparation
- Select only the features from above.
- Check if the missing values are presented in the features.


In [678]:
# Check the column names
df_banks.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [680]:
# Select only the features from above
df_banks_features = df_banks[['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']]
df_banks_features

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [682]:
# Check if missing values are presented in the features
df_banks_features.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [684]:
# Answer: No missing values present

### Question 1

What is the most frequent observation (mode) for the column education?

- unknown
- primary
- secondary
- tertiary

In [687]:
education_mode = df_banks_features['education'].mode()
education_mode

0    secondary
Name: education, dtype: object

#### Question 1 - Answer: 'secondary'

### Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- age and balance
- day and campaign
- day and pdays
- pdays and previous

In [691]:
df_banks_features.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [693]:
# List of categorial variables
categorical = df_banks_features.select_dtypes(include='object').columns.tolist()

# List of numerical variables
numerical = df_banks_features.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical:", categorical)
print("Numerical:", numerical)

Categorical: ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'y']
Numerical: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [695]:
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
print("Categorical:", categorical)
print("Numerical:", numerical)

Categorical: ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
Numerical: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [697]:
# Calculate correlations for certain feature pairs
corr_age_balance = df_banks_features['age'].corr(df_banks_features['balance'])
corr_day_campaign = df_banks_features['day'].corr(df_banks_features['campaign'])
corr_day_pdays = df_banks_features['day'].corr(df_banks_features['pdays'])
corr_pdays_previous = df_banks_features['pdays'].corr(df_banks_features['previous'])

# Show correlation matrices
print("Correlation between age and balance:", corr_age_balance)
print("Correlation between day and campaign:", corr_day_campaign)
print("Correlation between day and pdays:", corr_day_pdays)
print("Correlation between pdays and previous:", corr_pdays_previous)

Correlation between age and balance: 0.09778273937134752
Correlation between day and campaign: 0.16249021632619282
Correlation between day and pdays: -0.09304407377294048
Correlation between pdays and previous: 0.45481963548050097


In [699]:
# Creating a list of tuples with feature pairs and their correlation values
correlations = [
    ("age and balance", corr_age_balance),
    ("day and campaign", corr_day_campaign),
    ("day and pdays", corr_day_pdays),
    ("pdays and previous", corr_pdays_previous)
]

# Sorting the list by the absolute values of the correlations in descending order
sorted_correlations = sorted(correlations, key=lambda x: abs(x[1]), reverse=True)

# Displaying the sorted list
sorted_correlations

# Display the two features with the highest correlation
sorted_correlations[0]

('pdays and previous', 0.45481963548050097)

In [701]:
# Compute the pairwise correlation between the features
correlation_matrix = df_banks_features[['age', 'balance', 'day', 'campaign', 'pdays', 'previous']].corr()

# Unstack the correlation matrix to a list of correlation pairs and sort by absolute values
correlation_pairs = correlation_matrix.unstack().abs().sort_values(kind="quicksort", ascending=False)

# Remove correlations of variables with themselves (which are always 1)
correlation_pairs = correlation_pairs[correlation_pairs < 1]

# Display the two features with the highest correlation
print(correlation_pairs.head(1))

previous  pdays    0.45482
dtype: float64


#### Question 2 - Answer: 'pdays` and `previous'

### Target encoding

- Now we want to encode the y variable.
- Let's replace the values yes/no with 1/0.

In [705]:
df_banks_features['y'].unique()

array(['no', 'yes'], dtype=object)

In [707]:
df_banks_features['y']

0         no
1         no
2         no
3         no
4         no
        ... 
45206    yes
45207    yes
45208    yes
45209     no
45210     no
Name: y, Length: 45211, dtype: object

In [709]:
# Explicitly replace the values yes/no with 1/0.
df_banks_features.loc[:, 'y'] = (df_banks_features['y'] == 'yes').astype(int)
df_banks_features.loc[:, 'y']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: object

In [711]:
df_banks_features['y'].unique()

array([0, 1], dtype=object)

### Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value y is not in your dataframe.

In [714]:
# Splitting the dataset into two parts 80% (df_train_full) + 20% (df_test)
df_train_full, df_test = train_test_split(df_banks_features, test_size=0.2, random_state=42)

In [716]:
len(df_train_full), len(df_test)

(36168, 9043)

In [718]:
# Splitting the df_train_full into training and validation 60% (train) + 20% (val) (20% of 80% = 25%)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [720]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [722]:
df_train.head(), df_val.head(), df_test.head()

(       age            job  marital  education  balance housing   contact  day  \
 20326   32     technician   single   tertiary     1100     yes  cellular   11   
 24301   38   entrepreneur  married  secondary        0     yes  cellular   17   
 38618   49    blue-collar  married  secondary     3309     yes  cellular   15   
 18909   37      housemaid  married    primary     2410      no  cellular    4   
 23081   31  self-employed  married   tertiary     3220      no  cellular   26   
 
       month  duration  campaign  pdays  previous poutcome  y  
 20326   aug        67         1     -1         0  unknown  0  
 24301   nov       258         1     -1         0  unknown  0  
 38618   may       349         2     -1         0  unknown  0  
 18909   aug       315         1     -1         0  unknown  0  
 23081   aug        74         4     -1         0  unknown  0  ,
        age         job   marital  education  balance housing    contact  day  \
 11019   38    services  divorced  secon

In [724]:
# Reset index of shuffled datasets
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train.head(), df_val.head(), df_test.head()

(   age            job  marital  education  balance housing   contact  day  \
 0   32     technician   single   tertiary     1100     yes  cellular   11   
 1   38   entrepreneur  married  secondary        0     yes  cellular   17   
 2   49    blue-collar  married  secondary     3309     yes  cellular   15   
 3   37      housemaid  married    primary     2410      no  cellular    4   
 4   31  self-employed  married   tertiary     3220      no  cellular   26   
 
   month  duration  campaign  pdays  previous poutcome  y  
 0   aug        67         1     -1         0  unknown  0  
 1   nov       258         1     -1         0  unknown  0  
 2   may       349         2     -1         0  unknown  0  
 3   aug       315         1     -1         0  unknown  0  
 4   aug        74         4     -1         0  unknown  0  ,
    age         job   marital  education  balance housing    contact  day  \
 0   38    services  divorced  secondary      -10     yes    unknown   17   
 1   42  manage

In [726]:
# Extracting the target variables y from the dataframes
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

y_test, y_val, y_test

(array([0, 0, 0, ..., 0, 0, 0], dtype=object),
 array([0, 0, 1, ..., 0, 0, 1], dtype=object),
 array([0, 0, 0, ..., 0, 0, 0], dtype=object))

In [728]:
# Deleting target variables y from dataframes (train and val, not test!)
del df_train['y']
del df_val['y']

In [730]:
df_train.head(), df_val.head()

(   age            job  marital  education  balance housing   contact  day  \
 0   32     technician   single   tertiary     1100     yes  cellular   11   
 1   38   entrepreneur  married  secondary        0     yes  cellular   17   
 2   49    blue-collar  married  secondary     3309     yes  cellular   15   
 3   37      housemaid  married    primary     2410      no  cellular    4   
 4   31  self-employed  married   tertiary     3220      no  cellular   26   
 
   month  duration  campaign  pdays  previous poutcome  
 0   aug        67         1     -1         0  unknown  
 1   nov       258         1     -1         0  unknown  
 2   may       349         2     -1         0  unknown  
 3   aug       315         1     -1         0  unknown  
 4   aug        74         4     -1         0  unknown  ,
    age         job   marital  education  balance housing    contact  day  \
 0   38    services  divorced  secondary      -10     yes    unknown   17   
 1   42  management    single   t

### Question 3

- Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

- contact
- education
- housing
- poutcome

In [733]:
# Define the function for calculating thee mutual_info_score for a series of features
def calculate_mi(series):
    return round(mutual_info_score(series, df_train.y), 2)

In [735]:
# Apply function on categorial variable
df_train['y'] = y_train
df_mi = df_train[categorical].apply(calculate_mi)
df_mi

job          0.01
marital      0.00
education    0.00
housing      0.01
contact      0.01
month        0.03
poutcome     0.03
dtype: float64

In [737]:
# Sort the series in descending order
df_mi = df_mi.sort_values(ascending=False)
df_mi

month        0.03
poutcome     0.03
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

In [739]:
# Apply function on categorial variables in question
df_train['y'] = y_train
df_mi_question = df_train[['contact', 'education', 'housing', 'poutcome']].apply(calculate_mi)
df_mi_question

contact      0.01
education    0.00
housing      0.01
poutcome     0.03
dtype: float64

In [741]:
# Sort the series of categorial variables in question in descending order
df_mi_question = df_mi_question.sort_values(ascending=False)
df_mi_question

poutcome     0.03
contact      0.01
housing      0.01
education    0.00
dtype: float64

#### Question 3 - Answer: 'poutcome'

### Question 4

- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [745]:
# Transform df_train to dictionary for using it in DictVectorizer
train_dict = df_train[categorical + numerical].to_dict(orient='records')

# View the first dictionary from the converted DataFrame to inspect the structure
train_dict[0]

{'job': 'technician',
 'marital': 'single',
 'education': 'tertiary',
 'housing': 'yes',
 'contact': 'cellular',
 'month': 'aug',
 'poutcome': 'unknown',
 'age': 32,
 'balance': 1100,
 'day': 11,
 'duration': 67,
 'campaign': 1,
 'pdays': -1,
 'previous': 0}

In [747]:
# Initialize the DictVectorizer. Setting sparse=False ensures the output is a dense array.
dv = DictVectorizer(sparse=False)

# Fit the vectorizer to the training data (it learns the feature space from the dictionary keys)
dv.fit(train_dict)

In [749]:
# Transform the training data into a numerical format (one-hot encode categorical variables)
X_train = dv.transform(train_dict)
print(X_train)
print(X_train.shape)

[[  32. 1100.    1. ...    0.    1.    0.]
 [  38.    0.    1. ...    0.    1.    0.]
 [  49. 3309.    2. ...    0.    1.    0.]
 ...
 [  54.    0.    1. ...    0.    1.    0.]
 [  25. 2311.    2. ...    0.    1.    0.]
 [  30.   15.    2. ...    0.    1.    0.]]
(27126, 47)


In [750]:
# Use fit_transform in one step (fit and transform in a single call)
X_train = dv.fit_transform(train_dict)

# Check the shape again after using fit_transform
print(X_train.shape)

(27126, 47)


In [752]:
# Get the feature names generated by the DictVectorizer after fitting (to see what features were created)
print(dv.get_feature_names_out())

['age' 'balance' 'campaign' 'contact=cellular' 'contact=telephone'
 'contact=unknown' 'day' 'duration' 'education=primary'
 'education=secondary' 'education=tertiary' 'education=unknown'
 'housing=no' 'housing=yes' 'job=admin.' 'job=blue-collar'
 'job=entrepreneur' 'job=housemaid' 'job=management' 'job=retired'
 'job=self-employed' 'job=services' 'job=student' 'job=technician'
 'job=unemployed' 'job=unknown' 'marital=divorced' 'marital=married'
 'marital=single' 'month=apr' 'month=aug' 'month=dec' 'month=feb'
 'month=jan' 'month=jul' 'month=jun' 'month=mar' 'month=may' 'month=nov'
 'month=oct' 'month=sep' 'pdays' 'poutcome=failure' 'poutcome=other'
 'poutcome=success' 'poutcome=unknown' 'previous']


In [753]:
# Transform a small sample of the data and convert the first row into a list to see the result
print(list(dv.transform(train_dict[:5])[0]))

[32.0, 1100.0, 1.0, 1.0, 0.0, 0.0, 11.0, 67.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 1.0, 0.0]


In [757]:
# Convert the validation DataFrame into a list of dictionaries (same process as for the training set)
val_dict = df_val[categorical + numerical].to_dict(orient='records')

In [759]:
# Transform the validation data using the same DictVectorizer (no need to fit again, just transform)
X_val = dv.transform(val_dict)

# Check the shape of the validation data to ensure it matches the expected dimensions
print(X_val.shape)

(9042, 47)


In [761]:
import pandas as pd
y_train = pd.Series(y_train)

In [767]:
# Convert y_train from object type to integer type (binary classification requires numeric values)
y_train = y_train.astype(int)

# Verify that y_train is now of dtype int (should print int64 or int32)
print(y_train.dtype)

# Initialize the logistic regression model with specific parameters (liblinear solver, C=1.0, max_iter=1000)
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Fit the logistic regression model on the training data (X_train) and target labels (y_train)
model.fit(X_train, y_train)

int32


In [785]:
# Make predictions on the validation set
y_pred = model.predict(X_val)

# Create a DataFrame to store predictions and actual values
df_pred = pd.DataFrame()
df_pred['prediction'] = y_pred
df_pred['actual'] = y_val

# Check if the predictions are correct
df_pred['correct'] = df_pred.prediction == df_pred.actual

# Calculate the accuracy manually as the mean of correct predictions
manual_accuracy = df_pred.correct.mean()

# Display the manual accuracy
print("Manual accuracy:", round(manual_accuracy, 2))

# Shorter method
accuracy = (y_val == y_pred).mean()
print("Shortcut accuracy:", round(accuracy, 2))

Manual accuracy: 0.9
Shortcut accuracy: 0.9


In [779]:
from sklearn.metrics import accuracy_score

# Convert y_val to int
y_val = y_val.astype(int)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)

# Round to 2 decimal places
accuracy_rounded = round(accuracy, 2)

# Print the accuracy
print(accuracy_rounded)

0.9


#### Question 4 - Answer: 9

### Question 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

- age
- balance
- marital
- previous

Note: The difference doesn't have to be positive.

In [794]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Train a model with all features and calculate the baseline accuracy
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
baseline_accuracy = accuracy_score(y_val, y_pred)
print(f"Baseline accuracy: {baseline_accuracy:.4f}")

# List of features to test (the ones in the question)
features_to_test = ['age', 'balance', 'marital', 'previous']

# Dictionary to store accuracy results
accuracy_diffs = {}

# Get the feature names from the DictVectorizer
feature_names = dv.get_feature_names_out()

# Loop through each feature, remove it, and calculate the accuracy
for feature in features_to_test:
    # Find the feature index in the feature names list (handle prefixes like marital= or balance=)
    feature_indices = [i for i, fname in enumerate(feature_names) if feature in fname]
    
    if len(feature_indices) == 0:
        print(f"Feature {feature} not found in feature names.")
        continue
    
    # Remove the selected feature from the dataset
    X_train_new = np.delete(X_train, feature_indices, axis=1)
    X_val_new = np.delete(X_val, feature_indices, axis=1)
    
    # Train the model without this feature
    model.fit(X_train_new, y_train)
    y_pred_new = model.predict(X_val_new)
    
    # Calculate the new accuracy
    new_accuracy = accuracy_score(y_val, y_pred_new)
    
    # Calculate the accuracy difference
    accuracy_diff = baseline_accuracy - new_accuracy
    accuracy_diffs[feature] = accuracy_diff
    
    # Print the result for this feature
    print(f"Accuracy without {feature}: {new_accuracy:.4f} (diff: {accuracy_diff:.4f})")

# Find the feature with the smallest difference
least_important_feature = min(accuracy_diffs, key=accuracy_diffs.get)
print(f"\nThe feature with the smallest accuracy difference is: {least_important_feature}")

Baseline accuracy: 0.9016
Accuracy without age: 0.9004 (diff: 0.0012)
Accuracy without balance: 0.9013 (diff: 0.0002)
Accuracy without marital: 0.9001 (diff: 0.0014)
Accuracy without previous: 0.9012 (diff: 0.0003)

The feature with the smallest accuracy difference is: balance


#### Question 5 - Answer: 'balance'

### Question 6

- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

Note: If there are multiple options, select the smallest C.

In [806]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# List of C values to try
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store the accuracy for each C value
accuracy_results = {}

# Loop through each C value, train the model, and calculate accuracy
for C in C_values:
    # Train the logistic regression model with the given C value
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)
    
    # Store the accuracy rounded to 3 decimal places
    accuracy_results[C] = round(accuracy, 3)
    
    # Print the accuracy for each C
    print(f"Accuracy with C={C}: {accuracy:.3f}")

# Find the best C value (smallest C with the highest accuracy)
best_C = max(accuracy_results, key=lambda x: (accuracy_results[x], -x))

print(f"\nThe best C value is: {best_C} with accuracy: {accuracy_results[best_C]:.3f}")

Accuracy with C=0.01: 0.898
Accuracy with C=0.1: 0.901
Accuracy with C=1: 0.902
Accuracy with C=10: 0.901
Accuracy with C=100: 0.901

The best C value is: 1 with accuracy: 0.902


#### Question 6 - Answer: 1