In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('../data/bank.csv', sep= ';')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


## Data preparation

['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']
 
    Select only the features from above.
    Check if the missing values are presented in the features.


In [4]:
df = df[['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   balance    4521 non-null   int64 
 5   housing    4521 non-null   object
 6   contact    4521 non-null   object
 7   day        4521 non-null   int64 
 8   month      4521 non-null   object
 9   duration   4521 non-null   int64 
 10  campaign   4521 non-null   int64 
 11  pdays      4521 non-null   int64 
 12  previous   4521 non-null   int64 
 13  poutcome   4521 non-null   object
 14  y          4521 non-null   object
dtypes: int64(7), object(8)
memory usage: 529.9+ KB


In [6]:
cat_cols = ['job','marital','education','housing','contact','month','poutcome' ]

#['age', 'balance', 'day', 'duration','campaign', 'pdays','previous']

for col in cat_cols:
    df[col] = df[col].astype('category')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        4521 non-null   int64   
 1   job        4521 non-null   category
 2   marital    4521 non-null   category
 3   education  4521 non-null   category
 4   balance    4521 non-null   int64   
 5   housing    4521 non-null   category
 6   contact    4521 non-null   category
 7   day        4521 non-null   int64   
 8   month      4521 non-null   category
 9   duration   4521 non-null   int64   
 10  campaign   4521 non-null   int64   
 11  pdays      4521 non-null   int64   
 12  previous   4521 non-null   int64   
 13  poutcome   4521 non-null   category
 14  y          4521 non-null   object  
dtypes: category(7), int64(7), object(1)
memory usage: 315.1+ KB


In [8]:
df.isna().sum().sum()

np.int64(0)

# Question 1

What is the most frequent observation (mode) for the column education?

    unknown
    primary
    secondary
    tertiary

In [9]:
df['education'].mode()

0    secondary
Name: education, dtype: category
Categories (4, object): ['primary', 'secondary', 'tertiary', 'unknown']

# Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- age and balance
- day and campaign
- day and pdays
- **pdays and previous**

In [10]:
#df.info()

In [11]:
numerical = df[['age', 'balance', 'day', 'campaign', 'pdays', 'previous']]

In [12]:
corr = numerical.corr()
corr

Unnamed: 0,age,balance,day,campaign,pdays,previous
age,1.0,0.08382,-0.017853,-0.005148,-0.008894,-0.003511
balance,0.08382,1.0,-0.008677,-0.009976,0.009437,0.026196
day,-0.017853,-0.008677,1.0,0.160706,-0.094352,-0.059114
campaign,-0.005148,-0.009976,0.160706,1.0,-0.093137,-0.067833
pdays,-0.008894,0.009437,-0.094352,-0.093137,1.0,0.577562
previous,-0.003511,0.026196,-0.059114,-0.067833,0.577562,1.0


## Target encoding
    Now we want to encode the y variable.
    Let's replace the values yes/no with 1/0.

In [13]:
df['y'].value_counts()

y
no     4000
yes     521
Name: count, dtype: int64

In [14]:
pd.Series(np.where(df['y'].values == 'yes', 1, 0), df.index);

In [15]:
df['y']= df['y'].replace(to_replace = ['yes','no'],value = ['1','0'])

In [16]:
df['y'].value_counts()

y
0    4000
1     521
Name: count, dtype: int64

In [17]:
df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,1787,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,4789,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,1350,yes,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,1476,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,yes,unknown,5,may,226,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,-333,yes,cellular,30,jul,329,5,-1,0,unknown,0
4517,57,self-employed,married,tertiary,-3313,yes,unknown,9,may,153,1,-1,0,unknown,0
4518,57,technician,married,secondary,295,no,cellular,19,aug,151,11,-1,0,unknown,0
4519,28,blue-collar,married,secondary,1137,no,cellular,6,feb,129,4,211,3,other,0


## Split the data
 - [x] Split your data in train/val/test sets with 60%/20%/20% distribution.
 - [x] Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
 - [x] Make sure that the target value y is not in your dataframe.

In [18]:
df_full_train, df_test = train_test_split(df, test_size=.2, random_state=42)

In [19]:
df_train, df_val = train_test_split(df_full_train, test_size=.25, random_state=42)

In [20]:
df_train.reset_index(drop=True)
df_val.reset_index(drop=True)
df_test.reset_index(drop=True);

In [21]:
y_train = df_train['y']
y_val = df_val['y']
y_test = df_val['y']

In [22]:
del df_train['y']
del df_val['y']
del df_test['y']

In [None]:
df_full_train, df_test = train_test_split(df, test_size=.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=.25, random_state=42)

df_train.reset_index(drop=True)
df_val.reset_index(drop=True)
df_test.reset_index(drop=True);

y_train = df_train['y']
y_val = df_val['y']
y_test = df_val['y']

del df_train['y']
del df_val['y']
del df_test['y']

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        4521 non-null   int64   
 1   job        4521 non-null   category
 2   marital    4521 non-null   category
 3   education  4521 non-null   category
 4   balance    4521 non-null   int64   
 5   housing    4521 non-null   category
 6   contact    4521 non-null   category
 7   day        4521 non-null   int64   
 8   month      4521 non-null   category
 9   duration   4521 non-null   int64   
 10  campaign   4521 non-null   int64   
 11  pdays      4521 non-null   int64   
 12  previous   4521 non-null   int64   
 13  poutcome   4521 non-null   category
 14  y          4521 non-null   object  
dtypes: category(7), int64(7), object(1)
memory usage: 315.1+ KB


## Question 3

    Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only. Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

 - contact
 - education
 - housing
 - **poutcome**

In [24]:
mis_variables = ['contact', 'education', 'housing', 'poutcome']

def miscores(y, cols):
    for col in cols:
        score = round(mutual_info_score(y_train, df_train[col]), 2)
        print(f"{col}: {score}")
    

In [25]:
miscores(y_train, mis_variables)

contact: 0.01
education: 0.0
housing: 0.01
poutcome: 0.03


In [None]:
j

### Let's do it the same way it was done in the videos to see if we get the same results

In [26]:
def mutual_info_scores(series):
    return mutual_info_score(series, y_train)


df_mi = df_train[cat_cols].apply(mutual_info_scores).sort_values()
df_mi

education    0.001559
marital      0.002446
housing      0.007329
job          0.009356
contact      0.011583
month        0.022095
poutcome     0.030355
dtype: float64

## Task -- Now let's train a logistic regression.
    Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
    Fit the model on the training dataset.
        To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
        model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [27]:
train_dict = df_train.to_dict(orient='records')


In [28]:
train_dict[0]

{'age': 41,
 'job': 'management',
 'marital': 'married',
 'education': 'tertiary',
 'balance': 72,
 'housing': 'yes',
 'contact': 'unknown',
 'day': 7,
 'month': 'may',
 'duration': 764,
 'campaign': 3,
 'pdays': -1,
 'previous': 0,
 'poutcome': 'unknown'}

In [29]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))

In [30]:
X_val = dv.transform(df_val.to_dict(orient='records'))

In [31]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [32]:
y_pred = model.predict(X_val)

In [33]:
baseline_accuracy = round(accuracy_score(y_val, y_pred), 2)
print(f"Baseline validation Accuracy: {baseline_accuracy}")

Baseline validation Accuracy: 0.89


## Question 4
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- **0.9**

## Task
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [37]:
features = df_train.columns
accuracy_differences = {}

for feature in features:
    df_train_temp = df_train.drop(columns=[feature])
    df_val_temp = df_val.drop(columns=[feature])
    
    X_train_temp = dv.fit_transform(df_train_temp.to_dict(orient='records'))
    X_val_temp = dv.transform(df_val_temp.to_dict(orient='records'))
    
    X_train_scaled_temp = scaler.fit_transform(X_train_temp)
    X_val_scaled_temp = scaler.transform(X_val_temp)
    
    model.fit(X_train_scaled_temp, y_train)
    y_pred_temp = model.predict(X_val_scaled_temp)
    accuracy_temp = accuracy_score(y_val, y_pred_temp)
    
    accuracy_differences[feature] = baseline_accuracy - accuracy_temp

print("Feature elimination results:")
for feature, diff in accuracy_differences.items():
    print(f"{feature}: {diff}")# Scaling the data


Feature elimination results:
age: 0.002831858407079668
job: -0.0015929203539822856
marital: 0.0006194690265486358
education: -0.0015929203539822856
balance: 0.001725663716814152
housing: 0.0050442477876105896
contact: 0.002831858407079668
day: 0.002831858407079668
month: 0.0006194690265486358
duration: 0.01831858407079645
campaign: 0.001725663716814152
pdays: 0.002831858407079668
previous: 0.002831858407079668
poutcome: 0.01057522123893806


## Question 5

Which of following feature has the smallest difference?
 - age
 - balance
 - **marital**
 - previous

# Task: Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

In [39]:
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for C in C_values:
    model = LogisticRegression(C=C, max_iter=500)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    accuracy = accuracy_score(y_val, y_pred)
    accuracies[C] = round(accuracy, 3)
    print(f"C = {C}, Validation Accuracy: {accuracies[C]}")

C = 0.01, Validation Accuracy: 0.878
C = 0.1, Validation Accuracy: 0.886
C = 1, Validation Accuracy: 0.887
C = 10, Validation Accuracy: 0.887
C = 100, Validation Accuracy: 0.887


## Question 6

    Now let's train a regularized logistic regression.
    Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
    Train models using all the features as in Q4.
    Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

 - 0.01
 - **0.1**
 - 1
 - 10
 - 100