In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

df = pd.read_csv('../data/bank-full.csv', sep=';')
df = df[['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']]

numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

y = df.y
X = df.drop('y', axis=1)
dict_vectorizer = DictVectorizer(sparse=False)

def get_splitted_df():
    df_train_full, df_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    df_train, df_val, y_train, y_val = train_test_split(df_train_full, y_train_full, test_size=len(X) * 0.2 / len(df_train_full), random_state=1)
    
    return df_train, df_val, df_test, y_train, y_val, y_test

def get_trained_model(train_serie: pd.Series, y_train: pd.Series, reg: float = 1.0):
    train_serie_dict = train_serie.to_dict(orient='records')
    X_train = dict_vectorizer.fit_transform(train_serie_dict)
    
    model = LogisticRegression(solver='liblinear', C=reg, max_iter=1000)
    model.fit(X_train, y_train)
    
    return model

def predict_model(tmp_model, serie: pd.DataFrame, threshold: float = 0.5):    
    serie_dict = serie.to_dict(orient='records')
    X_serie = dict_vectorizer.fit_transform(serie_dict)

    y_predict = tmp_model.predict_proba(X_serie)[:, 1]
    y_val = (y_predict > threshold).astype(int)
    
    return y_val

df_train, df_val, df_test, y_train, y_val, y_test = get_splitted_df()

model = get_trained_model(df_train, y_train)

### Question 1

In [24]:
aucs = {}
for col in numerical:
    auc = roc_auc_score(y_train, df_train[col])
    if auc < 0.5:
        auc = roc_auc_score(y_train, -df_train[col])
    
    aucs[col] = auc
    
aucs = dict(sorted(aucs.items(), key=lambda item: item[1], reverse=True))
aucs


{'duration': 0.8146942424632446,
 'previous': 0.5985621197852585,
 'pdays': 0.5901240684387066,
 'balance': 0.588819759177242,
 'campaign': 0.5714526215503108,
 'day': 0.5259654358875353,
 'age': 0.5121718893992577}

### Qurestion 2

In [25]:
y_val_pred = predict_model(model, df_val)
auc = roc_auc_score(y_val, y_val_pred)
round(auc, 3)


0.652

### Question 3

### Question 4

### Question 5

### Question 6