In [None]:
# module import
import pandas as pd
import pdpipe as pdp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Read the three CSV files: training set, test set and the set that
# predictions are produced for later on ("aim").
df_train, df_test, df_aim = (
    pd.read_csv(csv_path)
    for csv_path in ('marketing_train.csv',
                     'marketing_test.csv',
                     'marketing_aim.csv')
)

In [None]:
# Solution:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None"
# after every report.
for frame in (df_train, df_test, df_aim):
    frame.info()

In [None]:
# Summary statistics (DataFrame.describe) for each of the three data sets
for frame in (df_train, df_test, df_aim):
    print(frame.describe())

In [None]:
# deal with -1 values in 'Days passed'

# Lösung:
import numpy as np

def cleaner(df):
    """Replace the placeholder value -1 in 'Days passed' with NaN, in place.

    Args:
        df: DataFrame containing a 'Days passed' column. The frame is
            modified in place.

    Returns:
        None.
    """
    # Vectorized replacement: works for any index (a row-number loop with
    # .loc only works on the default 0..n-1 index) and avoids a Python-level
    # loop over every row. Series.mask() puts NaN wherever the condition holds.
    days_passed = df['Days passed']
    df['Days passed'] = days_passed.mask(days_passed == -1)
            
# Run the 'Days passed' clean-up on all three data sets
for frame in (df_train, df_test, df_aim):
    cleaner(df=frame)

In [None]:
# class distribution in train set and test set

# Solution:
# Relative frequency of each target value, per data set
for frame in (df_train, df_test):
    target_values = frame.loc[:, 'Subscribed deposit']
    print(pd.crosstab(target_values, columns='count', normalize=True))

**2. Vorbereitung der Daten für die statistische Modellierung**

In [None]:
# deal with NaN values

# Solution:
# Remove every row that contains a missing value, in each data set
df_train, df_test, df_aim = (
    frame.dropna(axis=0) for frame in (df_train, df_test, df_aim)
)

# Check: number of missing values left per column
for frame in (df_train, df_test, df_aim):
    print(frame.isna().sum())

In [None]:
# correlation matrix

# Lösung:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(nrows=3, figsize=(20, 20))

frames_by_name = {'train': df_train, 'test': df_test, 'aim': df_aim}

for axis, (set_name, frame) in zip(ax, frames_by_name.items()):
    # The categorical columns have not been encoded yet at this point. Since
    # pandas 2.0 DataFrame.corr() no longer drops non-numeric columns silently
    # but raises, so the numeric columns are selected explicitly.
    corr = frame.select_dtypes(include='number').corr()

    # The matrix is symmetric: hide the upper triangle including the diagonal.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, fmt='.2f', annot=True, ax=axis)
    axis.set_title(f'Correlation matrix ({set_name} set)')

fig.tight_layout()

In [None]:
# number of unique values for categorical columns

# Solution:
cat_cols = [
    'Job',
    'Marital Status',
    'Education',
    'Has credit in default',
    'Has housing loan',
    'Has personal loan',
    'Contact type',
    'Last contact month',
    'Outcome previous campaign',
    'Subscribed deposit',
]

# The aim set is inspected without the last entry ('Subscribed deposit').
frames_and_columns = [
    (df_train, cat_cols),
    (df_test, cat_cols),
    (df_aim, cat_cols[:-1]),
]

for position, (frame, columns) in enumerate(frames_and_columns):
    if position > 0:
        # blank lines between the reports of two data sets
        print('\n\n')
    for cat_col in columns:
        values = frame.loc[:, cat_col]
        print(cat_col, ': ', values.nunique(), values.unique())

In [None]:
# label encoding

# Solution:
bin_cols = ['Has credit in default',
            'Has housing loan',
            'Has personal loan',
            'Subscribed deposit']

dict_label_encoding = {'no': 0, 'yes': 1}

# Nested mapping {column: {old value: new value}} so that every frame needs
# only a single replace() call. The aim set is encoded without the last
# entry ('Subscribed deposit').
encoding_all = {bin_col: dict_label_encoding for bin_col in bin_cols}
encoding_without_target = {bin_col: dict_label_encoding for bin_col in bin_cols[:-1]}

# replace() on string columns yields object columns holding Python ints; the
# implicit downcast to an integer dtype is deprecated in recent pandas
# versions, so the dtype inference is requested explicitly.
df_train = df_train.replace(to_replace=encoding_all).infer_objects()
df_test = df_test.replace(to_replace=encoding_all).infer_objects()
df_aim = df_aim.replace(to_replace=encoding_without_target).infer_objects()

In [None]:
# one-hot encoding

# Lösung:
import pdpipe as pdp

# Columns handed to the one-hot encoder
onehot_cols = [
    'Job',
    'Marital Status',
    'Education',
    'Contact type',
    'Last contact month',
    'Outcome previous campaign',
]
onehot = pdp.OneHotEncode(onehot_cols, drop_first=False)

# Fit the encoder on the training set only, then reuse the fitted encoder
# for the test set and the aim set.
df_train = onehot.fit_transform(df_train)
df_test = onehot.transform(df_test)
df_aim = onehot.transform(df_aim)

**3. Modellierung der Daten anhand einer logistischen Regression und Identifizierung der besten Hyperparametereinstellungen**

In [None]:
# logistic model

# Lösung:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Two-step pipeline: standardize the features, then fit a logistic regression.
# The step names are referenced by the hyperparameter grid ('classifier__...').
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(solver='saga',
                                      max_iter=10000,
                                      random_state=42)),
]
pipeline_log = Pipeline(steps=pipeline_steps)

In [None]:
# gridsearch with cross-validation

# Lösung:
from sklearn.model_selection import GridSearchCV

# Select the target by name instead of by position: the column order after
# one-hot encoding is an implementation detail of the encoder, so "last
# column" is not a reliable way to find the target.
target_col = 'Subscribed deposit'
features_train = df_train.drop(columns=target_col)
target_train = df_train.loc[:, target_col]

# Grid: both penalty types combined with 14 log-spaced values of C
# between 0.001 and 1000.
search_space_grid = [{'classifier__penalty': ['l1', 'l2'],
                      'classifier__C': np.geomspace(start=0.001, stop=1000, num=14)}]

# 5-fold cross-validated grid search, scored by ROC AUC, using all CPU cores
model_grid = GridSearchCV(estimator=pipeline_log,
                          param_grid=search_space_grid,
                          scoring='roc_auc',
                          cv=5,
                          n_jobs=-1)

model_grid.fit(features_train, target_train)

print(model_grid.best_estimator_)
print(model_grid.best_score_)

**4. Evaluierung des gefundenen besten logistischen Regressionsmodells** 

In [None]:
# evaluate best model on test set

# Solution:
# Select the target by name instead of by position, so the split does not
# depend on where the encoders happened to place the columns.
target_col = 'Subscribed deposit'
features_test = df_test.drop(columns=target_col)
target_test = df_test.loc[:, target_col]

target_test_pred_proba = model_grid.best_estimator_.predict_proba(features_test)

# Column index 1 is the second class of the classifier, i.e. label 1 ('yes')
# given the 0/1 label encoding of the target.
roc_auc_score(target_test, target_test_pred_proba[:, 1])

**5. Vorhersage der Festgeldkonten-Zuordnung** 

In [None]:
# prediction on aim set

# Solution:
# Build the model input without the prediction columns. On the first run they
# do not exist yet (errors='ignore'); on a re-run they are removed again, so
# earlier predictions are never fed back into the model as features.
prediction_cols = ['subdep_pred_proba', 'subdep_pred']
features_aim = df_aim.drop(columns=prediction_cols, errors='ignore')

# Probability of the second class (label 1) and the predicted class label
df_aim.loc[:, 'subdep_pred_proba'] = model_grid.predict_proba(features_aim)[:, 1]
df_aim.loc[:, 'subdep_pred'] = model_grid.predict(features_aim)

pd.crosstab(df_aim.loc[:, 'subdep_pred'], columns='count')