In [None]:
# Checking Python's version via a shell escape.
# NOTE(review): this reports whichever `python` is first on the shell PATH,
# which is presumably the kernel's interpreter -- confirm if it matters.
!python -V

In [1]:
# Importing libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import category_encoders as ce
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import (roc_auc_score,
                             confusion_matrix,
                             classification_report)

In [2]:
# Load the training and validation splits from the local data folder
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/training_data.csv', './data/validation_data.csv')
)

In [None]:
# Summarise the training data: column dtypes, non-null counts and memory use
df_train.info()

In [None]:
# Count distinct values per column; these counts are what the cells below
# threshold on to decide which features are categorical
df_train.nunique()

In [3]:
# Pull the `Response` column out of each split to serve as the target
y_train, y_val = (frame['Response'] for frame in (df_train, df_val))

In [4]:
# Keep an untouched copy of the raw training data: `df_train` is reassigned
# and modified by the exploratory cells below, while this copy is what
# clean_data() is applied to later on
df_train_2 = df_train.copy()

In [5]:
# Dropping redundant features (the target `Response` is removed here too,
# since it was already extracted into y_train)
red_ftrs_1 = ["ID", "Dt_Customer", "Z_CostContact", "Z_Revenue", "Response"]

df_train = df_train.drop(columns=red_ftrs_1)

In [6]:
# Creating list of categorical features: every column with at most
# 8 distinct values is a candidate
n_unique = df_train.nunique()
low_cardinality = list(n_unique[n_unique <= 8].index)

# Kidhome / Teenhome are later summed into a `dependants` count, so they are
# kept out of the categorical list. The names must match the DataFrame's
# column spelling exactly, otherwise this exclusion silently does nothing.
not_categ = ['Teenhome', 'Kidhome']
categ_ftrs_1 = [categ for categ in low_cardinality
                if categ not in not_categ]

# Creating a list of numerical features: everything that is not categorical
num_ftrs_1 = [col for col in df_train.columns
              if col not in categ_ftrs_1]


In [None]:
# Count missing values per column to see which features need imputation
df_train.isna().sum() 

In [7]:
# Handling missing values: numerical columns are filled with their median,
# categorical columns with their most frequent value (first mode row)

num_fill_values = df_train[num_ftrs_1].median()
df_train[num_ftrs_1] = df_train[num_ftrs_1].fillna(num_fill_values)

categ_fill_values = df_train[categ_ftrs_1].mode().iloc[0]
df_train[categ_ftrs_1] = df_train[categ_ftrs_1].fillna(categ_fill_values)

In [None]:
# Remember which raw columns feed each aggregate so they can be dropped
# once the aggregates exist
purchases = df_train.filter(regex='Purchases').columns.tolist()
amount = df_train.filter(regex='Mnt').columns.tolist()
dependants = df_train.filter(regex='home').columns.tolist()

# Purchases as a whole: sum over every purchase channel
purchase_total = (
    df_train['NumCatalogPurchases'] + df_train['NumDealsPurchases']
    + df_train['NumStorePurchases'] + df_train['NumWebPurchases']
)
df_train['total_purchases'] = purchase_total

# Amounts as a whole: sum over every product category
amount_total = (
    df_train['MntFishProducts'] + df_train['MntFruits']
    + df_train['MntGoldProds'] + df_train['MntMeatProducts']
    + df_train['MntSweetProducts'] + df_train['MntWines']
)
df_train['total_amount'] = amount_total

# Dependants: kids plus teenagers at home
dependant_total = df_train['Kidhome'] + df_train['Teenhome']
df_train['dependants'] = dependant_total

# Drop the raw columns that are now summarised by the aggregates
red_ftrs_2 = dependants + amount + purchases
df_train = df_train.drop(columns=red_ftrs_2)

In [None]:
# Categorical features after feature engineering: columns with at most
# 8 distinct values, leaving out the engineered `dependants` column
level_counts = df_train.nunique()
low_card_cols = level_counts.loc[level_counts.le(8)].index

categ_ftrs_2 = [col for col in low_card_cols if col != 'dependants']

In [None]:
# Encode ordinal categorical variables.
# Each education level is mapped to an integer rank. The mapping is kept as a
# dict because clean_data() reuses `categories` with Series.map().
# Note: this cell is meant to run once on the raw string labels; values that
# are not keys of the mapping (including already-encoded integers) become NaN.
ordinal_categ = ['Education']
categories = {'Basic': 0, '2n Cycle': 1, 'Graduation': 2, 'Master': 3, 'PhD': 4}

df_train[ordinal_categ] = (
    df_train[ordinal_categ].apply(lambda col: col.map(categories))
)

# Encode nominal categorical variables: every remaining categorical column is
# replaced by the integer codes of its categories
nominal_categs = [ftr for ftr in categ_ftrs_2 if ftr not in ordinal_categ]

df_train[nominal_categs] = (
    df_train[nominal_categs].astype('category')
    .apply(lambda col: col.cat.codes)
)


In [None]:

# Numerical features are whatever remains once the categorical ones are removed
num_ftrs_2 = list(
    filter(lambda col: col not in categ_ftrs_2, df_train.columns)
)


In [None]:
# Check distribution of numerical features.
# The grid holds 4 panels per row and grows with the number of features
# (7 features -> the same 2x4 grid and 8x4 inch figure as before).
n_cols = 4
n_rows = max(1, -(-len(num_ftrs_2) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8, 2 * n_rows))
axes = axes.flatten()

for ax, var in zip(axes, num_ftrs_2):
    sns.histplot(x=var, ax=ax, data=df_train, kde=True)

# Hide only the panels that did not receive a plot, so a fully used grid
# never loses its last histogram
for unused_ax in axes[len(num_ftrs_2):]:
    unused_ax.axis('off')
plt.tight_layout()

In [None]:
def clean_data(df):
    """Return a cleaned, feature-engineered version of a raw data split.

    Steps: drop the columns in ``red_ftrs_1``; impute and min-max scale the
    numerical columns and mode-impute the categorical ones; add the
    ``total_purchases``, ``total_amount`` and ``dependants`` aggregates; drop
    the raw columns in ``red_ftrs_2``; encode the ordinal and nominal
    categorical columns as integers.

    The function relies on notebook-level names defined in earlier cells:
    ``red_ftrs_1``, ``red_ftrs_2``, ``num_ftrs_1``, ``categ_ftrs_1``,
    ``ordinal_categ``, ``categories`` (a label -> rank dict) and
    ``nominal_categs``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw split containing every column listed in ``red_ftrs_1``,
        ``num_ftrs_1`` and ``categ_ftrs_1``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh default index (the input index is not kept).

    Notes
    -----
    NOTE(review): the imputer/scaler are fitted on whatever frame is passed
    in, and the nominal codes are derived from that frame's own categories,
    so training and validation splits are transformed independently of each
    other. Confirm whether that is intended.
    """
    # Drop redundant features
    df = df.drop(red_ftrs_1, axis=1)

    # Handling missing values and transformations
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('normalize', MinMaxScaler()),
    ])

    ct = ColumnTransformer([
        ('num_trans', num_transformer, num_ftrs_1),
        ('cat_trans', SimpleImputer(strategy='most_frequent'),
         categ_ftrs_1)
    ])

    # The transformer outputs columns in the order its steps are listed:
    # numerical first, then categorical
    df = pd.DataFrame(ct.fit_transform(df),
                      columns=num_ftrs_1+categ_ftrs_1)

    # Compute total purchase
    df['total_purchases'] = (
        df['NumCatalogPurchases'] + df['NumDealsPurchases'] +
        df['NumStorePurchases'] + df['NumWebPurchases']
    )

    # Compute total amount
    df['total_amount'] = (
        df['MntFishProducts'] + df['MntFruits'] +
        df['MntGoldProds'] + df['MntMeatProducts'] +
        df['MntSweetProducts'] + df['MntWines']
    )

    # Add a `dependants` feature
    df['dependants'] = df['Kidhome'] + df['Teenhome']

    # Drop superfluous columns
    df = df.drop(red_ftrs_2, axis=1)

    # Encode ordinal categorical variables on the frame being cleaned
    # (not on the global `df_train`, which must stay untouched here)
    df[ordinal_categ] = (
        df[ordinal_categ]
        .apply(lambda x: x.map(categories))
    )

    # Encode nominal categorical variables
    df[nominal_categs] = (
        df[nominal_categs].astype('category')
        .apply(lambda x: x.cat.codes)
    )

    return df

In [None]:
# Clean each split, then turn it into the list-of-dicts (one dict per row)
# that is fed to DictVectorizer
train_clean = clean_data(df_train_2)
train_dicts = train_clean.to_dict(orient='records')

val_clean = clean_data(df_val)
val_dicts = val_clean.to_dict(orient='records')

In [None]:
# Inspect the cleaned training frame (this runs the whole clean_data
# pipeline again on the raw copy)
clean_data(df_train_2)

In [None]:
# Learn the feature layout from the training records only, then apply the
# same layout to both splits
dv = DictVectorizer()
dv.fit(train_dicts)
X_train, X_val = dv.transform(train_dicts), dv.transform(val_dicts)

In [None]:
# Baseline: fit a logistic regression and report its metrics on the same
# training split it was fitted on
lr = LogisticRegression(max_iter=500).fit(X_train, y_train.values)
y_pred = lr.predict(X_train)
print(classification_report(y_train, y_pred))

In [None]:
# Refit a fresh logistic regression and predict the validation split
lr = LogisticRegression(max_iter=500)
y_pred = lr.fit(X_train, y_train).predict(X_val)

In [None]:
def cmat_table(y_true, y_pred):
    """Display the confusion matrix of a binary 0/1 problem as a labelled table.

    Rows are the actual classes and columns the predicted classes, both in
    the order 0, 1.

    Parameters
    ----------
    y_true : array-like
        True class labels (0 or 1).
    y_pred : array-like
        Predicted class labels (0 or 1).

    Returns
    -------
    None
        The table is rendered with ``display``; nothing is returned.
    """
    class_labels = [0, 1]
    # Pin the label set so the matrix is always 2x2 and lines up with the
    # headers, even if one class is missing from both inputs
    cmat = confusion_matrix(y_true, y_pred, labels=class_labels)
    cols = pd.MultiIndex.from_product([['predictions'], class_labels])
    indx = pd.MultiIndex.from_product([['actual'], class_labels])
    display(pd.DataFrame(cmat, columns=cols, index=indx))


In [None]:
# Confusion matrix of the current validation predictions
cmat_table(y_val, y_pred)

In [None]:

# Per-class precision / recall / F1 of the current validation predictions
print(classification_report(y_val, y_pred))

In [None]:
# Predicted class probabilities for the validation rows (one column per class)
lr.predict_proba(X_val)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the validation split is left as is.
# SMOTE generates synthetic samples at random, so the seed is fixed to keep
# the resampled data (and every metric computed from it) reproducible.
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

In [None]:
# Refit the logistic regression on the oversampled training data and
# predict the (untouched) validation split
y_pred = lr.fit(X_resampled, y_resampled).predict(X_val)


In [None]:
# Confusion matrix of the validation predictions made after oversampling
cmat_table(y_val, y_pred)

In [None]:
# Per-class precision / recall / F1 of the predictions made after oversampling
print(classification_report(y_val, y_pred))

In [None]:
# ROC AUC on the validation split, computed from the model's continuous
# decision scores rather than from hard class predictions
roc_auc_score(y_val, lr.decision_function(X_val))

In [None]:
def metrics_output(X_1, X_2, y_1, y_2, clf):
    """Fit a classifier and print its evaluation on a second data set.

    Shows the confusion matrix, the classification report and the area under
    the ROC curve for a binary 0/1 problem.

    Parameters
    ----------
    X_1, y_1 : features and labels the classifier is fitted on.
    X_2, y_2 : features and labels the fitted classifier is evaluated on.
    clf : estimator with ``fit`` and ``predict``; it is fitted in place.
        ``decision_function`` or ``predict_proba`` is used for the AUC when
        the estimator provides one of them.

    Returns
    -------
    None
        All results are displayed / printed.
    """
    # Fit classifier
    clf.fit(X_1, y_1)

    # Get prediction
    y_pred = clf.predict(X_2)

    # Create confusion matrix table (labels pinned so it is always 2x2 and
    # matches the 0/1 headers)
    cols = pd.MultiIndex.from_tuples(
        [('predictions', 0), ('predictions', 1)]
    )
    indx = pd.MultiIndex.from_tuples(
        [('actual', 0), ('actual', 1)]
    )
    cmat = confusion_matrix(y_2, y_pred, labels=[0, 1])
    display(pd.DataFrame(cmat, columns=cols, index=indx))

    print('-'*50, '\n')

    # Print classification report
    print(classification_report(y_2, y_pred))

    print('-'*50)

    # Get area under curve. ROC AUC ranks samples by a continuous score, so
    # prefer the decision function or the positive-class probability; hard
    # labels are only a fallback for estimators that expose neither.
    if hasattr(clf, 'decision_function'):
        y_score = clf.decision_function(X_2)
    elif hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(X_2)[:, 1]
    else:
        y_score = y_pred
    auc_score = roc_auc_score(y_2, y_score)
    print(f'area_under_curve: {auc_score:.2f}')

In [None]:
# Logistic regression: fit on the oversampled training data, evaluate on
# the validation split
metrics_output(X_resampled, X_val, y_resampled, y_val, lr)

In [None]:
from sklearn.svm import SVC
# Support-vector classifier with default settings: fit on the oversampled
# training data, evaluate on the validation split
svc = SVC()
metrics_output(X_resampled, X_val, y_resampled, y_val, svc)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed so its results are repeatable
rfc = RandomForestClassifier(random_state=1)

# Fit on the oversampled training data, evaluate on the validation split
metrics_output(X_resampled, X_val, y_resampled, y_val, rfc)