In [28]:
from src.utils.cleaning import *
import src.utils.sktools as skt
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import warnings 
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides scikit-learn diagnostics (e.g.
# convergence or undefined-metric warnings) that may matter when reading the
# benchmark reports further down — consider filtering specific categories only.
warnings.simplefilter('ignore')
# Do not truncate columns when displaying DataFrames.
# `pd` is not imported by name in this cell — presumably it is re-exported by
# the star import from src.utils.cleaning; TODO confirm.
pd.set_option('display.max_columns', None)

### Simple Data Cleaning

In [15]:
# Load the original (raw) dataset from the path stored in `config`.
# `load_data` and `config` are not imported by name — presumably they come
# from the star import of src.utils.cleaning; TODO confirm.
df = load_data(config.orig_data_path)
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Customer Number,Offer Accepted,Reward,Mailer Type,Income Level,# Bank Accounts Open,Overdraft Protection,Credit Rating,# Credit Cards Held,# Homes Owned,Household Size,Own Your Home,Average Balance,Q1 Balance,Q2 Balance,Q3 Balance,Q4 Balance
0,1,No,Air Miles,Letter,High,1,No,High,2,1,4,No,1160.75,1669.0,877.0,1095.0,1002.0
1,2,No,Air Miles,Letter,Medium,1,No,Medium,2,2,5,Yes,147.25,39.0,106.0,78.0,366.0
2,3,No,Air Miles,Postcard,High,2,No,Medium,2,1,2,Yes,276.5,367.0,352.0,145.0,242.0
3,4,No,Air Miles,Letter,Medium,2,No,High,1,1,4,No,1219.0,1578.0,1760.0,1119.0,419.0
4,5,No,Air Miles,Letter,Medium,1,No,Medium,2,1,6,Yes,1211.0,2140.0,1357.0,982.0,365.0


In [16]:
# Data cleaning: remove rows via drop_nan (presumably rows containing NaN —
# confirm in src.utils.cleaning); verbose=True prints how many rows were dropped.
df = drop_nan(df, verbose=True)

Dropped 24 rows, 0.13% of original rows


In [17]:
# Formatting index and column names: judging by the previews before and after,
# the customer number becomes the index and column names become snake_case.
# NOTE(review): `df` is reassigned from itself, so re-running this cell operates
# on the already formatted frame — confirm format_data tolerates that.
df = format_data(df)
# Preview the formatted frame.
df.head()

Unnamed: 0_level_0,offer_accepted,reward,mailer_type,income_level,bank_accounts_open,overdraft_protection,credit_rating,credit_cards_held,homes_owned,household_size,own_your_home,average_balance,q1_balance,q2_balance,q3_balance,q4_balance
customer_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,No,Air Miles,Letter,High,1,No,High,2,1,4,No,1160.75,1669.0,877.0,1095.0,1002.0
2,No,Air Miles,Letter,Medium,1,No,Medium,2,2,5,Yes,147.25,39.0,106.0,78.0,366.0
3,No,Air Miles,Postcard,High,2,No,Medium,2,1,2,Yes,276.5,367.0,352.0,145.0,242.0
4,No,Air Miles,Letter,Medium,2,No,High,1,1,4,No,1219.0,1578.0,1760.0,1119.0,419.0
5,No,Air Miles,Letter,Medium,1,No,Medium,2,1,6,Yes,1211.0,2140.0,1357.0,982.0,365.0


In [18]:
# Value counts file dump: write value-counts information for `df` to the file
# at config.value_counts_path; verbose=True prints a confirmation message
# naming the destination file.
dump_value_counts(config.value_counts_path, df, verbose=True)

Value counts information has been printed to src/data/tmp/value_counts.txt


In [19]:
# Save cleaned data as CSV. The DataFrame index is written as the first column
# (pandas default), which is why the later reload passes index_col=0.
df.to_csv(config.cleaned_data_path)

### Benchmark model without EDA

In [20]:
# Load cleaned dataset, using the first CSV column as the index.
df = load_data(config.cleaned_data_path, index_col=0)
# Per-column summary (NaN count, dtype, number of unique values).
skt.report(df)

Unnamed: 0,nan_count,dtype,unique
offer_accepted,0,object,2
reward,0,object,3
mailer_type,0,object,2
income_level,0,object,3
bank_accounts_open,0,int64,3
overdraft_protection,0,object,2
credit_rating,0,object,3
credit_cards_held,0,int64,4
homes_owned,0,int64,3
household_size,0,int64,8


In [21]:
# List the object-dtype (categorical) columns, i.e. the ones that must be
# encoded before they can be fed to a model.
df.select_dtypes(include=object).columns

Index(['offer_accepted', 'reward', 'mailer_type', 'income_level',
       'overdraft_protection', 'credit_rating', 'own_your_home'],
      dtype='object')

In [25]:
# Baseline: logistic-regression classifier on the cleaned data, before any EDA.
# The categorical features are split into two groups; presumably each group in
# `cols_to_encode` is handled by the encoder at the same position in `encoders`
# (confirm in src.utils.sktools).
# NOTE(review): OrdinalEncoder() with default categories sorts the levels
# alphabetically (e.g. 'High' comes before 'Medium'), which does not follow the
# natural ordering of these features — consider passing explicit `categories`.
cols_ord_encod = ['income_level', 'credit_rating']
cols_onehot_encod = [
    'reward',
    'mailer_type',
    'overdraft_protection',
    'own_your_home',
]
predictions, classification_report = skt.score_classification_model(
    df=df,
    target='offer_accepted',
    model=LogisticRegression(solver='lbfgs'),
    cols_to_encode=[cols_ord_encod, cols_onehot_encod],
    scaler=StandardScaler(),
    encoders=[OrdinalEncoder(), OneHotEncoder()],
    outsiders_thresh=None,
)

# The report is a preformatted text table, so print it rather than display it.
print(classification_report)

              precision    recall  f1-score   support

          No       0.94      1.00      0.97      5079
         Yes       0.00      0.00      0.00       314

    accuracy                           0.94      5393
   macro avg       0.47      0.50      0.49      5393
weighted avg       0.89      0.94      0.91      5393



In [30]:
# Baseline: logistic-regression classifier on the cleaned data, before any EDA.
# NOTE(review): this cell repeats the logistic-regression benchmark that appears
# earlier in the notebook with the same arguments and yields the same report —
# it looks like a leftover re-run and is a candidate for removal.
cols_ord_encod = ['income_level', 'credit_rating']
cols_onehot_encod = [
    'reward',
    'mailer_type',
    'overdraft_protection',
    'own_your_home',
]
predictions, classification_report = skt.score_classification_model(
    df=df,
    target='offer_accepted',
    model=LogisticRegression(solver='lbfgs'),
    cols_to_encode=[cols_ord_encod, cols_onehot_encod],
    scaler=StandardScaler(),
    encoders=[OrdinalEncoder(), OneHotEncoder()],
    outsiders_thresh=None,
)

# The report is a preformatted text table, so print it rather than display it.
print(classification_report)

              precision    recall  f1-score   support

          No       0.94      1.00      0.97      5079
         Yes       0.00      0.00      0.00       314

    accuracy                           0.94      5393
   macro avg       0.47      0.50      0.49      5393
weighted avg       0.89      0.94      0.91      5393



In [29]:
# Baseline: k-nearest-neighbours classifier (default hyper-parameters) on the
# cleaned data, using the same column groups, scaler and encoders as the
# logistic-regression benchmark so the two reports are comparable.
cols_ord_encod = ['income_level', 'credit_rating']
cols_onehot_encod = [
    'reward',
    'mailer_type',
    'overdraft_protection',
    'own_your_home',
]
predictions, classification_report = skt.score_classification_model(
    df=df,
    target='offer_accepted',
    model=KNeighborsClassifier(),
    cols_to_encode=[cols_ord_encod, cols_onehot_encod],
    scaler=StandardScaler(),
    encoders=[OrdinalEncoder(), OneHotEncoder()],
    outsiders_thresh=None,
)

# The report is a preformatted text table, so print it rather than display it.
print(classification_report)

              precision    recall  f1-score   support

          No       0.94      1.00      0.97      5079
         Yes       0.19      0.01      0.02       314

    accuracy                           0.94      5393
   macro avg       0.57      0.50      0.50      5393
weighted avg       0.90      0.94      0.91      5393



The Logistic Regression model is not able to predict any credit card offer <br> acceptance: <br>
precision and recall for the `Yes` class are both zero, probably because the target <br>
data is highly imbalanced (314 `Yes` vs. 5079 `No` in the test set).<br>
The KNN classifier performs only marginally better (`Yes` recall 0.01, F1 0.02); it may still be the right algorithm for the <br>
problem once EDA, correlations and data imbalance are taken into account.