## 4. Full Train
Make sure that the code is resuable for future predictions

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import datetime
import pickle

In [2]:
data = pd.read_csv('marketing_campaign.csv', sep=';')

### Train-test Split, Preprocessing, Feature Generation

In [3]:
# Create train test split
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.1, random_state=42)

In [4]:
class FeatureEngineering:
    def __init__(self, data, preprocessing_type:'test'):
        self.data = data
        self.median_income = None
        self.income_cutoff = None
        self.preprocessing_type = preprocessing_type

    def preprocessing(self):
        # Calculate Age and remove outliers
        self.data['Age'] = datetime.datetime.now().year - self.data['Year_Birth'] 
        self.data = self.data[self.data['Age'] < 100]

        # Calculate Tenure
        self.data['Dt_Customer'] = pd.to_datetime(self.data['Dt_Customer'])
        self.data['Tenure'] = datetime.datetime.now().year - self.data['Dt_Customer'].dt.year

        # Create Total Children
        self.data['Total_Children'] = self.data['Kidhome'] + self.data['Teenhome']

        # Fix Marital Status
        self.data['Marital_Status'] = self.data['Marital_Status'].replace(['YOLO', 'Alone', 'Absurd'], 'Single')

        # Remove outlier and impute median income
        if self.preprocessing_type == 'train':
            Q3 = np.quantile(data['Income'].dropna(), 0.75)
            Q1 = np.quantile(data['Income'].dropna(), 0.25)
            IQR = Q3 - Q1
            self.income_cutoff = Q3 + 1.5 * IQR
            self.median_income = self.data.loc[self.data['Income'] < self.income_cutoff, 'Income'].median()

        self.data.loc[self.data['Income'] > self.income_cutoff, 'Income'] = self.median_income
        self.data['Income'] = self.data['Income'].fillna(self.median_income)

        # Set ID as index
        self.data = self.data.set_index('ID')

        # One hot encode categorical variables
        self.data = pd.get_dummies(self.data, columns=['Education', 'Marital_Status'], drop_first=True)

        # Drop columns
        self.data = self.data.drop(['Year_Birth', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'], axis=1)
    
    def get_data(self) -> pd.DataFrame:
        return(self.data)

In [5]:
processing_train = FeatureEngineering(train, preprocessing_type='train')
processing_train.preprocessing()
preprocessed_Xtrain = processing_train.get_data()

In [6]:
with open('train_processing.pkl', 'wb') as outp:
    pickle.dump(processing_train, outp, pickle.HIGHEST_PROTOCOL)

In [7]:
with open('train_processing.pkl', 'rb') as inp:
    processing_train = pickle.load(inp)
    print(processing_train.median_income) 
    print(processing_train.income_cutoff)

50988.5
118350.5


In [8]:
preprocessing_test = FeatureEngineering(test, preprocessing_type='test')
preprocessing_test.median_income = processing_train.median_income
preprocessing_test.income_cutoff = processing_train.income_cutoff
preprocessing_test.preprocessing()
preprocessed_Xtest = preprocessing_test.get_data()

### Train

In [16]:
import lightgbm

model = lightgbm.LGBMClassifier() 

_params = {'bagging_fraction':0.7, 'bagging_freq':6, 'feature_fraction':0.5,
               'min_child_samples':66, 'min_split_gain':0.4, 'n_estimators':90,
               'n_jobs':-1, 'num_leaves':90, 'random_state':123, 'reg_alpha':0.0005,
               'reg_lambda':0.1}

model.set_params(**_params)

gbm = model.fit(preprocessed_Xtrain.drop('Response', axis=1), preprocessed_Xtrain['Response'])

[LightGBM] [Info] Number of positive: 294, number of negative: 1720
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1566
[LightGBM] [Info] Number of data points in the train set: 2014, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.145978 -> initscore=-1.766500
[LightGBM] [Info] Start training from score -1.766500




### Test

In [19]:
import pickle
pickle.dump(gbm, open('lightgbm_model.pkl', 'wb'))

In [20]:
import pickle
model = pickle.load(open('lightgbm_model.pkl', 'rb'))

In [17]:
preprocessed_Xtest['predict'] = model.predict(preprocessed_Xtest.drop(['Response'], axis=1))



In [18]:
from sklearn.metrics import confusion_matrix
confusion_matrix(preprocessed_Xtest['Response'], preprocessed_Xtest['predict'])

array([[177,   6],
       [ 26,  14]], dtype=int64)

In [19]:
from sklearn.metrics import classification_report
print(classification_report(preprocessed_Xtest['Response'], preprocessed_Xtest['predict']))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       183
           1       0.70      0.35      0.47        40

    accuracy                           0.86       223
   macro avg       0.79      0.66      0.69       223
weighted avg       0.84      0.86      0.84       223

