## 5. Predict


In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import datetime
import pickle

In [5]:
# Load the raw campaign table; the CSV uses ';' as its field separator.
data = pd.read_csv('marketing_campaign.csv', delimiter=';')

### Preprocessing, Feature Generation

In [2]:
class FeatureEngineering:
    """Clean the marketing-campaign frame and derive model features.

    The income outlier cutoff and the replacement median are learned when
    ``preprocessing_type == 'train'`` and stored on the instance as
    ``income_cutoff`` / ``median_income``. For any other mode they must be
    assigned on the instance (e.g. copied from a fitted training instance)
    before ``preprocessing()`` is called.

    Parameters
    ----------
    data : pd.DataFrame
        Raw frame. ``preprocessing()`` reads the columns ``ID``,
        ``Year_Birth``, ``Dt_Customer``, ``Kidhome``, ``Teenhome``,
        ``Marital_Status``, ``Education``, ``Income``, ``Z_CostContact`` and
        ``Z_Revenue``.
    preprocessing_type : str, default 'predict'
        ``'train'`` learns the income statistics from ``data``; any other
        value reuses the statistics already stored on the instance.
    """

    def __init__(self, data, preprocessing_type: str = 'predict'):
        self.data = data
        self.median_income = None
        self.income_cutoff = None
        self.preprocessing_type = preprocessing_type

    def preprocessing(self):
        """Run the full cleaning / feature pipeline and store the result in ``self.data``.

        The frame passed to the constructor is not modified; ``self.data`` is
        rebound to the processed copy.

        Raises
        ------
        ValueError
            If the mode is not ``'train'`` and ``income_cutoff`` or
            ``median_income`` has not been set on the instance.
        """
        is_train = self.preprocessing_type == 'train'
        if not is_train and (self.income_cutoff is None or self.median_income is None):
            # Fail early with a clear message instead of silently skipping the
            # outlier capping and failing later inside fillna().
            raise ValueError(
                "income_cutoff and median_income must be set before preprocessing "
                "when preprocessing_type is not 'train'"
            )

        # Work on a private copy so the caller's frame is never mutated.
        frame = self.data.copy()

        # Age and Tenure are relative to the current calendar year, so their
        # values shift when the pipeline is re-run in a later year.
        current_year = datetime.datetime.now().year

        # Calculate Age and remove outliers (rows with Age >= 100 are dropped).
        frame['Age'] = current_year - frame['Year_Birth']
        # Explicit copy: later column assignments must not target a slice view.
        frame = frame[frame['Age'] < 100].copy()

        # Calculate Tenure in whole calendar years since enrolment.
        frame['Dt_Customer'] = pd.to_datetime(frame['Dt_Customer'])
        frame['Tenure'] = current_year - frame['Dt_Customer'].dt.year

        # Create Total Children
        frame['Total_Children'] = frame['Kidhome'] + frame['Teenhome']

        # Fix Marital Status: fold the rare labels into 'Single'.
        frame['Marital_Status'] = frame['Marital_Status'].replace(['YOLO', 'Alone', 'Absurd'], 'Single')

        # Learn the income outlier cutoff (Q3 + 1.5 * IQR) and the median of the
        # non-outlier incomes from this instance's own frame, not from a global.
        if is_train:
            known_income = frame['Income'].dropna()
            q1, q3 = np.quantile(known_income, [0.25, 0.75])
            self.income_cutoff = q3 + 1.5 * (q3 - q1)
            self.median_income = frame.loc[frame['Income'] < self.income_cutoff, 'Income'].median()

        # Replace outliers and missing incomes with the stored median.
        frame.loc[frame['Income'] > self.income_cutoff, 'Income'] = self.median_income
        frame['Income'] = frame['Income'].fillna(self.median_income)

        # Set ID as index
        frame = frame.set_index('ID')

        # One hot encode categorical variables.
        # NOTE(review): the dummy columns depend on the categories present in
        # this frame; confirm prediction data yields the same columns as training.
        frame = pd.get_dummies(frame, columns=['Education', 'Marital_Status'], drop_first=True)

        # Drop the source columns that were replaced by derived features and the
        # Z_CostContact / Z_Revenue columns.
        self.data = frame.drop(['Year_Birth', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'], axis=1)

    def get_data(self) -> pd.DataFrame:
        """Return the current frame (processed once ``preprocessing()`` has run)."""
        return self.data

In [3]:
# Restore the pickled training-time preprocessor, then show the income
# statistics it carries (median used for imputation and the outlier cutoff).
# NOTE: pickle.load executes code from the file; only load trusted pickles.
with open('train_processing.pkl', 'rb') as inp:
    processing_train = pickle.load(inp)

print(processing_train.median_income)
print(processing_train.income_cutoff)

50737.0
118350.5


In [6]:
# Build a predict-mode preprocessor and hand it the income statistics that
# were learned on the training data, so no statistics are re-fitted here.
preprocessing_test = FeatureEngineering(data, 'predict')
preprocessing_test.income_cutoff = processing_train.income_cutoff
preprocessing_test.median_income = processing_train.median_income

# Run the pipeline and keep the processed frame for scoring.
preprocessing_test.preprocessing()
preprocessed_Xtest = preprocessing_test.get_data()

### Predict

In [7]:
import pickle
# Load the pickled model (presumably a LightGBM estimator, per the file name).
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load executes code from the file; only load trusted pickles.
with open('lightgbm_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [8]:
# Score every row using all columns except the label. A 'predict' column left
# over from an earlier run of this cell is dropped too, so re-running the cell
# never feeds old predictions back to the model as a feature. A missing
# 'Response' column still raises, as before.
prediction_features = (
    preprocessed_Xtest
    .drop(columns=['Response'])
    .drop(columns=['predict'], errors='ignore')
)
preprocessed_Xtest['predict'] = model.predict(prediction_features)



In [9]:
from sklearn.metrics import confusion_matrix

# Compare the actual Response labels with the model's predictions.
confusion_matrix(
    y_true=preprocessed_Xtest['Response'],
    y_pred=preprocessed_Xtest['predict'],
)

array([[1879,   24],
       [ 109,  225]], dtype=int64)

In [10]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the predictions.
# NOTE(review): the scored frame comes from marketing_campaign.csv as a whole;
# if the model was trained on part of that file these metrics are optimistic.
# Confirm against the training notebook.
print(
    classification_report(
        y_true=preprocessed_Xtest['Response'],
        y_pred=preprocessed_Xtest['predict'],
    )
)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1903
           1       0.90      0.67      0.77       334

    accuracy                           0.94      2237
   macro avg       0.92      0.83      0.87      2237
weighted avg       0.94      0.94      0.94      2237

