## 2. Features
Looking at the EDA, there are several data-cleaning items to address as well as features that we can build.

Drop Columns:
- Z_CostContact
- Z_Revenue

Fix items:
- drop ages over 100
- impute income
- Marital status: merge the rare values (YOLO, Alone, Absurd) into Single

Features:
- create age
- see how long the customer has been a client
- create total children


In [72]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import datetime

In [65]:
# Load the raw campaign data; the file is semicolon-delimited.
data = pd.read_csv('marketing_campaign.csv', sep=';')

In [73]:
class FeatureEngineering:
    """Clean the customer table and derive the Age, Tenure and Total_Children features.

    In ``'train'`` mode the income outlier cutoff and the median income are
    computed from the data held by this instance and stored on
    ``income_cutoff`` / ``median_income``. In any other mode those two
    attributes must be assigned by the caller (typically copied from a
    previously processed training instance) before calling ``preprocessing``.

    Args:
        data: Frame containing at least the columns ID, Year_Birth,
            Dt_Customer, Kidhome, Teenhome, Marital_Status, Income,
            Z_CostContact and Z_Revenue.
        preprocessing_type: ``'train'`` to fit the income statistics; any
            other value reuses the statistics already set on the instance.
        reference_year: Year used to compute Age and Tenure. Defaults to the
            current calendar year; pass a fixed value for reproducible output.
    """

    def __init__(self, data: pd.DataFrame, preprocessing_type: str = 'test',
                 reference_year=None):
        self.data = data
        self.median_income = None
        self.income_cutoff = None
        self.preprocessing_type = preprocessing_type
        self.reference_year = reference_year

    def preprocessing(self) -> None:
        """Run every cleaning and feature step and store the result in ``self.data``.

        The frame passed to the constructor is never modified; ``self.data``
        is rebound to a new frame only once all steps have succeeded.

        Raises:
            ValueError: If not in ``'train'`` mode and ``median_income`` or
                ``income_cutoff`` has not been set.
        """
        year = self.reference_year
        if year is None:
            year = datetime.datetime.now().year

        # Calculate Age and remove outliers. ``assign`` returns a new frame, so
        # the caller's DataFrame does not silently gain an 'Age' column, and the
        # explicit copy makes the filtered result safe to assign into.
        frame = self.data.assign(Age=year - self.data['Year_Birth'])
        frame = frame[frame['Age'] < 100].copy()

        # Calculate Tenure (whole years since the customer enrolled).
        frame['Dt_Customer'] = pd.to_datetime(frame['Dt_Customer'])
        frame['Tenure'] = year - frame['Dt_Customer'].dt.year

        # Create Total Children
        frame['Total_Children'] = frame['Kidhome'] + frame['Teenhome']

        # Fix Marital Status: fold the rare categories into 'Single'.
        frame['Marital_Status'] = frame['Marital_Status'].replace(
            ['YOLO', 'Alone', 'Absurd'], 'Single'
        )

        # Remove outliers and impute median income.
        if self.preprocessing_type == 'train':
            # Fit on this instance's own frame, not on a module-level variable.
            observed_income = frame['Income'].dropna()
            q1, q3 = np.quantile(observed_income, [0.25, 0.75])
            self.income_cutoff = q3 + 1.5 * (q3 - q1)
            self.median_income = frame.loc[
                frame['Income'] < self.income_cutoff, 'Income'
            ].median()
        elif self.median_income is None or self.income_cutoff is None:
            raise ValueError(
                "median_income and income_cutoff must be set (e.g. copied from "
                "a 'train' instance) before preprocessing in non-train mode"
            )

        # Incomes above the cutoff and missing incomes both become the median.
        frame.loc[frame['Income'] > self.income_cutoff, 'Income'] = self.median_income
        frame['Income'] = frame['Income'].fillna(self.median_income)

        # Set ID as index
        frame = frame.set_index('ID')

        # Drop the source columns that were turned into features, plus the two
        # Z_* columns listed for removal.
        frame = frame.drop(
            ['Year_Birth', 'Dt_Customer', 'Z_CostContact', 'Z_Revenue'], axis=1
        )

        self.data = frame

    def get_data(self) -> pd.DataFrame:
        """Return the frame currently held by the instance."""
        return self.data

In [74]:
# 'train' mode: the income cutoff and median are computed during this run and
# stay available on processing_train for reuse on other data.
processing_train = FeatureEngineering(data, preprocessing_type='train')
processing_train.preprocessing()
preprocessed_data = processing_train.get_data()

In [77]:
# Preview the first rows of the processed frame.
preprocessed_data.head()

Unnamed: 0_level_0,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Age,Tenure,Total_Children
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5524,Graduation,Single,58138.0,0,0,58,635,88,546,172,...,0,0,0,0,0,0,1,66,11,0
2174,Graduation,Single,46344.0,1,1,38,11,1,6,2,...,0,0,0,0,0,0,0,69,9,2
4141,Graduation,Together,71613.0,0,0,26,426,49,127,111,...,0,0,0,0,0,0,0,58,10,0
6182,Graduation,Together,26646.0,1,0,26,11,4,20,10,...,0,0,0,0,0,0,0,39,9,1
5324,PhD,Married,58293.0,1,0,94,173,43,118,46,...,0,0,0,0,0,0,0,42,9,1


In [80]:
# Summary statistics of the numeric columns, as a sanity check on the cleaning steps.
preprocessed_data.describe()

Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,Age,Tenure,Total_Children
count,2205.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,...,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0,2237.0
mean,51622.094785,0.444345,0.506482,49.104604,303.99553,26.270451,166.916853,37.523022,27.068842,43.968708,...,0.072865,0.074654,0.072418,0.064372,0.013411,0.008941,0.149307,54.098346,9.972284,0.950827
std,20713.063826,0.538467,0.544593,28.956073,336.574382,39.715972,225.661158,54.639909,41.293949,52.054318,...,0.259974,0.26289,0.259237,0.245469,0.115052,0.094152,0.356471,11.701917,0.684704,0.752037
min,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,9.0,0.0
25%,35196.0,0.0,0.0,24.0,24.0,1.0,16.0,3.0,1.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,10.0,0.0
50%,51287.0,0.0,0.0,49.0,174.0,8.0,67.0,12.0,8.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,10.0,1.0
75%,68281.0,1.0,1.0,74.0,504.0,33.0,232.0,50.0,33.0,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,10.0,1.0
max,113734.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,263.0,362.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,83.0,11.0,3.0


In [79]:
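# Example (disabled): preprocess a test set by reusing the income statistics
# fitted on the training run instead of recomputing them.
# preprocessing_test = FeatureEngineering(data, preprocessing_type='test')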
# preprocessing_test.median_income = processing_train.median_income
# preprocessing_test.income_cutoff = processing_train.income_cutoff
# preprocessing_test.preprocessing()
# preprocessing_test.get_data()