In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from the Excel workbook (path is relative to the working directory).
df = pd.read_excel("marketing_data.xlsx")

In [None]:
# Display the loaded frame for a first look.
df

## Preprocessing

### Handling null values

In [None]:
# Count the missing values in each column.
df.isna().sum()

Possible ways of dealing with the null values:

- Fill null values of Income with the median of their respective groups
- Fill null values of Income with 0
- Remove rows with null values

Sidenote: Since previous testing showed only a weak relationship between income and education, and between income and marital status, I chose the option of removing the rows with null values.


In [None]:
# Discard every row whose Income is missing, then renumber the index from 0.
df = (
    df
    .dropna(subset=['Income'])
    .reset_index(drop=True)
)

### Feature Engineering

Create `Total_children`

In [None]:
# Sum Kidhome and Teenhome into a single Total_children column.
df = df.assign(
    Total_children=lambda frame: frame['Kidhome'] + frame['Teenhome'],
)

In [None]:
# Bar chart of how many rows fall into each Total_children value.
sns.barplot(df.Total_children.value_counts())

Removing entries whose birth year implies the person could not realistically be alive today

In [None]:
# Inspect the distinct birth years (sorted) and their extremes to spot implausible values.
print(np.sort(df.Year_Birth.unique()))
print(df.Year_Birth.min())
print(df.Year_Birth.max())

In [None]:
# Keep only rows with a birth year of 1940 or later.
# `.copy()` makes the filtered result an independent frame, so the column
# assignments performed in later cells write to `df` itself and do not raise
# pandas' SettingWithCopyWarning.
df = df[df.Year_Birth >= 1940].copy()

Convert `Dt_Customer` to datetime and create a column for days since they first became a customer

In [None]:
# Peek at the Dt_Customer values before converting them.
df.Dt_Customer

In [None]:
from datetime import datetime

# Parse Dt_Customer into pandas datetimes and store the result back in the same column.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])

Checking for future dates

In [None]:
# Show the rows whose Dt_Customer lies after the moment this cell is executed.
df.loc[df['Dt_Customer'].gt(datetime.now())]

Checking for customer enrollment dates that precede the customer's birth year

In [None]:
# Show the rows where the year of Dt_Customer is earlier than Year_Birth.
df.loc[df['Dt_Customer'].dt.year.lt(df['Year_Birth'])]

In [None]:
# Reference point is the wall-clock time at which this cell runs, so the
# resulting day counts change from one run of the notebook to the next.
curr_date = datetime.now()
# Whole days elapsed between Dt_Customer and the reference point.
df['Days_Since_Customer'] = (curr_date - df.Dt_Customer).dt.days

In [None]:
# Display the newly created Days_Since_Customer column.
df.Days_Since_Customer

Dropping records whose marital status is one of the following: `YOLO`, `Absurd`, `Alone`

In [None]:
# Remove the rows whose Marital_Status is one of the labels listed below and
# renumber the index from 0.
# The result has to be assigned back to `df`: filtering/dropping returns a new
# frame, so without the assignment the rows would remain in `df`.
df = df[~df.Marital_Status.isin(['YOLO', 'Absurd', 'Alone'])].reset_index(drop=True)

### Removing outliers

Creating the function for removing outliers in continuous variables

In [None]:
from scipy.stats import zscore

def remove_outliers(data, column, z_thresh=2):
    """Return the rows of ``data`` whose ``column`` value is not a z-score outlier.

    A row is kept when the absolute z-score of its ``column`` value is strictly
    below ``z_thresh``. Z-scores come from ``scipy.stats.zscore`` with
    ``nan_policy='omit'``, so missing values are ignored when computing the
    statistics; rows whose value is missing are themselves dropped because a
    NaN z-score never compares below the threshold.

    If the column has at most one distinct non-missing value, the z-score is
    undefined (0/0). In that case nothing is treated as an outlier and every
    row with a non-missing value is kept, instead of silently discarding the
    whole frame.

    The shape of the frame before and after filtering is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of the numeric column the z-scores are computed on.
    z_thresh : float, default 2
        Exclusive upper bound on the absolute z-score of the rows to keep.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``data``, with their original index labels.
    """
    values = data[column]

    if values.nunique(dropna=True) <= 1:
        # Zero spread: every z-score would be NaN, which would drop all rows.
        keep = values.notna().to_numpy()
    else:
        z_scores = np.abs(zscore(values, nan_policy='omit'))
        # NaN z-scores compare False here, so rows with missing values are excluded.
        keep = np.asarray(z_scores < z_thresh)

    print(data.shape)
    # Select by position so the result does not depend on the index labels.
    no_outliers = data.iloc[np.flatnonzero(keep)]
    print(no_outliers.shape)
    return no_outliers

Removing outliers for `Mnt.*` columns

In [None]:
# Skewness and density of each Mnt* column, one panel per column (2 x 3 grid).
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 6))

for ax, col in zip(
    axes.ravel(),
    [
        'MntWines',
        'MntFishProducts',
        'MntMeatProducts',
        'MntFruits',
        'MntGoldProds',
        'MntSweetProducts',
    ],
):
    print(f'{col} skew: {df[col].skew()}')
    sns.kdeplot(data=df, x=col, fill=True, ax=ax)

fig.tight_layout()
plt.show()

In [None]:
# Filter on one Mnt* column after another. `df` is reassigned on every pass, so
# each column's z-scores are computed on the rows that survived the previous passes.
for col in ['MntWines', 'MntFishProducts', 'MntMeatProducts', 'MntFruits', 'MntGoldProds', 'MntSweetProducts']:
  df = remove_outliers(df, col)

In [None]:
# Same skewness / density view of the Mnt* columns, drawn from the current `df`.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 6))

for ax, col in zip(
    axes.ravel(),
    [
        'MntWines',
        'MntFishProducts',
        'MntMeatProducts',
        'MntFruits',
        'MntGoldProds',
        'MntSweetProducts',
    ],
):
    print(f'{col} skew: {df[col].skew()}')
    sns.kdeplot(data=df, x=col, fill=True, ax=ax)

fig.tight_layout()
plt.show()

Removing outliers for `Num.*` columns

In [None]:
# Skewness and density of each Num*Purchases column, split by Response (2 x 2 grid).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 6))

for ax, col in zip(
    axes.ravel(),
    [
        'NumDealsPurchases',
        'NumCatalogPurchases',
        'NumStorePurchases',
        'NumWebPurchases',
    ],
):
    print(f'{col} skew: {df[col].skew()}')
    sns.kdeplot(data=df, x=col, hue='Response', fill=True, ax=ax)

fig.tight_layout()
plt.show()

In [None]:
# Filter on one Num*Purchases column after another. `df` is reassigned on every pass,
# so each column's z-scores are computed on the rows that survived the previous passes.
for col in ['NumDealsPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebPurchases']:
  df = remove_outliers(df, col)

In [None]:
# Same skewness / density view of the Num*Purchases columns, drawn from the current `df`.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 6))

for ax, col in zip(
    axes.ravel(),
    [
        'NumDealsPurchases',
        'NumCatalogPurchases',
        'NumStorePurchases',
        'NumWebPurchases',
    ],
):
    print(f'{col} skew: {df[col].skew()}')
    sns.kdeplot(data=df, x=col, hue='Response', fill=True, ax=ax)

fig.tight_layout()
plt.show()

### One-Hot Encoding

In [None]:
def one_hot_encode(data, column):
    """Replace ``column`` with one-hot indicator columns, dropping the first level.

    The remaining columns keep their original order and the indicator columns
    are appended after them, named ``<column>_<level>``. The first level is
    omitted (``drop_first=True``), so a column with ``k`` distinct values
    yields ``k - 1`` indicators.

    The encoding is done in a single ``pd.get_dummies`` call rather than by
    joining a separately encoded frame on the index, so the number of rows is
    preserved even when ``data`` has repeated index labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : label
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the indicator columns added.
    """
    return pd.get_dummies(data, columns=[column], drop_first=True)

In [None]:
# One-hot encode the two categorical columns; each call replaces the named
# column with its indicator columns.
df = one_hot_encode(df, 'Education')
df = one_hot_encode(df, 'Marital_Status')

In [None]:
# Display the frame after encoding.
df

### Interaction features

In [None]:
# NOTE(review): the wildcard import hides which sklearn names the notebook relies on;
# StandardScaler is the only one used in this cell.
from sklearn.preprocessing import *

# Scaler instance; it is created here but not fitted or applied in this cell.
sc = StandardScaler()

## Testing

In [None]:
# NOTE(review): the wildcard import hides which sklearn names the notebook relies on.
from sklearn.model_selection import *

# Features are every column except the target.
# NOTE(review): no cell above drops Dt_Customer, so X still contains that datetime
# column - confirm the downstream estimator can handle it.
X = df.drop('Response', axis=1)
# Target column.
y = df.Response

In [None]:
# kf = KFold(n_splits=9, shuffle=True)

In [None]:
# 5-fold stratified splitter; no shuffle or random_state is passed, so the library defaults apply.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Print the shape of each fold's test-index array to check the fold sizes.
for (train, test) in skf.split(X, y):
  print(test.shape)