# Imports and Loading

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Plot theme: one background colour for axes and figure, one foreground colour
# for every line/label element, and no tick marks or spines.
_BACKGROUND = '#49515f'
_FOREGROUND = 'white'
_SANS_SERIF_FONTS = [
    'Arial',
    'DejaVu Sans',
    'Liberation Sans',
    'Bitstream Vera Sans',
    'sans-serif',
]

style_dict = {
    'axes.facecolor': _BACKGROUND,
    'axes.edgecolor': _FOREGROUND,
    'axes.grid': True,
    'axes.axisbelow': True,
    'axes.labelcolor': _FOREGROUND,
    'figure.facecolor': _BACKGROUND,
    'grid.color': _FOREGROUND,
    'grid.linestyle': '-',
    'text.color': _FOREGROUND,
    'xtick.color': _FOREGROUND,
    'ytick.color': _FOREGROUND,
    'lines.solid_capstyle': 'round',
    'patch.edgecolor': _FOREGROUND,
    'patch.force_edgecolor': True,
    'image.cmap': 'rocket',
    'font.sans-serif': _SANS_SERIF_FONTS,
}
# Every tick mark and every spine is switched off.
style_dict.update({
    flag: False
    for flag in (
        'xtick.bottom', 'xtick.top', 'ytick.left', 'ytick.right',
        'axes.spines.left', 'axes.spines.bottom',
        'axes.spines.right', 'axes.spines.top',
    )
})

# Start from seaborn's "whitegrid" preset and override it with the entries above.
sns.set_style("whitegrid", rc=style_dict)


In [None]:
# Read the "Data Dict" sheet: header=1 takes the second row as the header and
# usecols=[1, 2] keeps only the second and third spreadsheet columns.
col_info = pd.read_excel(
    "dataset.xlsx",
    sheet_name="Data Dict",
    usecols=[1, 2],
    header=1,
)
col_info

In [None]:
# Load the "dataset" sheet of the workbook into the working DataFrame.
df = pd.read_excel("dataset.xlsx", sheet_name="dataset")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts (shows which columns contain missing values).
df.info()

# EDA

In [None]:
# Class balance of the target: the raw counts first, then the same as a bar chart.
churn_counts = df["Churn"].value_counts()
print(churn_counts)
sns.countplot(x="Churn", data=df)

#### The target variable "Churn" is imbalanced; this matters for the decisions we make in the analysis and modeling.

-----------------

In [None]:
# One violin plot per numeric column, drawn on a fixed 3x5 grid of axes.
num_cols = list(df.select_dtypes(include='number').columns)

fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(18, 9))
axes = axes.ravel()  # flatten so the i-th column maps to the i-th axis

for i, col in enumerate(num_cols):
    ax = axes[i]
    sns.violinplot(data=df, y=col, ax=ax)
    ax.set_title(col.capitalize())
    ax.set_xlabel(col)
    ax.grid(False)

plt.tight_layout()
plt.show()

#### I can see that there are outliers in Tenure, WarehouseToHome, NumberOfAddress, CouponUsed, DaySinceLastOrder and CashbackAmount.
This is relevant to our choice of model and to whether we want to remove these outliers or not.

-------------

In [None]:
# Density of every numeric column, split by Churn. common_norm=False normalises
# each Churn group separately, so the curves stay comparable in shape.
num_cols = list(df.select_dtypes(include='number').columns)

fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(18, 9))
axes = axes.ravel()

kde_options = dict(hue="Churn", common_norm=False, warn_singular=False)
for i, col in enumerate(num_cols):
    ax = axes[i]
    sns.kdeplot(data=df, x=col, ax=ax, **kde_options)
    ax.set_title(col.capitalize())
    ax.set_xlabel(col)
    ax.grid(False)

plt.tight_layout()
plt.show()

#### Here we can see that there is a noticeable difference between people who churned and those who didn't.
Tenure is our first candidate for the best predictor of churn.
The second-best predictor seems to be the Complain column.
Following those columns, SatisfactionScore and CityTier seem to carry some information about churn.

----------

In [None]:
# Pairwise correlations of the numeric columns, with the CustomerID column left out.
corr_matrix = df[num_cols].drop(columns="CustomerID").corr()
sns.heatmap(corr_matrix, cmap="vlag", vmin=-1, vmax=1)

#### There aren't any columns strongly correlated with Churn; as mentioned, the strongest ones seem to be Tenure and Complain.
We also see a few columns that are correlated with each other, which affects our choice of model.

-----------

In [None]:
# Percentage histograms of every non-numeric column (plus Churn itself), split by Churn.
cat_cols = list(df.select_dtypes(exclude='number').columns) + ["Churn"]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 9))
axes = axes.ravel()

for i, col in enumerate(cat_cols):
    ax = axes[i]
    sns.histplot(data=df, x=col, hue="Churn", stat="percent", common_norm=False, ax=ax)
    ax.set_title(col.capitalize())
    ax.set_xlabel(col)
    ax.grid(False)

plt.tight_layout()
plt.show()

#### We see here that a few categories from the PreferedOrderCat, MaritalStatus and PreferredPaymentMode columns are correlated with Churn.
I also notice a few duplicated categories in PreferredLoginDevice and PreferedOrderCat.

-------------

In [None]:
def _plot_missingness(ax, frame, title):
    """Draw a heatmap of ``frame.isna()`` on ``ax``.

    Rows follow the order of ``frame``; one cell per (row, column) marks whether
    the value is missing. Grey vertical lines separate the columns, row ticks are
    hidden and the column labels are moved to the top and rotated.
    """
    sns.heatmap(frame.isna(), vmin=-1, vmax=1, cmap="vlag", ax=ax)
    n_cols = frame.shape[1]
    # Separator lines at every interior column boundary, spanning the full height.
    ax.vlines(range(1, n_cols), *ax.get_ylim(), colors='grey', linewidths=1)
    ax.set_yticks([])
    ax.tick_params(labeltop=True, labelbottom=False)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=85)
    ax.set_title(title)


fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))
axes = axes.ravel()

# Left: rows in the order they were loaded. Right: the same rows sorted by
# CashbackAmount, to show whether missingness follows that column.
_plot_missingness(axes[0], df, "Random order")
_plot_missingness(axes[1], df.sort_values("CashbackAmount"), "Sorted by CashbackAmount")
plt.show()

#### At first the missing values appear to be missing completely at random, but after sorting the data by CashbackAmount we can see that they are not completely random.

I tried to understand whether these missing values are random or not. Since I don't have the context of where and how this data was collected, or what exactly each column refers to, I can't make a better judgement on the type of missingness, so I'm going to treat the values as missing at random.

The missing values in the Churn column, however, do seem to be missing completely at random.

# Preprocessing

## Imports

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Missing Value

## Na Churns

Since there are rows where the Churn column is NA, I can't use those rows for classification, so I'm going to remove them from the data; they can be used later in non-classification modeling.

In [None]:
# Keep only the rows that have a Churn label.
has_label = df["Churn"].notna()
df = df[has_label]

## Train Test Split

In [None]:
# Move the customer id into the index so it is not used as a feature. The guard
# lets this cell be re-run after the column has already been moved.
if "CustomerID" in df.columns:
    df = df.set_index("CustomerID", drop=True)

# drop_duplicates() returns a new frame; the result has to be assigned back,
# otherwise the duplicated rows stay in the data.
df = df.drop_duplicates()

X = df.drop("Churn", axis=1)
y = df["Churn"]

# Hold out 10% of the rows for the final evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Columns routed through the numeric preprocessing branch.
numeric_cols = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'NumberOfDeviceRegistered',
    'SatisfactionScore',
    'NumberOfAddress',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
    'CashbackAmount',
]

In [None]:
# Columns routed through the categorical preprocessing branch.
categoric_cols = [
    'PreferredLoginDevice',
    'PreferredPaymentMode',
    'Gender',
    'PreferedOrderCat',
    'MaritalStatus',
    'CityTier',
    'Complain',
]

## Data Imputation

In [None]:
class CategoryTransformer:
    """Pipeline step that rewrites category labels in one DataFrame column.

    Parameters
    ----------
    maps : dict
        ``{old_label: new_label}`` pairs. They are applied one after another in
        the dict's iteration order.
    col_name : str
        Name of the column whose values are rewritten.
    """

    def __init__(self, maps, col_name):
        self.category_maps = maps
        self.col_name = col_name

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` with the configured labels replaced.

        The input frame is left untouched. The previous implementation called
        ``X[col].replace(..., inplace=True)``, which mutated the caller's frame
        and does nothing at all when pandas copy-on-write is active.
        """
        X = X.copy()
        column = X[self.col_name]
        for key, val in self.category_maps.items():
            column = column.replace(key, val)
        X[self.col_name] = column
        return X

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y, **fit_params).transform(X)

In [None]:
# Merge labels that differ only in spelling into a single category per column.
PreferredLoginDeviceTransformer = CategoryTransformer(
    maps={"Phone": "Mobile Phone"},
    col_name="PreferredLoginDevice",
)
PreferedOrderCatTransformer = CategoryTransformer(
    maps={"Mobile": "Mobile Phone"},
    col_name="PreferedOrderCat",
)

In [None]:
# Categorical branch: unify labels, fill gaps with the most frequent value,
# then one-hot encode while dropping the first level of each column.
CategoicTransformer = Pipeline(steps=[
    ('prefered_order_cat_transformer', PreferedOrderCatTransformer),
    ('preferred_login_device_transformer', PreferredLoginDeviceTransformer),
    ('frequent_imputer_categoric', SimpleImputer(strategy="most_frequent")),
    ('onehot_encoding_categoric', OneHotEncoder(drop="first")),
])

# Numeric branch: fill gaps with the column median.
NumericTransformer = Pipeline(steps=[
    ('median_imputer_numeric', SimpleImputer(strategy="median")),
])

In [None]:
# Route each column list through its own branch; the numeric output comes first.
transformers = ColumnTransformer(transformers=[
    ('numerical_transformer', NumericTransformer, numeric_cols),
    ('categorical_transformer', CategoicTransformer, categoric_cols),
])

In [None]:
# Display the assembled preprocessing object.
transformers

# Modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the "clf" step of the pipeline.
params_grid = {
    'clf__n_estimators': [100, 250, 500],
    'clf__max_features': ['log2', 'sqrt'],
    'clf__max_depth': [10, 50, 100, None],
    # An integer min_samples_split must be >= 2; the former value 1 is rejected
    # by RandomForestClassifier and made every fit that used it fail.
    # NOTE(review): with min_samples_leaf >= 2 a node already needs at least 4
    # samples to be split, so 2 and 4 may give identical trees; consider larger values.
    'clf__min_samples_split': [2, 4],
    'clf__min_samples_leaf': [2, 5, 10],
    'clf__bootstrap': [True, False],
    'clf__class_weight': ['balanced'],
}

In [None]:
# Preprocessing followed by the classifier. The forest is seeded so that the
# grid-search scores and the selected model are reproducible between runs.
pipeline = Pipeline(steps=[
    ("preprocessing", transformers),
    ("clf", RandomForestClassifier(random_state=42)),
])

In [None]:
# Exhaustive search over params_grid, scored by macro-averaged F1 with 2-fold CV.
macro_f1_scorer = make_scorer(f1_score, average='macro')
clf_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params_grid,
    scoring=macro_f1_scorer,
    cv=2,
    n_jobs=1,
    verbose=1,
)

In [None]:
# Run the grid search on the training split only.
clf_search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the search.
clf_search.best_params_

In [None]:
# Fitted pipeline (preprocessing + classifier) for the selected parameters.
best_clf = clf_search.best_estimator_

In [None]:
from pathlib import Path

from joblib import load  # `load` was used here without ever being imported

# Optionally replace the freshly searched model with a previously saved one.
# If the file is absent, the estimator from the grid search above is kept
# instead of the cell failing.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files you created yourself.
saved_model_path = Path("clf3.joblib")
if saved_model_path.exists():
    best_clf = load(saved_model_path)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
y_pred = best_clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Names for the preprocessed feature matrix: the numeric columns first, followed
# by the hand-written one-hot column names.
# NOTE(review): these names are hard-coded and must match the encoder's actual
# output (order and dropped first level) — verify against the fitted pipeline.
cols = [
    *numeric_cols,
    'PreferredLoginDevice_Mobile Phone',
    'PreferredPaymentMode_COD',
    'PreferredPaymentMode_Cash on Delivery',
    'PreferredPaymentMode_Credit Card',
    'PreferredPaymentMode_Debit Card',
    'PreferredPaymentMode_E wallet',
    'PreferredPaymentMode_UPI',
    'Gender_Male',
    'PreferedOrderCat_Grocery',
    'PreferedOrderCat_Laptop & Accessory',
    'PreferedOrderCat_Mobile Phone',
    'PreferedOrderCat_Others',
    'MaritalStatus_Married',
    'MaritalStatus_Single',
    'CityTier_2',
    'CityTier_3',
    'Complain_1',
]

In [None]:
# Importances come from the second pipeline step (the classifier) and are paired
# with the feature names in `cols`.
feature_importance = pd.DataFrame({
    'importance': best_clf[1].feature_importances_,
    'features': cols,
})
ranked_importance = feature_importance.sort_values(by='importance', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(data=ranked_importance, x='importance', y='features')
plt.title('Feature importances')
plt.show()

In [None]:
from joblib import dump

In [None]:
# Persist the selected pipeline to disk.
dump(best_clf, "clf.joblib")