In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings("ignore")


## importing data sets 

In [None]:
df =  pd.read_csv("superstore_data.csv")

In [None]:
df

In [None]:
df.shape


In [None]:
df.info()

## calcuating summary stats

In [None]:
df.describe(include = "all")

## checking for duplicate values 

In [None]:
df.duplicated().sum()

## Feature Engineering

In [None]:

data = df.copy()


# Dropping column - Id
data.drop(columns=["Id"], inplace=True)

#converting to date time object
data.Dt_Customer = data.Dt_Customer.apply(lambda x : pd.to_datetime(str(x)))
data.Dt_Customer.describe()
data["Age"] = 2021 - pd.to_datetime(data["Year_Birth"], format="%Y").apply(lambda x: x.year)

data["Age"].sort_values()

# few observations with ages greater than 100
data[data["Age"] > 100]


In [None]:
data.drop(data[data["Age"] > 100].index, inplace=True)

In [None]:
# Extracting registration year from the date
data["Reg_year"] = data["Dt_Customer"].apply(lambda x: x.year)

# Extracting registration quarter from the date
data["Reg_quarter"] = data["Dt_Customer"].apply(lambda x: x.quarter)

# Extracting registration month from the date
data["Reg_month"] = data["Dt_Customer"].apply(lambda x: x.month)

# Extracting registration week from the date
data["Reg_week"] = data["Dt_Customer"].apply(lambda x: x.day // 7)

data.head()

In [None]:
cat_col = [
    "Education",
    "Marital_Status",
    "Kidhome",
    "Teenhome",
    "Complain",
    "Response",
    "Reg_year",
    "Reg_quarter",
    "Reg_month",
    "Reg_week",
]

# Printing number of count of each unique value in each column
for column in cat_col:
    print(data[column].value_counts())
    print("-____________________________________________---")

In [None]:
#     In education, 2n cycle and Master means the same thing. We can combine these two categories.
#     There are many categories in marital status. We can combine the categories 'Alone', 'Absurd' and 'YOLO' with 'Single' and 'Together' categories with 'Married'.
#     There are only 20 customers who complained in the last two years.
#     In 'Response' we have 1903 observations for the 0 class but only 334 observations for class 1.
#     There are only three years in the customer registration data.

In [None]:
# Replacing 2n Cycle with Master
data["Education"] = data["Education"].replace("2n Cycle", "Master")

In [None]:
# Replacing YOLO, Alone, Absurd with single and Together with Married
data["Marital_Status"] = data["Marital_Status"].replace(["YOLO", "Alone", "Absurd"], "Single")
data["Marital_Status"] = data["Marital_Status"].replace(["Together"], "Married")

In [None]:
data["Total_Amount_Spent"] = data[
    [
        "MntWines"      ,
        "MntFruits",
        "MntMeatProducts",
        "MntFishProducts",
        "MntSweetProducts",
        "MntGoldProds",
    ]
].sum(axis=1)


In [None]:
pd.pivot_table(
    data=data,
    index=["Reg_year", "Reg_month"],
    values="Total_Amount_Spent",
    aggfunc=np.sum,
).plot(kind="line", marker="o", linewidth=2, figsize=(12, 5))

In [None]:
plt.figure(figsize=(12, 5))
sns.regplot(y=data.Total_Amount_Spent, x=data.Income)

In [None]:
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to the show density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot will be created and a triangle will indicate the mean value of the column
    sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins
    ) if bins else sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2
    )  # For histogram
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--",
        
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
        #add median to the histogram
    ) 

## Exploratory data Analysis

## univariate analysis

In [None]:
#income observation
histogram_boxplot(data, "Income")

In [None]:
#income observation


# We can see there are some outliers in the income variable.
#   we can remove the data point on the extreme right end of the boxplot 



In [None]:
# Dropping observaion with income greater than 20000. Since there is just 1 such observation
data.drop(index=data[data.Income > 200000].index, inplace=True)

In [None]:
#meat products observation
histogram_boxplot(data, "MntMeatProducts")

In [None]:
# meat products observation


#     The distribution for the amount spent on meat products is highly skewed to the right.
#     We can see that there are some extreme observations in the variable that can be 
#     considered as outliers as they very far from the rest of the values.
#     We can cap the value of the variable to the next highest value.



In [None]:


# Checking 10 largest values of amount spend on meat products
data.MntMeatProducts.nlargest(10)


In [None]:
data[data["MntMeatProducts"] > 1580]

In [None]:
# what is clip?
# Trim values at input threshold(s). Assigns values outside boundary to boundary values.
# Thresholds can be singular values or array like, and in the latter case the clipping is performed element-wise 
# in the specified axis.
# Capping values for amount spent on meat products at next highest value i.e. 984
data["MntMeatProducts"].clip(upper=984, inplace=True)



In [None]:
#  MntSweetProducts observation
histogram_boxplot(data, "MntSweetProducts")

In [None]:


#     The distribution for the amount spent on sweet products is right-skewed
#     There is one observation to the right extreme which can be considered as an outlier.




In [None]:


data[data["MntSweetProducts"] > 200]



In [None]:
# Capping values for amount spent on sweet products at 198
data["MntSweetProducts"].clip(upper=198, inplace=True)

In [None]:
# MntGoldProds observation
histogram_boxplot(data, "MntGoldProds")

In [None]:
#     The distribution for the amount spent on gold products is right-skewed
#     There are some outliers in the amount spent on gold products. We will not remove all such data points as they represent real market trends but we can cap some of the extreme values.

data[data["MntGoldProds"] > 250]

In [None]:
data["MntGoldProds"].clip(upper=250, inplace=True)



In [None]:
 ## MntWines observation
histogram_boxplot(data, "MntWines")

In [None]:

#     The distribution for the amount spent on wines is highly skewed to the right
#     As the median of the distribution is less than 200, more than 50% of customers have spent less than 200 on wines.



In [None]:
# NumWebPurchases observation
histogram_boxplot(data, "NumWebPurchases")

In [None]:


#     The median of the distribution is 4 i.e. 50% of customers have 4 or less than 4 web purchases.
#     We can see that there are some extreme observations in the variable. We can cap these values to the next highest number of purchases.


data[data
     ["NumWebPurchases"] > 15]




In [None]:
data["NumWebPurchases"].clip(upper=11, inplace=True)

In [None]:
#  NumWebVisitsMonth observation
histogram_boxplot(data, "NumWebVisitsMonth")

In [None]:
# The distribution for the number of visits in a month is skewed and has some outliers at the right end.
data[data["NumWebVisitsMonth"] > 10]

In [None]:
# NumCatalogPurchases observation
histogram_boxplot(data, "NumCatalogPurchases")

In [None]:
#     The most number of observations are for 0 catalog purchases.
#     The median of the distribution is 2 i.e. 50% of customers have 2 or less than 2 catalog purchases.
#     We can see that there is two extreme observation in the variable. We can cap these values to the next highest number of purchases.
data[data["NumCatalogPurchases"] > 15]

In [None]:
# Capping values for number of catalog purchases at 11
data["NumCatalogPurchases"].clip(upper=11, inplace=True)


In [None]:
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """

    total = len(data[feature])  # length of the column
    count = data[feature].nunique()
    if n is None:
        plt.figure(figsize=(count + 2, 6))
    else:
        plt.figure(figsize=(n + 2, 6))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        order=data[feature].value_counts().index[:n],
    )

    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category

        x = p.get_x() + p.get_width() / 2  # width of the plot
        y = p.get_height()  # height of the plot

        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the percentage

    plt.show()  # 

In [None]:
# bar chart for complains
labeled_barplot(data, "Complain", perc=True)

In [None]:
#  99% of customers had no complaint in the last 2 years. 
# This might be because the company provides good services or might be due to the lack of feedback options for customers.

In [None]:
# barchart having teenagers at customer home
labeled_barplot(data, "Teenhome", perc=True)

In [None]:
#Majority of the customers i.e. ~52% customers have no teen at home
#There are very few customers, only ~2%, with a number of teens greater than 1

In [None]:
# observation of pervious responsones
labeled_barplot(data, "Response", perc=True)

In [None]:
# 85% customer's response was NO in the last campaign.
# This shows that the distribution of classes in the target variable is imbalanced. We have only ~15% observations where response is YES.



## Bivariate Analysis


In [None]:
cols = data[
    [
        "MntWines",
        "MntGoldProds",
        "MntMeatProducts",
        "MntFruits",
        "MntFishProducts",
        "MntSweetProducts",
    ]
].columns.tolist()
plt.figure(figsize=(10, 10))

for i, variable in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    sns.boxplot(df["Response"], df[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()

In [None]:
#Each plot shows that customer spending more on any product is more likely to take the offer. 

## again univariate analysis

In [None]:
# Observations on Customer_age
histogram_boxplot(data, "Age")
# As per the boxplot, there are no outliers in the 'Age' variable

In [None]:
# observations on NumStorePurchases
histogram_boxplot(data, "NumStorePurchases")

In [None]:
# There are no outliers in this variable

In [None]:
# observations on NumDealsPurchases
histogram_boxplot(data, "NumDealsPurchases")

In [None]:
data[data["NumDealsPurchases"] > 14]

In [None]:
# observations on Recency
histogram_boxplot(data, "Recency")
# recency  means the quality or state of being recent.

In [None]:
# There are no outliers in the 'Recency' variable

In [None]:
# observations on MntFruits
histogram_boxplot(data, "MntFruits")

In [None]:
# There are some outliers on the right end of the boxplot 
data[data["MntFruits"] > 190]

In [None]:
# observations on MntFishProducts
histogram_boxplot(data, "MntFishProducts")

In [None]:
# observations on Income
histogram_boxplot(data, "Income")



In [None]:
# We can see there are some outliers in the income variable.
data[data["Income"] > 200000]

In [None]:
# observations on Registration week
labeled_barplot(data, "Reg_week", perc = True)

In [None]:
# This shows that the number of registrations declines at the end of the month i.e. in the last two weeks.

In [None]:
# observations on Marital_Status
labeled_barplot(data, "Marital_Status",perc = True)

In [None]:
# Majority of the customers are married comprising approx ~64% of total customers.



In [None]:
# observations on Education
labeled_barplot(data, "Education",perc = True)

In [None]:
# Education of approx ~50% of customers is at graduation level.
# Very few observations i.e. ~2% for customers with basic level education

In [None]:
# observations on Kidhome
labeled_barplot(data, "Kidhome",perc = True)

In [None]:
# ~40% of customers have 1 kid and ~58% of customers have no kids at home
# There are very few customers, approx 2%, with a number of kids greater than 1 

In [None]:
# observations on Registration year
labeled_barplot(data, "Reg_year",perc = True)

In [None]:
# The number of customers registered is highest in the year 2013.

In [None]:
# observations on Registration quarter
labeled_barplot(data, "Reg_quarter",perc = True)

In [None]:
# There is no significant difference in the number of registrations for each quarter.

In [None]:
# observations on Registration month
labeled_barplot(data, "Reg_month",perc = True)



## Bivariate Analysis

In [None]:
# correlation check
plt.figure(figsize=(18, 10))
sns.heatmap(data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()

In [None]:
# age and year of birth have a high negative correlation. We can drop one of them.
# Registration month and quarter columns are highly correlated which can be expected as we extracted these columns from the same column.
# We can drop one of the columns in a quarter or month as they are almost perfectly correlated.
# Total amount spent is correlated with variables they are associated with and hence we can drop this column.
# Number of purchases is positively correlated with income which can be expected as customers with higher income might spend more than customers with lower income.



In [None]:
sns.boxplot(y="Total_Amount_Spent", x="Education", data=data, orient="vertical")

In [None]:
# the amount spent increases with the increase in education level.
# Customers with graduate-level education spend slightly more than customers with master-level education. 

In [None]:
sns.boxplot(y="Total_Amount_Spent", x="Marital_Status", data=data, orient="vertical")

In [None]:
# We can see that the total amount spent is higher for widowed customers.
# No significant difference in the amount spent by single, married or divorced customers.

In [None]:
cols = data[["Recency", "Age", "Income", "Total_Amount_Spent"]].columns.tolist()
plt.figure(figsize=(10, 10))

for i, variable in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    sns.boxplot(data["Response"], data[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()



In [None]:
#Customers with lower recency i.e. less number of days since the last purchase, are more likely to take the offer.
#Response does not depend much on age.
#Customers with higher income are more likely to take the offer.
#Customers who spent more in the last 2 years are more likely to take the offer.



In [None]:
# function to plot stacked bar chart


def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    plt.legend(
        loc="lower left", frameon=False,
    )
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()

In [None]:


stacked_barplot(data, "Education", "Response")



In [None]:
#  customers with higher education are more likely to take the offer. 

In [None]:
 stacked_barplot(data, "Marital_Status", "Response")

In [None]:
# divorced/widow customers are more likely to take the offer.
# Single customers are more likely to take the offer than married customers.

In [None]:
stacked_barplot(data, "Kidhome", "Response")

In [None]:
# Customers with no teens at home are most likely to take the offer.
# Customers with two teens are more likely to take the offer than customers with 1 teenager.

In [None]:
stacked_barplot(data, "Reg_year", "Response")



In [None]:
# Number of customers taking the offer is decreasing each subsequent year.

In [None]:
stacked_barplot(data, "Reg_month", "Response")

In [None]:
plt.figure(figsize=(15, 5))
sns.regplot(y=data.Total_Amount_Spent, x=data.Income)



In [None]:
#We can see that income and the total amount spent have a positive correlation.
#The total amount spent is not much different for customers with income in the range of 20K to 60K but the difference is significant for customers in the range of 60K to 100K.




## Data Preprocessing

In [None]:
data1 = data.copy()
data1

In [None]:
# Dropping birth year and Dt_Customer columns
data1.drop(
    columns=[
        
        "Dt_Customer"
    ],
    inplace=True,
)

In [None]:
# Dropping birth year and Dt_Customer columns
data1.drop(
    columns=[
        
        "Year_Birth"
    ],
    inplace=True,
)

In [None]:
# Dropping birth year and Dt_Customer columns
data1.drop(
    columns=[
        
        "Reg_quarter"
    ],
    inplace=True,
)

In [None]:
# Dropping birth year and Dt_Customer columns
data1.drop(
    columns=[
        
        "Total_Amount_Spent"
    ],
    inplace=True,
)

In [None]:
data1

In [None]:


# Separating target variable and other variables
X = data1.drop(columns="Response")
X = pd.get_dummies(X)

Y = data1["Response"]



In [None]:
from sklearn.model_selection import train_test_split

# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test

X_temp, X_test, y_temp, y_test = train_test_split(X, Y, test_size=0.2, random_state=1, stratify=Y)

# then we split the temporary set into train and validation

X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp)


print(X_train.shape, X_val.shape, X_test.shape)



## Missing-Value Treatment

In [None]:
# To be used for missing value imputation
from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy="median")



In [None]:


# Fit and transform the train data
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

# Transform the validation data
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_train.columns)

# Transform the test data
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_train.columns)



In [108]:


# Checking that no column has missing values in train or test sets
print(X_train.isna().sum())
print("-" * 30)
print(X_val.isna().sum())
print("-" * 30)
print(X_test.isna().sum())



Income                     0
Kidhome                    0
Teenhome                   0
Recency                    0
MntWines                   0
MntFruits                  0
MntMeatProducts            0
MntFishProducts            0
MntSweetProducts           0
MntGoldProds               0
NumDealsPurchases          0
NumWebPurchases            0
NumCatalogPurchases        0
NumStorePurchases          0
NumWebVisitsMonth          0
Complain                   0
Age                        0
Reg_year                   0
Reg_month                  0
Reg_week                   0
Education_Basic            0
Education_Graduation       0
Education_Master           0
Education_PhD              0
Marital_Status_Divorced    0
Marital_Status_Married     0
Marital_Status_Single      0
Marital_Status_Widow       0
dtype: int64
------------------------------
Income                     0
Kidhome                    0
Teenhome                   0
Recency                    0
MntWines                   0

## model Building

In [109]:
# To get different metric scores, and split data
from sklearn import metrics

scorer = metrics.make_scorer(metrics.recall_score)



In [110]:

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
)
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [111]:

# defining model
model = AdaBoostClassifier(random_state=1)

# Parameter grid to pass in GridSearchCV

param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1)

# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)

print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)

Best Parameters:{'base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1), 'learning_rate': 1, 'n_estimators': 30} 
Score: 0.395


In [112]:


# building model with best parameters
adb_tuned1 = AdaBoostClassifier(
    n_estimators=30,
    learning_rate=1,
    random_state=1,
    base_estimator=DecisionTreeClassifier(max_depth=2, random_state=1),
)

# Fit the model on training data
adb_tuned1.fit(X_train, y_train)



AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
                                                         random_state=1),
                   learning_rate=1, n_estimators=30, random_state=1)

In [113]:
# Calculating different metrics on train set
Adaboost_grid_train = model_performance_classification_sklearn(
    adb_tuned1, X_train, y_train
)
print("Training performance:")
Adaboost_grid_train

NameError: name 'model_performance_classification_sklearn' is not defined

In [None]:
# Calculating different metrics on train set
Adaboost_grid_train = model_performance_classification_sklearn(
    adb_tuned1, X_train, y_train
)
print("Training performance:")
Adaboost_grid_train

In [None]:


# Calculating different metrics on validation set
Adaboost_grid_val = model_performance_classification_sklearn(adb_tuned1, X_val, y_val)
print("Validation performance:")
Adaboost_grid_val



In [None]:


# creating confusion matrix
confusion_matrix_sklearn(adb_tuned1, X_val, y_val)

