In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'ecommerce-customer-churn-analysis-and-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1119908%2F1880629%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240815%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240815T083515Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D70dc383183804c1d81aaf05b960928a0509f7c868f8bec0b4156a7f9118380a7506aa51006b5eb1fe18eacb1eb3577c8ef31bca9de713a991f27d88e6e0bd57c2cb49184599d1f6753ed4298c5cc62eeee9ea40e6102e78773adfdf9f533ca3e7ca63c329be908a8e9ef913460d66e9397a8f1d69359c35d303bf74f6ca4601449f39daf731c18b86a9a845cc0e004122ec168459d98ef5828524bb5ec43bff0e48a495e9ec50f1c99dd863649fa331743bf15e5bd7895421025f0b8e7ee2c128979f290ad64683f5c0d4c3fd50e6b2fb5f9f8227e49eebe66afec12df07142726a4ced7399e5eedc09e08c2306404c84f19f078e447f40b3e2803d9f26ba8a6'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory>
# and unpack it (zip or tar). Failures are reported and the loop moves on to
# the next source.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded download URL>"; split only once
    # so the unpacking cannot break on an extra ':' in the URL part.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent; in that case skip the
            # progress bar instead of failing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure everything written so far is on disk and the handle is
            # rewound: the tar branch re-opens the file by name.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the tarfile module is
                # not shadowed.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


<h1 style='background:#F5F5DC;border:0; color:black;
    box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);
    transform: rotateX(10deg);
    '><center>E-Commerce Customer Churn</center></h1>

![sklep-internetowy.jpg](attachment:0582b2b3-10ec-4089-be18-e7f9aad875e0.jpg)

<div class="table-of-contents" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1>Table Of Contents</h1>
  <ol>
    <li><a href="#Introduction" style="color: blue;">Introduction</a></li>
    <li><a href="#Features" style="color: blue;">Features</a></li>
      <li><a href="#Goal" style="color: blue;">Goal</a></li>
    <li><a href="#Plan" style="color: blue;">Plan</a></li>
    <li><a href="#Importing" style="color: blue;">Importing Important Libraries</a></li>
    <li><a href="#Data_Overview" style="color: blue;">Data Overview</a></li>
    <li><a href="#EDA" style="color: blue;">EDA (Exploratory Data Analysis)</a></li>
    <li><a href="#Data_Preprocessing" style="color: blue;">Data Preprocessing</a></li>
    <li><a href="#Modeling" style="color: blue;">Modeling</a></li>
    <li><a href="#Evaluation" style="color: blue;">Evaluation</a></li>
    <li><a href="#Auto_ML" style="color: blue;">Auto ML</a></li>
    <li><a href="#Recommendations_&_Conclustion" style="color: blue;">Recommendations & Conclusion</a></li>
  </ol>
</div>


<a id="Introduction"></a>
<div class="Introduction" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1><center>Introduction</center></h1>
  <ul>
  <li>
    This is a dataset from a leading e-commerce company. We analyze which customers churn (leave the company's service) and build a model that predicts churn.
   </li>
  </ul>
  </div>

<a id="Features"></a>
<div class="Features" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1><center>Features</center></h1>
  <ul>
    <li > CustomerID: Unique customer ID</li>
    <li>Churn: Churn Flag</li>
    <li>Tenure: Tenure of customer in organization</li>
    <li>PreferredLoginDevice: Preferred login device of customer</li>
    <li>CityTier: City tier</li>
    <li>WarehouseToHome: Distance in between warehouse to home of customer</li>
    <li>PreferredPaymentMode: Preferred payment method of customer</li>
    <li>Gender: Gender of customer</li>
    <li>HourSpendOnApp: Number of hours spend on mobile application or website</li>
    <li>NumberOfDeviceRegistered: Total number of devices registered for a particular customer</li>
    <li>PreferedOrderCat: Preferred order category of customer in last month</li>
    <li>SatisfactionScore: Satisfactory score of customer on service</li>
    <li>MaritalStatus: Marital status of customer</li>
    <li>NumberOfAddress: Total number of addresses added for a particular customer</li>
    <li>Complain: Any complaint has been raised in last month</li>
    <li>OrderAmountHikeFromlastYear: Percentage increases in order from last year</li>
    <li>CouponUsed: Total number of coupon has been used in last month</li>
    <li>OrderCount: Total number of orders placed in last month</li>
    <li>DaySinceLastOrder: Day Since last order by customer</li>
    <li>CashbackAmount: Average cashback in last month</li>        

  </ul>
</div>

<a id="Goal"></a>
<div class="Goal" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1><center>Goal</center></h1>
  <ul>
    <li > Build a predictive model that can accurately identify customers who are at risk of leaving the company (churn) based on the provided variables. This can help the company take proactive steps to retain these customers and reduce the rate of churn.</li>
    <br>
    <li>Perform a thorough exploratory analysis of the provided customer data to gain insights into the behavior and characteristics of the customers. This includes analyzing patterns and trends in variables. This analysis can help the company understand its customers better and inform future decision-making.</li>      

  </ul>
</div>

<a id="Plan"></a>
<div class="Plan" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1><center>Plan</center></h1>

<div class="Datasets Overview" style="background-color:#DC143C; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h3 style="color: white;" >Datasets Overview</h3>

    
  <ul>
    <li style="color: white;" > Review the provided customer data to familiarize yourself with the variables and their structure.</li>
    <li style="color: white;">Check the data quality, missing values, and potential errors.</li>
    <li style="color: white;">Determine if any data pre-processing is necessary.</li>
    </ul>
</div>
<div class="Exploratory Data Analysis" style="background-color:#DC143C; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h3 style="color: white;" >Exploratory Data Analysis</h3>
  
  <ul>
    <li style="color: white;">Analyze the distribution of the variables to identify any outliers or anomalies.</li>
    <li style="color: white;">Investigate the relationship between variables to identify any correlations or patterns.</li>
    <li style="color: white;">Visualize the data to gain insights into the behavior and characteristics of the customers.</li>
   </ul>
</div>
<div class="Pre-Processing" style="background-color:#DC143C; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h3 style="color: white;">Pre-Processing</h3>
  
  <ul>
    <li style="color: white;">Clean the data by handling missing values, converting variables to appropriate data types, and addressing any data quality issues.</li>
    <li style="color: white;">Select the most important variables for building the predictive model.</li>
   </ul>  
</div>
<div class="Machine Learning" style="background-color:#DC143C; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h3 style="color: white;">Machine Learning</h3>
  
  <ul>
    <li style="color: white;">Build a predictive model that can identify customers who are at risk of leaving the company.</li>
   </ul>  
   </div>

</div>

<a id="Importing"></a>
<h1 style='background:#F5F5DC;border:0; color:black;
    box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);
    transform: rotateX(10deg);
    '><center>Importing Important Libraries</center></h1>  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.svm import SVC

# Additional imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score , confusion_matrix , classification_report
from sklearn.model_selection import GridSearchCV, cross_validate

import warnings
warnings.simplefilter(action='ignore')

<a id="Data_Overview"></a>
<h1 style='background:#F5F5DC;border:0; color:black;
    box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);
    transform: rotateX(10deg);
    '><center>Data Overview</center></h1>

In [None]:
# Step 1: Data Loading and Understanding

# Read the 'E Comm' sheet of the Kaggle workbook and preview the first rows.
df = pd.read_excel(
    io='/kaggle/input/ecommerce-customer-churn-analysis-and-prediction/E Commerce Dataset.xlsx',
    sheet_name='E Comm',
)
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
# colums to list
columns = df.columns.to_list()
columns

In [None]:
df.select_dtypes(exclude=np.number).columns

In [None]:
df.describe(include='O').style.background_gradient(axis=None , cmap = "Blues" , vmin = 0 , vmax = 9000  )


In [None]:
# Print the distinct values and their frequencies for every object-dtype column.
for name, series in df.items():
    if series.dtype != object:
        continue
    print(f"{name} : {series.unique()}")
    print(series.value_counts())
    print("________________________________________________________________________________")

In [None]:

df.select_dtypes(include=np.number).columns

In [None]:
df.describe().T.style.bar(subset=['mean']).background_gradient(subset=['std','50%','max'])

In [None]:
# Print the distinct values and their frequencies for every numeric column.
# select_dtypes(include=np.number) matches every integer/float width, whereas
# comparing dtype against the builtin int/float only matches the default
# widths; this also agrees with the numeric-column listing cell above.
for col in df.select_dtypes(include=np.number).columns:
    print(str(col) + ' : ' + str(df[col].unique()))
    print(df[col].value_counts())
    print("________________________________________________________________________________")

In [None]:
# 'Phone' (login device) and 'Mobile' (order category) are alternative
# spellings of 'Mobile Phone', so fold each into that single label.
df['PreferredLoginDevice'] = df['PreferredLoginDevice'].replace('Phone', 'Mobile Phone')
df['PreferedOrderCat'] = df['PreferedOrderCat'].replace('Mobile', 'Mobile Phone')

In [None]:
df['PreferredLoginDevice'].value_counts()

In [None]:
# 'COD' and 'CC' are abbreviations of labels that already exist in the
# column; map them onto the spelled-out labels.
payment_aliases = {'COD': 'Cash on Delivery', 'CC': 'Credit Card'}
df['PreferredPaymentMode'] = df['PreferredPaymentMode'].replace(payment_aliases)

In [None]:
df['PreferredPaymentMode'].value_counts()

In [None]:
# Build df2: a copy of df in which every integer column except the
# CustomerID key is converted to strings so it is treated as categorical.
# is_integer_dtype matches every integer width; comparing the dtype with
# 'int' only matches the platform's default integer.
df2 = df.copy()
for col in df2.columns:
    if col == 'CustomerID':
        continue
    if pd.api.types.is_integer_dtype(df2[col]):
        df2[col] = df2[col].astype(str)

df2.dtypes

In [None]:
# Categorical cols after Converting
df2.describe(include='O').style.background_gradient(axis=None , cmap = "Blues" , vmin = 0 , vmax = 9000  )

In [None]:
# Numerical cols after Converting
df2.describe().T.style.bar(subset=['mean']).background_gradient(subset=['std','50%','max'])

In [None]:
df.duplicated().sum()

In [None]:
# Missing-value summary: for each column, its null count and null share (%).
n_rows = df.shape[0]
grouped_data = []
for col in columns:
    null_count = df[col].isnull().sum()
    grouped_data.append([col, null_count, null_count / n_rows * 100])

# Tabulate the summary, then index it by (column, n_missing, percentage).
grouped_df = pd.DataFrame(grouped_data, columns=['column', 'n_missing', 'percentage'])
result = grouped_df.groupby(['column', 'n_missing', 'percentage']).size()
result

In [None]:
# Build an automated profiling report for the whole DataFrame.
# NOTE(review): the pandas_profiling package has been renamed upstream to
# ydata-profiling — confirm which one the target environment provides.
from pandas_profiling import ProfileReport
ProfileReport(df)

<a id="EDA"></a>
<div class="EDA (Exploratory Data Analysis)" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1><center>EDA (Exploratory Data Analysis)</center></h1>


<div style="background-color:#DC143C; color:black; padding: 5px; margin: 5px; font-size: 110%; border-radius: 15px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);"><h3 style="color: white;"><center>Business Questions</center></h3>
  </div>

<div style="background-color:#DC143C; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">  
  <ol>
    <li style="color: white;"> Is there a relationship between Gender and Churn? & Which Gender has more Orders?</li>
    <li style="color: white;">Which MaritalStatus has the highest Churn rate?</li>
    <li style="color: white;">Which CityTier has higher Tenure and OrderCount?</li>
    <li style="color: white;">Do customers with a high SatisfactionScore have high HourSpendOnApp?<br>Is there a correlation between SatisfactionScore and HourSpendOnApp?</li>
    <li style="color: white;">Which CityTier has the most HourSpendOnApp?</li>
    <li style="color: white;">What is the relation between NumberOfAddress and CityTier within the churn segment?</li>
    <li style="color: white;">What is the relation between Complain and DaySinceLastOrder?</li>
    <li style="color: white;">Is there a relationship between PreferredLoginDevice and Churn?</li>
    <li style="color: white;">What is the distance between warehouse to customer house in different city tier?</li>
    <li style="color: white;">Do different CityTiers have different preferred products?</li>
    <li style="color: white;">What is the preferred payment mode for different CityTiers?</li>
    <li style="color: white;">Which CityTier has the highest OrderCount?</li>
    <li style="color: white;">Does the percentage increase in order amount from last year affect churn rate?</li>
    <li style="color: white;">What is the relation between Complain and DaySinceLastOrder for churned customers?</li>
    <li style="color: white;">What is ordercount for customers with high HourSpendOnApp?</li>
    <li style="color: white;">Is there a relationship between preferred order category and churn rate?</li>
    <li style="color: white;">Do customers who used more coupons have lower churn rates?</li>
    <li style="color: white;">Is there a connection between satisfaction score and number of orders in the past month?</li>
    <li style="color: white;">Is there a relation between CashbackAmount and OrderCount within the churn segment?</li>
    <li style="color: white;">Are customers who complained more likely to churn?</li>       
  </ol>
</div>

</div>

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Column groupings used by the plotting cells below.
binary_cat_cols = ['Complain']
outcome = ['Churn']
# Columns plotted as categorical distributions (histograms by churn).
cat_cols = ['PreferredLoginDevice', 'CityTier', 'PreferredPaymentMode',
       'Gender', 'NumberOfDeviceRegistered', 'PreferedOrderCat',
       'SatisfactionScore', 'MaritalStatus', 'NumberOfAddress', 'Complain']
# Columns plotted as numeric densities (KDE by churn).
num_cols = ['Tenure', 'WarehouseToHome', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'CashbackAmount']


In [None]:
# Split the customers by outcome so every numeric feature can be compared
# between the churned and the retained group.
churned_mask = df['Churn'] == 1
retained_mask = df['Churn'] == 0
df_c = df[churned_mask].copy()
df_nc = df[retained_mask].copy()

# One panel per numeric feature on a 2x4 grid.
fig, ax = plt.subplots(2, 4, figsize=(20, 15))
fig.suptitle('Density of Numeric Features by Churn', fontsize=20)
ax = ax.flatten()

for idx, feature in enumerate(num_cols):
    panel = ax[idx]
    for subset, legend_label in ((df_c, 'Churn'), (df_nc, 'No Churn')):
        sns.kdeplot(subset[feature], linewidth=3, label=legend_label, ax=panel)
    panel.legend(loc='upper right')

plt.show()


<div class="Distributions Insights" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h3><center>Distributions Insights Of the Numeric Features</center></h3>
  <ul>
    <li >Tenure: Customers with longer tenure seem less likely to churn. Makes sense as longer tenure indicates satisfaction.</li>
    <li>CityTier: Churn rate looks similar across tiers. City tier does not seem predictive of churn.</li>
    <li>WarehouseToHome: Shorter warehouse to home distances have a lower churn rate. Faster deliveries may improve satisfaction.</li>
    <li>HourSpendOnApp: More time spent on app correlates with lower churn. App engagement is a good sign.</li>
    <li>NumberOfDeviceRegistered: More registered devices associates with lower churn. Access across devices improves convenience.</li>
    <li>SatisfactionScore: Higher satisfaction scores strongly associate with lower churn, as expected. Critical driver.</li>
    <li>NumberOfAddress: Slight downward trend in churn as number of addresses increases. More addresses indicates loyalty.</li>
    <li>Complain: More complaints associate with higher churn, though relationship isn't very strong. Complaints hurt satisfaction.</li>
    <li>OrderAmountHikeFromLastYear: Big spenders from last year are less likely to churn. Good to retain big customers.</li>
    <li>CouponUsed: Coupon usage correlates with lower churn. Coupons enhance loyalty.</li>
    <li>OrderCount: Higher order counts associate with lower churn. Frequent usage builds habits.</li>
    <li>DaySinceLastOrder: Longer since last order correlates with higher churn. Recency is a good predictor.</li>
 </ul>
 </div>

In [None]:
# Re-split on df2, where the integer-coded columns (Churn included) are
# strings, hence the comparison against '1' / '0'.
df_c = df2[df2['Churn']=='1'].copy()
df_nc = df2[df2['Churn']=='0'].copy()

# One panel per categorical feature on a 4x3 grid.
fig, ax = plt.subplots(4,3,figsize=(20, 18))
fig.suptitle('Distribution of Categorical Features by Churn', fontsize=20)
ax = ax.flatten()

for idx,c in enumerate(cat_cols):
    sns.histplot(df_c[c], linewidth= 3,
             label = 'Churn',ax=ax[idx])
    sns.histplot(df_nc[c], linewidth= 3,
             label = 'No Churn',ax=ax[idx])

    ax[idx].legend(loc='upper right')

# The grid has more panels than features; hide the panels left unused.
for unused_ax in ax[len(cat_cols):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# color palettes
pie_palette = ['#3E885B','#7694B6','#85BDA6', '#80AEBD', '#2F4B26', '#3A506B']
green_palette = ['#2F4B26', '#3E885B', '#85BDA6', '#BEDCFE', '#C0D7BB']
blue_palette = ['#3A506B', '#7694B6', '#80AEBD', '#5BC0BE', '#3E92CC']
custom_palette = ['#3A506B', '#7694B6', '#80AEBD', '#3E885B', '#85BDA6']
red_palette = ['#410B13', '#CD5D67', '#BA1F33', '#421820', '#91171F']

### 1-Is there a relationship between Gender and Churn? & Which Gender has more Orders?

In [None]:
df['Gender'].value_counts()

In [None]:
df.groupby("Churn")["Gender"].value_counts() # the churned females ratio 348/2246 * 100
                                              # the churned males ratio 600/3384 * 100

In [None]:
df.groupby("PreferredLoginDevice")["OrderCount"].value_counts() # frequency of each OrderCount value per preferred login device


In [None]:
gender_orders = df.groupby('Gender')['OrderCount'].mean().plot(kind='bar')

gender_orders  # females have more order count avg

There is not a big difference between males and females in average order count.

In [None]:
# Male churn rate in percent: 600 churned males out of 3384 males, taken from
# the Churn-by-Gender counts shown above (hard-coded, not recomputed from df).
percentageM =600/3384 * 100

percentageM   #the percentage of the leaving males out of the males

In [None]:
# Female churn rate in percent: 348 churned females out of 2246 females, taken
# from the Churn-by-Gender counts shown above (hard-coded, not recomputed from df).
percentageF =348/2246 * 100

percentageF  #the percentage of the leaving females out of the females


In [None]:
import pandas as pd
import plotly.express as px

# Pie of churned customers by gender: values='Churn' adds up the 0/1 flag per
# gender, so each slice is that gender's share of all churned customers (not
# the churn rate within the gender).
fig = px.pie(df, values='Churn', names='Gender')
# Use valid CSS colour names; 'baby blue' is not one.
fig.update_traces(marker=dict(colors=['pink', 'lightblue']))

# Update layout
fig.update_layout(
  title='Share of Churned Customers by Gender',
  legend_title='Gender'
)

# Show plot
fig.show()

# # Create figure
# fig = px.pie(df, values='OrderCount', names='Gender')
# fig.update_traces(marker=dict(colors=['pink ', 'baby blue']))

# # Update layout
# fig.update_layout(
#   title='order Rate by Gender',
#   legend_title='Gender'
# )

# # Show plot
# fig.show()

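As we see, 63.3% of the churned customers are male. Note that this is the share of churners, not the churn rate: males are also the majority of customers, and the per-gender churn rates computed above are about 17.7% for males and 15.5% for females, so males are only slightly more likely to churn. The company may still consider increasing the products that grab male customers' interest. We are going to see whether there are other factors that make males the largest segment of churned customers.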

### 2-Which MaritalStatus has the highest Churn rate?

In [None]:
df.groupby("Churn")["MaritalStatus"].value_counts()

In [None]:
# Customer counts per marital status, split by churn flag. A countplot shows
# counts, so the title and y-axis are labelled as counts rather than rates.
sns.countplot(x='MaritalStatus',hue='Churn',data=df,palette='Set2')
plt.title("Customer Counts by MaritalStatus and Churn")
plt.ylabel("Count")

-Married customers are the largest customer segment in the company, while single customers are the most likely to churn from the app. The company should consider taking care of the products that suit both single and married customers.

### 3-Which CityTier has higher Tenure and OrderCount?


In [None]:
df_grouped_tenure = df.groupby('CityTier')['Tenure'].agg(['mean', 'max'])
df_grouped_tenure

In [None]:
df_grouped_OrderCount = df.groupby('CityTier')['OrderCount'].agg(['mean', 'max'])
df_grouped_OrderCount

In [None]:
# means = df_grouped['Tenure']['mean']
# means.plot(kind='pie',autopct='%1.1f%%')
# plt.xlabel('CityTier')
# plt.ylabel('Mean Tenure')


CityTier 2 has the highest tenure, but tenure by city tier does not seem to be a strong factor.

In [None]:
df.groupby("CityTier")["OrderCount"].mean()

CityTier 3 has the highest average order count, but it does not seem to be a strong factor in customer churn.

### 4-Is Customer with High SatisfactionScore have high HourSpendOnApp?
### Is there a correlation between SatisfactionScore and HourSpendOnApp?

In [None]:
df['SatisfactionScore'].dtypes

In [None]:
import matplotlib.pyplot as plt

# Horizontal bars of HourSpendOnApp aggregated per SatisfactionScore,
# coloured by churn flag.
fig = px.histogram(
    df2,
    x="HourSpendOnApp",
    y="SatisfactionScore",
    orientation="h",
    color="Churn",
    text_auto=True,
    title="<b>" + 'HourSpendOnApp Vs SatisfactionScore',
    color_discrete_sequence=['#BA1F33', '#3A506B', '#3E885B'],
)

# All layout tweaks in one call: hover behaviour, centred title, axis titles.
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='HourSpendOnApp',
    yaxis_title='SatisfactionScore',
)
fig.show()



# sns.barplot(x='SatisfactionScore',y='HourSpendOnApp',data=df)
# ax = df[['SatisfactionScore','HourSpendOnApp']].value_counts().plot(kind='bar')


As we see, people with a lower satisfaction score spend less time on the app than people with a satisfaction score of 5, but I do not think there is any real relation between the satisfaction score and the time people spend on the app.

### 5-Which CityTier has the most HourSpendOnApp?

In [None]:
# One panel per CityTier showing the distribution of HourSpendOnApp.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its replacement.
g = sns.FacetGrid(df, col='CityTier')
g.map(sns.histplot, 'HourSpendOnApp', kde=True, stat='density')

City tier 1 has the most hours spent on the app.

### 6-What is the relation between NumberOfAddress and CityTier within the churn segment?

In [None]:
df.groupby("CityTier")["NumberOfAddress"].value_counts()

In [None]:
# Violin plots
import seaborn as sns
sns.violinplot(x='CityTier', y='NumberOfAddress', data=df[df['Churn']==1])


There is a negative correlation between CityTier and NumberOfAddress. Higher CityTiers are associated with lower average NumberOfAddress and a more concentrated distribution.
Customers in larger cities (CityTier 1) tend to have more addresses on average compared to smaller cities and towns in lower tiers.
The relationship suggests address density and type of locality (metro vs smaller cities vs towns) impacts how many addresses customers have across city types.

### 7-What is the relation between Complain and DaySinceLastOrder?

In [None]:
# Pearson correlation
df[['DaySinceLastOrder', 'Complain']].corr()

In [None]:
import plotly.express as px

fig = px.scatter(df, x='DaySinceLastOrder', y='Complain', facet_col='Churn')
fig.update_layout(hovermode='closest')
fig.show()

There is a weak negative relation between complaining and the number of days since the last order.




### 8-Is there a relationship between PreferredLoginDevice and churn?



In [None]:
# Stacked bar chart of customer counts per login device, split by churn flag
import seaborn as sns
# sns.catplot(x='PreferredLoginDevice', y='Churn', data=df, kind='bar')

# Count customers per (PreferredLoginDevice, Churn) pair, pivot Churn into
# columns and draw stacked bars. Note that grouped_data ends up holding the
# value returned by .plot(), not the grouped table.
grouped_data = df.groupby(['PreferredLoginDevice', 'Churn']).size().unstack().plot(kind='bar', stacked=True)

# Set the plot title, x-label, and y-label
plt.title('Churn by PreferredLoginDevice ')
plt.xlabel('PreferredLoginDevice')
plt.ylabel('Count')

# Show the plot
plt.show()


Mobile phone users are likely to churn; this may indicate a problem with the user experience in the mobile version of the app.


### 9-What is the distance between warehouse and customer house in different city tiers?



In [None]:
df3 = df.copy()

# astype returns a new Series; assign it back so CityTier really becomes a
# categorical (string) axis instead of the conversion being discarded.
df3['CityTier'] = df3['CityTier'].astype('str')
plt.figure(figsize = (5,7))
# Pin the category order, which would otherwise follow order of appearance.
sns.stripplot(x = 'CityTier', y = 'WarehouseToHome', data = df3, jitter = False,
              order = sorted(df3['CityTier'].unique()))
plt.ylabel(' Distance between warehouse to home');

Inference: The distance from warehouse to home is similar in all city tiers, which suggests the company has built warehouses in the lower city tiers as well.

### 10-Do different CityTiers have different preferred products?


In [None]:
import plotly.express as px
earth_palette = ["#A67C52", "#8F704D", "#B09B71", "#7E786E"]


# Count of preferred order categories, one facet (and colour) per city tier.
fig=px.histogram(df,x="PreferedOrderCat",facet_col="CityTier",color="CityTier",color_discrete_sequence=earth_palette,text_auto= True , title="<b>"+'CityTier Vs PreferedOrderCat')

# Customize the plot
fig.update_layout(hovermode='x',title_font_size=30)
fig.update_layout(
title_font_color="black",
template="plotly",
title_font_size=30,
hoverlabel_font_size=20,
title_x=0.5,
# The x-axis carries PreferedOrderCat, so label it with that column name.
xaxis_title='PreferedOrderCat',
yaxis_title='count',
)
fig.show()

Laptop & Accessory and Mobile Phone are the preferred categories for all city tiers.




### 11- What is the preferred payment mode for different CityTiers?

In [None]:
df2['PreferredPaymentMode'].value_counts()

In [None]:
df2.groupby('CityTier')[['PreferredPaymentMode']].value_counts()

In [None]:
import plotly.express as px

# Count of preferred payment modes, one facet (and colour) per city tier.
fig = px.histogram(
    df2,
    x="PreferredPaymentMode",
    facet_col="CityTier",
    color="CityTier",
    color_discrete_sequence=red_palette,
    text_auto=True,
    title="<b>" + 'CityTier Vs PaymentMethod',
)

# All layout tweaks in one call: hover behaviour, centred title, axis titles.
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='PreferredPaymentMode',
    yaxis_title='count',
)
fig.show()

#### preferred payment method for CityTier '1' ==> DebitCard
#### preferred payment method for CityTier '2' ==> UPI
#### preferred payment method for CityTier '3' ==> E wallet

### 12-Which CityTier has the highest OrderCount?

In [None]:
df2.groupby('CityTier')[['OrderCount']].sum()

In [None]:
# Horizontal bars of OrderCount aggregated per CityTier.
fig = px.histogram(df2, x="OrderCount", y="CityTier", orientation="h", color="CityTier" ,text_auto= True , title="<b>"+'CityTier Vs Sum of OrderCount' , color_discrete_sequence = ['#BA1F33','#3A506B','#3E885B'])

# Customize the plot
fig.update_layout(hovermode='x',title_font_size=30)
fig.update_layout(
title_font_color="black",
template="plotly",
title_font_size=30,
hoverlabel_font_size=20,
title_x=0.5,
xaxis_title='Sum of OrderCount',
# The y-axis carries the CityTier categories, not a count.
yaxis_title='CityTier',
)
fig.show()

### CityTier '1' has highest order count with 10298 orders

### 13-Does the percentage increase in order amount from last year affect churn rate?

In [None]:
df2['OrderAmountHikeFromlastYear'].value_counts()

In [None]:
df2.groupby('OrderAmountHikeFromlastYear')['Churn'].count()

In [None]:
# Number of customers for every (order-amount hike, churn flag) combination.
comp_ten = (
    df2.groupby(["OrderAmountHikeFromlastYear", "Churn"])
    .size()
    .reset_index(name="Count")
)

# Bubble chart: bubble size and height both encode the customer count.
fig_bubble = px.scatter(
    comp_ten,
    x="OrderAmountHikeFromlastYear",
    y="Count",
    size="Count",
    color="Churn",
    title="<b>" + 'OrderAmountHikeFromlastYear VS Churn',
    color_discrete_sequence=["#d62728", "#1f77b4"],
)

# All layout tweaks in one call: hover behaviour, centred title, axis titles.
fig_bubble.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='OrderAmountHikeFromlastYear',
    yaxis_title='count',
)
fig_bubble.show()

#### The graph shows that as the percentage increase in order amount from last year grows, churn decreases, so OrderAmountHikeFromlastYear has a positive (reducing) effect on the churn rate, and we need to focus on customers whose increase is 12% - 14%


### 14-What is the relation between Complain and DaySinceLastOrder for churned customers?

In [None]:
df_c.groupby('Complain')[['DaySinceLastOrder']].sum()

In [None]:
# Histogram of DaySinceLastOrder coloured by the Complain flag, with a box
# plot drawn in the margin.
fig = px.histogram(
    df2,
    x="DaySinceLastOrder",
    color="Complain",
    text_auto=True,
    title="<b>DaySinceLastOrder Vs Complain",
    color_discrete_sequence=['#BA1F33', '#3A506B'],
    marginal="box",
)

# All layout tweaks in one call: hover behaviour, centred title, axis titles.
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='DaySinceLastOrder',
    yaxis_title='count',
)
fig.show()

#### Customers who didn't make a complaint have the higher maximum DaySinceLastOrder; however, it comes from only one customer, so it is an outlier. If we remove it, we will see that customers with no complaint have a lower DaySinceLastOrder


### 15-What is the order counts for customers with high HourSpendOnApp?

In [None]:
# we will make binnig for column HourSpendOnApp
df2['HourSpendOnApp'].agg(['min','max'])

In [None]:
# Define the bin edges and their labels: [0, 1] low, (1, 3] medium, (3, 6] high
bins = [0 , 1 , 3 , 6]
label = ['low' , 'medium' , 'high']
# Create a new column 'HourSpendOnApp_bins' with the binned values.
# include_lowest=True keeps values equal to the first edge (0 hours) in the
# 'low' bin; by default the first interval is open on the left and such rows
# would become NaN.
df2['HourSpendOnApp_bins'] = pd.cut(df2['HourSpendOnApp'], bins=bins , labels = label, include_lowest=True)

In [None]:
df2.groupby(['HourSpendOnApp_bins','OrderCount'])[['CustomerID']].count()

In [None]:
sunbrust_gr = df2.loc[:,['HourSpendOnApp_bins','OrderCount']].dropna()

In [None]:
# Sunburst with the app-time bin as inner ring and OrderCount as outer ring.
fig = px.sunburst(
    sunbrust_gr,
    path=['HourSpendOnApp_bins', 'OrderCount'],
    title="<b>" + 'HourSpendOnApp VS OrderCount',
    template="plotly",
    color_discrete_sequence=["#78b4d5", "#d57f86", '#3E885B'],
)

# All layout tweaks in one call: hover behaviour and centred title.
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
)
# Show each sector's label together with its share of the parent sector.
fig.update_traces(textinfo="label+percent parent")

fig.show()

#### In the segment of customers with high time spent on the app, 67% have an OrderCount of 2.

### 16-Is there a relationship between preferred order category and churn rate?

In [None]:
df2.groupby(['PreferedOrderCat' , 'Gender'])[['CustomerID']].count()

In [None]:
# Count customers for every (PreferedOrderCat, Gender) pair.
# reset_index(name='count') names the count column explicitly, so the frame has
# a 'count' column regardless of how the pandas version names the
# value_counts() result (the previous rename of a 'Gender' column only
# matched on older pandas and then left no 'count' column for the plot).
ordercat_churnrate = (
    df2.groupby('PreferedOrderCat')['Gender']
    .value_counts()
    .reset_index(name='count')
)


# Grouped bars per category, normalised to percentages within each category.
fig = px.histogram(
    ordercat_churnrate,
    x='PreferedOrderCat',
    y='count',
    color='Gender',
    barmode='group',
    color_discrete_sequence=pie_palette,
    title="<b>" + 'Prefered Category Vs Gender',
    barnorm="percent",
    text_auto=True,
)
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='PreferedOrderCat',
    yaxis_title='count',
)
fig.show()

#### Top 2 preferred categories for males ==> [ Others , Mobile Phone ]
#### Top 2 preferred categories for females ==> [ Grocery , Fashion ]

### 17-Do customers who used more coupons have lower churn rates?

In [None]:
df2.groupby(['CouponUsed' , 'Churn'])[['CustomerID']].count()

In [None]:
# Count customers for every (CouponUsed, Churn) pair.
# reset_index(name='count') names the count column explicitly, so the frame has
# a 'count' column regardless of how the pandas version names the
# value_counts() result (the previous rename of a 'Churn' column only matched
# on older pandas and then left no 'count' column for the plot).
coupoun_churnrate = (
    df2.groupby('CouponUsed')['Churn']
    .value_counts()
    .reset_index(name='count')
)


fig = px.bar(
    coupoun_churnrate,
    x='CouponUsed',
    y='count',
    color='Churn',
    barmode='group',
    color_discrete_sequence=['rgba(58, 71, 80, 0.6)', 'rgba(246, 78, 139, 1.0)'],
    title="<b>" + 'CouponUsed Vs Churn Rate',
    text_auto=True,
)
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='CouponUsed',
    yaxis_title='count',
)
fig.show()

#### The graph shows that churn decreases as more coupons are used.

### 18-Is there a connection between satisfaction score and number of orders in the past month?

In [None]:
df2.groupby('SatisfactionScore')[['OrderCount']].count()

In [None]:
# One OrderCount box per SatisfactionScore, with every point drawn.
box_options = dict(
    y="OrderCount",
    x='SatisfactionScore',
    color="SatisfactionScore",
    title="<b>" + 'SatisfactionScore Vs OrderCount',
    boxmode="overlay",
    points='all',
)
fig = px.box(df2, **box_options)

# Apply every layout tweak in one call.
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='SatisfactionScore',
    yaxis_title='OrderCount',
)
fig.show()

#### SatisfactionScore does not have an effect on OrderCount.

### 19-Is there a relation between CashbackAmount and order counts within churn?

In [None]:
df_c.groupby(['OrderCount','CashbackAmount'])[['Churn']].count()

In [None]:
# Histogram over CashbackAmount with OrderCount as the y value, split by Churn.
# (An earlier px.density_contour variant of this chart was dropped in favour of
# this histogram.)
cashback_options = dict(
    x='CashbackAmount',
    y='OrderCount',
    color='Churn',
    title="<b>" + 'CashbackAmount Vs OrderCount within churn',
    color_discrete_sequence=["#d62728", "#1f77b4"],
)
fig = px.histogram(df2, **cashback_options)

# Apply every layout tweak in one call.
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='CashbackAmount',
    yaxis_title='OrderCount',
)
fig.show()

#### The graph shows no relation between cashback amount and order count, and a positive relation between cashback amount and churn rate.

### 20-Are customers who complained more likely to churn?

In [None]:
df2.groupby('Complain')[['Churn']].count()

In [None]:
# Count customers for every (Complain, Churn) pair; reset_index(name='count')
# guarantees the 'count' column the treemap reads, whatever name the pandas
# version gives the value_counts() result.
comp_churn = (
    df2.groupby('Complain')['Churn']
    .value_counts()
    .reset_index(name='count')
)
print(comp_churn)

# Swap the 0/1 flags for readable labels. The result is assigned back instead
# of using inplace=True on a column selection (chained assignment, which newer
# pandas warns about and may not propagate to the frame). Both int and str
# keys are listed so the mapping applies whether the flags are stored as
# numbers or as text.
complain_labels = {0: 'No Complain', 1: 'Complain', '0': 'No Complain', '1': 'Complain'}
churn_labels = {0: 'No Churn', 1: 'Churn', '0': 'No Churn', '1': 'Churn'}
comp_churn['Complain'] = comp_churn['Complain'].replace(complain_labels)
comp_churn['Churn'] = comp_churn['Churn'].replace(churn_labels)
print(comp_churn)

# Tree map: all -> Complain -> Churn, sized by the pair counts
fig = px.treemap(
    comp_churn,
    path=[px.Constant("all"), 'Complain', 'Churn'],
    values='count',
    color_discrete_sequence=["#2F4B26", '#FF0000'],
    title="<b>" + 'Complain Vs Churn',
)
fig.update_traces(textinfo="label+percent parent+value", root_color="lightgrey")

# Customize the plot
fig.update_layout(
    margin=dict(t=70, l=25, r=25, b=25),
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
)
fig.show()

#### Complaining does not appear to affect churn: the graph shows that 68% of the customers who complained did not churn.

<div class="All EDA Insights" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1><center>All EDA Insights</center></h1>
  <ul>
    <li >There is not a big difference between the males and the females: avg order</li>
    <li>Males are more likely to churn: 63.3% of the churned customers are male. The company may want to increase the range of products that appeal to male customers; we will also check whether other factors explain why males are the largest segment of churned customers.</li>
    <li>Married customers are the largest customer segment in the company. The company may want to look after products that suit both single and married customers, as singles are the most likely to churn from the app.</li>
    <li>Citytier 2 has the highest tenure rate but the tenure rate does not seen to be a strong factor</li>
    <li>Citytier 3 has the highest order avg but it not to be a strong factor in the customer churning</li>
    <li>People with lower satisfaction scores spend less time on the app than people with a satisfaction score of 5, but I do not think there is any real relation between satisfaction score and time spent on the app.</li>
    <li>City tier 1 has the most spended hours on the app</li>
    <li>There is a negative correlation between CityTier and NumberOfAddress. Higher CityTiers are associated with lower average NumberOfAddress and a more concentrated distribution, Customers in larger cities (CityTier 1) tend to have more addresses on average compared to smaller cities and towns in lower tiers, The relationship suggests address density and type of locality (metro vs smaller cities vs towns) impacts how many addresses customers have across city types.</li>
    <li>There is a weak negative relation between complainig and the number of days since last order</li>
    <li>Mobile phone users are likely to churn; this may indicate a problem with the user experience of the mobile version of the app.</li>
    <li>Inference: As the distance from warehouse to home is similar in all city tier which means company had build warehouse in lower city tier also.</li>
    <li>laptop & accesories and mobile phones are the prefered category for all the city tiers</li>
    <li> Preferred payment method for CityTier '1' ==> DebitCard <br>
         Preferred payment method for CityTier '2' ==> UPI<br>
         Preferred payment method for CityTier '3' ==> E wallet<br>
         There is big common in debit card method in  tiers</li>
    <li>CityTier '1' has the highest order count, with 10298 orders.<br>
    CityTier '3' has the highest mean order count, which means CityTier '3' has few customers who each place many orders (a 'rich tier').</li>
    <li>As the percentage increase in orders over last year rises, the churn rate falls, so OrderAmountHikeFromlastYear has a favourable effect on churn; we need to focus on customers whose increase is between 12% and 15%.</li>
    <li>Customers who did not complain show the highest DaySinceLastOrder; however, that comes from a single customer, so it is an outlier. If we remove it, customers with no complaint have a lower DaySinceLastOrder.</li>
    <li>Segment of customers has high spendtime on App has OrderCount 2 with percentage 67%</li>
    <li>Top 2 Preferd Category For Males == > [ Others , Mobile Phone ]<br>
   Top 2 Preferd Category For Females == > [ Grocery , Fashion ]</li>
    <li>Churn become less when more coupons used</li>
    <li>StatisfactionScore doesn't have affect on OrderCount</li>
    <li>There is no relation between cashback amount and order count, and there is a positive relation between cashback amount and churn rate.</li>
    <li>Complaining does not appear to affect churn: 68% of the customers who complained did not churn.</li>

  </ul>
</div>

<a id="Data_Preprocessing"></a>
<h1 style='background:#F5F5DC;border:0; color:black;
    box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);
    transform: rotateX(10deg);
    '><center>Data Preprocessing</center></h1>

## Handling Missing Values

In [None]:
round((df.isnull().sum()*100 / df.shape[0]),2)

In [None]:
msno.matrix(df)

In [None]:
msno.bar(df , color="tab:green")

#### Every column has less than 6% missing values, so we can impute them.

In [None]:
sns.kdeplot(df , x='Tenure')

In [None]:
# Impute with back-fill: each missing Tenure takes the next observed value
# further down the column. Series.bfill() replaces fillna(method='bfill'),
# whose `method` argument is deprecated in recent pandas.
df['Tenure'] = df['Tenure'].bfill()

In [None]:
sns.kdeplot(df , x='Tenure')

In [None]:
df['Tenure'].isnull().sum()

-------------------------------

In [None]:
sns.kdeplot(df , x='WarehouseToHome')

In [None]:
# Fill missing WarehouseToHome values with the column's most frequent value.
from sklearn.impute import SimpleImputer
s_imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# The imputer expects 2-D input, so the column is passed as a one-column frame.
warehouse_frame = df['WarehouseToHome'].to_frame()
df['WarehouseToHome'] = s_imp.fit_transform(warehouse_frame)

In [None]:
sns.kdeplot(df , x='WarehouseToHome')

In [None]:
df['WarehouseToHome'].isnull().sum()

---------------------------------------------------

In [None]:
sns.kdeplot(df , x='HourSpendOnApp')

In [None]:
# Random-sample imputation: every missing HourSpendOnApp gets a value drawn
# (with replacement) from the observed values of the same column.
# - Values are written through the boolean mask, so the fill no longer relies
#   on df having a 0..n-1 index (the previous donor Series carried a fresh
#   RangeIndex and fillna aligns on index labels).
# - Only as many values as there are gaps are drawn.
# - The generator is seeded so reruns give the same imputed column.
missing_hours = df['HourSpendOnApp'].isna()
fill_list = df.loc[~missing_hours, 'HourSpendOnApp']
rng = np.random.default_rng(42)
df.loc[missing_hours, 'HourSpendOnApp'] = rng.choice(
    fill_list.to_numpy(), size=int(missing_hours.sum())
)

In [None]:
sns.kdeplot(df , x='HourSpendOnApp')

In [None]:
df['HourSpendOnApp'].isnull().sum()

-------------------------------------------------------

In [None]:
sns.kdeplot(df , x='OrderAmountHikeFromlastYear')

In [None]:
# Impute with forward-fill: each missing value takes the last observed value
# above it. Series.ffill() replaces fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas.
df['OrderAmountHikeFromlastYear'] = df['OrderAmountHikeFromlastYear'].ffill()

In [None]:
sns.kdeplot(df , x='OrderAmountHikeFromlastYear')

In [None]:
df['OrderAmountHikeFromlastYear'].isnull().sum()

--------------------------------------------

In [None]:
sns.kdeplot(df , x='CouponUsed')

In [None]:
# Impute missing CouponUsed values with scikit-learn's KNNImputer (2 neighbours).
# NOTE(review): only the CouponUsed column itself is passed to the imputer, so
# there are no other features to measure neighbour distance on; presumably this
# degenerates to filling with the column mean — confirm, or pass additional
# feature columns if true nearest-neighbour imputation is wanted.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)
df['CouponUsed']=imputer.fit_transform(df[['CouponUsed']])

In [None]:
sns.kdeplot(df , x='CouponUsed')

In [None]:
df['CouponUsed'].isnull().sum()

--------------------------------------------

In [None]:
sns.kdeplot(df , x='OrderCount')

In [None]:
# Impute missing OrderCount values with scikit-learn's KNNImputer (2 neighbours).
# NOTE(review): only the OrderCount column itself is passed to the imputer, so
# there are no other features to measure neighbour distance on; presumably this
# degenerates to filling with the column mean — confirm, or pass additional
# feature columns if true nearest-neighbour imputation is wanted.
imputer_2 = KNNImputer(n_neighbors=2)
df['OrderCount']=imputer_2.fit_transform(df[['OrderCount']])

In [None]:
sns.kdeplot(df , x='OrderCount')

In [None]:
df['OrderCount'].isnull().sum()

----------------------------------

In [None]:
sns.kdeplot(df , x='DaySinceLastOrder')

In [None]:
# Impute with back-fill: each missing DaySinceLastOrder takes the next observed
# value further down the column. Series.bfill() replaces
# fillna(method='bfill'), whose `method` argument is deprecated in recent pandas.
df['DaySinceLastOrder'] = df['DaySinceLastOrder'].bfill()

In [None]:
sns.kdeplot(df , x='DaySinceLastOrder')

In [None]:
df['DaySinceLastOrder'].isnull().sum()

In [None]:
# After we Checked the data the Customer ID Column not important for our Models so We drop it
df.drop('CustomerID' , axis = 1 , inplace = True)

In [None]:
df.shape

#### We handled the missing values.

## Encoding

In [None]:
# Before encoding, print the value counts of every object-dtype column to
# confirm that each one has a limited set of categories.
separator = '*' * 40
for col_name in df.columns:
    if df[col_name].dtype != 'object':
        continue
    print(df[col_name].value_counts())
    print(separator)


In [None]:
# cat columns
data = df[df.select_dtypes(exclude=np.number).columns]
data

In [None]:
le = LabelEncoder()

In [None]:
# Label-encode every object-dtype column of df in place.
# The same encoder instance is re-fitted for each column, so after the loop
# `le` only holds the mapping of the last encoded column.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    df[name] = le.fit_transform(df[name])

df.head(4)

In [None]:
# Label-encode every column of the non-numeric frame `data`, re-fitting the
# shared encoder on each column in turn.
for col_name in list(data.columns):
    encoded = le.fit_transform(data[col_name])
    data[col_name] = encoded

data.head(4)

## Handling Outliers

In [None]:
df.dtypes

In [None]:
# One box plot per column of df, laid out on a 9 x 4 subplot grid.
fig = plt.figure(figsize=(12, 18))
n_columns = df.shape[1]
for position in range(1, n_columns + 1):
    # Subplot positions are 1-based, column positions 0-based.
    fig.add_subplot(9, 4, position)
    sns.boxplot(y=df.iloc[:, position - 1])

plt.tight_layout()
plt.show()

In [None]:
# Trim rows that lie outside the 1.5 * IQR fences of one column
def handle_outliers(df , column_name):
    """Return the rows of *df* whose *column_name* value is not an IQR outlier.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1/Q3 are
    the 25th/75th percentiles of the column and ``IQR = Q3 - Q1``.

    Values lying exactly on a fence are kept: only points beyond the fences
    count as outliers. This also keeps a column whose IQR is 0 (both fences
    collapse onto Q1) from losing every row, which strict comparisons did.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Numeric column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The rows of *df* inside the fences. Rows where the column is NaN are
        dropped, because NaN fails both comparisons.
    """
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Upper and lower fences
    upper = q3 + iqr * 1.5
    lower = q1 - iqr * 1.5

    # Inclusive comparison: a value on the fence is not an outlier
    inside_fences = (values >= lower) & (values <= upper)
    return df[inside_fences]

In [None]:
df.columns

In [None]:
# Columns whose outliers are trimmed with handle_outliers.
cols_outliers = [
    'Tenure',
    'WarehouseToHome',
    'NumberOfAddress',
    'DaySinceLastOrder',
    'HourSpendOnApp',
    'NumberOfDeviceRegistered',
]

# The trims are applied one after another: each pass works on the frame left
# by the previous pass.
for column_to_trim in cols_outliers:
    df = handle_outliers(df, column_to_trim)

df.head(4)

In [None]:
# One box plot per column of df, laid out on a 9 x 4 subplot grid.
fig = plt.figure(figsize=(12, 18))
n_columns = df.shape[1]
for position in range(1, n_columns + 1):
    # Subplot positions are 1-based, column positions 0-based.
    fig.add_subplot(9, 4, position)
    sns.boxplot(y=df.iloc[:, position - 1])

plt.tight_layout()
plt.show()

#### We first trimmed every column that contains outliers, but on checking we saw that too much information was deleted, so we trim only the columns that do not contain many outliers.

In [None]:
corr_matrix = df.corr()
corr_matrix

In [None]:
plt.figure(figsize = (18,15))
sns.heatmap(df.corr() , annot = True , cmap = 'Blues')

In [None]:
churn_corr_vector = corr_matrix['Churn'].sort_values(ascending = False)
churn_corr_vector

In [None]:
plt.figure(figsize = (10,10))
sns.barplot(x = churn_corr_vector , y = churn_corr_vector.index , palette = 'coolwarm')
plt.title('Relation Between Features and target')

In [None]:
# Count of rows per Churn value, to check how balanced the target is.
imbalance_options = dict(
    x="Churn",
    color="Churn",
    text_auto=True,
    title="<b>" + 'Check Imbalance',
    color_discrete_sequence=['#BA1F33', '#3A506B'],
)
fig = px.histogram(df2, **imbalance_options)

# Apply every layout tweak in one call.
fig.update_layout(
    hovermode='x',
    title_font_color="black",
    template="plotly",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
    xaxis_title='Churn',
    yaxis_title='count',
)
fig.show()

#### Our data is imbalanced, so let's resample it using SMOTETomek.

## Handling Imbalanced Data

In [None]:
X = df.drop('Churn' , axis = 1)
Y = df['Churn']

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
smt = SMOTETomek(random_state=42)
x_over , y_over = smt.fit_resample(X , Y)

In [None]:
x_over.shape, y_over.shape

## Split Data

In [None]:
x_train , x_test , y_train , y_test = train_test_split(x_over , y_over , test_size = 0.30 , random_state = 42)

In [None]:
# Normalise all features to a common range with min-max scaling
from sklearn.preprocessing import MinMaxScaler , StandardScaler , RobustScaler

MN = MinMaxScaler()
# Alternatives that were considered: StandardScaler(), RobustScaler()

# Learn the per-feature min/max from the training data only ...
x_train_scaled = MN.fit_transform(x_train)
# ... and apply that same mapping to the test data. Calling fit_transform here
# would refit the scaler on the test set, so train and test would be scaled
# with different parameters and test statistics would leak into preprocessing.
x_test_scaled = MN.transform(x_test)

<a id="Modeling"></a>
<h1 style='background:#F5F5DC;border:0; color:black;
    box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);
    transform: rotateX(10deg);
    '><center>Modeling</center></h1>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Candidate classifiers. random_state is pinned on the tree and ensemble
# estimators so that rerunning the notebook gives the same scores, in line with
# the random_state=42 used for the resampling and the train/test split.
logisreg_clf = LogisticRegression()
svm_clf = SVC()
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
XGB_clf = XGBClassifier(random_state=42)
ada_clf = AdaBoostClassifier(random_state=42)

In [None]:
# Display name paired with each classifier; the two parallel lists used by the
# later cells are derived from these pairs.
named_classifiers = [
    ('Logistic Regression', logisreg_clf),
    ('Support Vector Machine', svm_clf),
    ('Decision Tree', dt_clf),
    ('Random Forest', rf_clf),
    ('XGBClassifier', XGB_clf),
    ('AdaBoostClassifier', ada_clf),
]
clf_list = [estimator for _, estimator in named_classifiers]
clf_name_list = [label for label, _ in named_classifiers]

# Fit every classifier on the scaled training data.
for clf in clf_list:
    clf.fit(x_train_scaled, y_train)

In [None]:
# Accuracy of every fitted classifier on the train and test sets.
train_acc_list = []
test_acc_list = []

for clf, name in zip(clf_list, clf_name_list):
    # Predict once per split and reuse the result: for a classifier,
    # clf.score(X, y) is the accuracy of clf.predict(X), so the "Score" and
    # "Acc" lines report the same number and need no extra predict calls.
    y_pred_train = clf.predict(x_train_scaled)
    y_pred_test = clf.predict(x_test_scaled)
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)

    print(f'Using model: {name}')
    print(f'Trainning Score: {train_acc}')
    print(f'Test Score: {test_acc}')
    print(f'Acc Train: {train_acc}')
    print(f'Acc Test: {test_acc}')
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print(' ' * 60)
    print('*' * 60)
    print(' ' * 60)

In [None]:
# graph to determine best 2 models

all_models = pd.DataFrame({'Train_Accuarcy': train_acc_list , 'Test_Accuarcy' : test_acc_list}  , index = clf_name_list)
all_models

In [None]:
# Models vs Train Accuracies, then Models vs Test Accuracies.
# Each entry is (column of all_models, chart title, x-axis label); the axis
# labels fix the misspelt 'Train Sccracy' / 'Test Accuarcy'.
accuracy_charts = (
    ('Train_Accuarcy', 'Models Vs Train Accuracies', 'Train Accuracy'),
    ('Test_Accuarcy', 'Models Vs Test Accuracies', 'Test Accuracy'),
)

for acc_column, chart_title, axis_label in accuracy_charts:
    fig = px.bar(
        all_models,
        x=all_models[acc_column],
        y=all_models.index,
        color=all_models[acc_column],
        title="<b>" + chart_title,
        text_auto=True,
        color_continuous_scale='Reds',
    )
    fig.update_layout(
        hovermode='x',
        title_font_color="black",
        template="plotly",
        title_font_size=30,
        hoverlabel_font_size=20,
        title_x=0.5,
        xaxis_title=axis_label,
        yaxis_title='Models Names',
    )
    fig.show()

### From the graphs, the best 2 models on both train and test are [ Random Forest , XGBoost ]

In [None]:
!pip install mlxtend

In [None]:
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, RocCurveDisplay

<a id="Evaluation"></a>
<h1 style='background:#F5F5DC;border:0; color:black;
    box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);
    transform: rotateX(10deg);
    '><center>Evaluation</center></h1>

In [None]:
# Logistic regression
model= LogisticRegression()
model.fit(x_train_scaled,y_train)
y_pred = model.predict(x_test_scaled)
# ROC AUC needs a continuous score per sample; feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point. Use the
# predicted probability of the positive class (second column) instead.
y_score = model.predict_proba(x_test_scaled)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc1 = roc_auc_score(y_test, y_score)
print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc1))
print(classification_report(y_test,y_pred,digits=5))
plot_confusion_matrix(confusion_matrix(y_test , y_pred))
print('*' * 70)
RocCurveDisplay.from_estimator(model , x_test_scaled , y_test)

In [None]:
# Support Vector Machine
model=SVC()
model.fit(x_train_scaled,y_train)
y_pred = model.predict(x_test_scaled)
# ROC AUC needs a continuous score per sample; feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point. SVC() is
# built without probability=True, so the signed distance to the decision
# boundary is used as the score.
y_score = model.decision_function(x_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
roc_auc2 = roc_auc_score(y_test, y_score)
print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc2))
print(classification_report(y_test,y_pred,digits=5))
plot_confusion_matrix(confusion_matrix(y_test , y_pred))
RocCurveDisplay.from_estimator(model , x_test_scaled , y_test)

In [None]:
# Decision Tree
model=DecisionTreeClassifier()
model.fit(x_train_scaled,y_train)
y_pred = model.predict(x_test_scaled)
# ROC AUC needs a continuous score per sample; feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point. Use the
# predicted probability of the positive class (second column) instead.
y_score = model.predict_proba(x_test_scaled)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc3 = roc_auc_score(y_test, y_score)
print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc3))
print(classification_report(y_test,y_pred,digits=5))
plot_confusion_matrix(confusion_matrix(y_test , y_pred))
RocCurveDisplay.from_estimator(model , x_test_scaled , y_test)

In [None]:
# random forest
model=RandomForestClassifier()
model.fit(x_train_scaled,y_train)
y_pred = model.predict(x_test_scaled)
# ROC AUC needs a continuous score per sample; feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point. Use the
# predicted probability of the positive class (second column) instead.
y_score = model.predict_proba(x_test_scaled)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc4 = roc_auc_score(y_test, y_score)
print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc4))
print(classification_report(y_test,y_pred,digits=5))
plot_confusion_matrix(confusion_matrix(y_test , y_pred))
RocCurveDisplay.from_estimator(model , x_test_scaled , y_test)

In [None]:
# XGBoost
model=XGBClassifier()
model.fit(x_train_scaled,y_train)
y_pred = model.predict(x_test_scaled)
# ROC AUC needs a continuous score per sample; feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point. Use the
# predicted probability of the positive class (second column) instead.
y_score = model.predict_proba(x_test_scaled)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc5 = roc_auc_score(y_test, y_score)
print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc5))
print(classification_report(y_test,y_pred,digits=5))
plot_confusion_matrix(confusion_matrix(y_test , y_pred))
RocCurveDisplay.from_estimator(model , x_test_scaled , y_test)

In [None]:
# adaboost
model=AdaBoostClassifier()
model.fit(x_train_scaled,y_train)
y_pred = model.predict(x_test_scaled)
# ROC AUC needs a continuous score per sample; feeding it the hard 0/1
# predictions collapses the ROC curve to a single operating point. Use the
# predicted probability of the positive class (second column) instead.
y_score = model.predict_proba(x_test_scaled)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc6 = roc_auc_score(y_test, y_score)
print("Accuracy = {}".format(accuracy))
print("ROC Area under Curve = {}".format(roc_auc6))
print(classification_report(y_test,y_pred,digits=5))
plot_confusion_matrix(confusion_matrix(y_test , y_pred))
RocCurveDisplay.from_estimator(model , x_test_scaled , y_test)

<a id="Auto_ML"></a>
<h1 style='background:#F5F5DC;border:0; color:black;
    box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);
    transform: rotateX(10deg);
    '><center>Auto ML</center></h1>

In [None]:
pip install pycaret

In [None]:
from pycaret.classification import *

In [None]:
# init setup
model_setup = setup(df , target = 'Churn' , train_size=0.7)

In [None]:
# model training and selection
best_model = compare_models()

In [None]:
# evaluate trained model
evaluate_model(best_model)

<a id="Recommendations_&_Conclustion"></a>
<div class="Recommendations & Conclusion" style="background-color:#F5F5DC; color:black; padding: 20px; margin: 10px; font-size: 110%; border-radius: 25px; box-shadow: 10px 10px 5px 0px rgba(0,0,0,0.75);">
  <h1><center>Recommendations & Conclusion</center></h1>
  <ul>
    <li>Since the higher percentage of churned customers are male, the company should consider increasing the range of products that appeal to male customers.</li>
    <li>The company may want to look after products that suit both single and married customers, as single customers are more likely to churn.</li>
    <li>The company should consider offering products that are neither very expensive nor very cheap, so that customers in the medium city tier (tier 2) can trust buying them: as a medium-tier customer, I won't be able to afford expensive products, and I won't trust cheap products because they won't meet my needs.</li>
    <li>The company should consider a feedback channel other than satisfaction scores or complaints, for example a hotline that receives complaints for faster results, or regular phone calls to collect feedback from customers.</li>
    <li>The company should check the mobile version of the store to see if there is any problem with the ui/ux</li>
    <li>Once a customer has reached a 12%-15% order amount increase, the company should focus more on grabbing their attention with the products they like.</li>
    <li>For customers who have just bought electronic goods, cross-selling can be done by offering electronic accessories, such as keyboards, mice, etc.</li>      

  </ul>
</div>