In [None]:
import pandas as pd
import sklearn
import csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import plotly.express as ex
import seaborn as sns
from scipy.stats import chi2
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from plotly.subplots import make_subplots

In [None]:

# Path of the e-commerce workbook inside the Kaggle input directory.
excel_file_path = '/kaggle/input/e-shop/E_commerce_dataset.xlsx'
# Read the workbook into the DataFrame that the rest of the notebook analyses.
df = pd.read_excel(io=excel_file_path)

In [None]:
# Preview the first rows to confirm the file loaded as expected.
df.head()

In [None]:
# Display the DataFrame for a broader look at the records and columns.
df

As shown by the records above, the dataset includes a range of features, from customer demographics to purchasing behavior. Key attributes include:

* Customer Demographics: Gender, City Tier, Marital Status.
* Shopping Preferences: Preferred Login Device, Preferred Payment Mode, Preferred Order Categories.
* Engagement Metrics: Hours Spent on App, Number of Devices Registered, Satisfaction Score.
* Transactional Behavior: Order Amount Hike from Last Year, Number of Addresses, Frequency of Coupon Usage.

In [None]:
# Check for duplicate rows.
# df.duplicated() flags every row that repeats an earlier one, so a sum of 0
# means the dataset contains no duplicate rows.

df.duplicated().sum()

In [None]:
# Review the data type of every column in the dataset.
data_types = df.dtypes
print(data_types)

In [None]:
# The dataset is small, so rows with missing values are kept rather than dropped.
# Missing (NaN) entries are imputed with the column median, which is less
# sensitive to extreme values than the mean.

for colum in df.columns:
    # A median only exists for numeric data; skip other column types instead
    # of raising a TypeError on them.
    if not pd.api.types.is_numeric_dtype(df[colum]):
        continue
    if df[colum].isnull().any():
        # Assign the filled column back to df. Calling fillna(inplace=True) on
        # the df[colum] selection is chained assignment: it emits a
        # FutureWarning in pandas 2.x and does not update df under
        # copy-on-write.
        df[colum] = df[colum].fillna(df[colum].median())

# Number of missing values still present in each column.
df.isnull().sum()

## Data analysis

In [None]:
# Mean tenure per churn group (index: Churn value, values: mean Tenure).
avg_tenure = df.groupby('Churn')['Tenure'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=avg_tenure.index, y=avg_tenure.values, ax=ax)

# Write each bar's value just above the bar.
for index, value in enumerate(avg_tenure):
    ax.text(index, value, f'{value:.2f}', ha='center', va='bottom')

# Titles, labels and plt.show() run once, after the loop. Inside the loop the
# figure would be rendered after the first annotation and the remaining label
# would be drawn on a new, empty figure.
ax.set_title('Average customer tenure by churn')
ax.set_xlabel('Churn')
ax.set_ylabel('Average Tenure')
# NOTE(review): the tick labels assume exactly two Churn groups ordered
# 0 (retained), 1 (churned) - confirm against the data.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Retained', 'Churn'])
plt.show()

# The analysis of customer tenure in relation to churn reveals:

* Customers who were retained (Churn = 0) have an average tenure of approximately 11.40.
* Customers who churned (Churn = 1) have a significantly lower average tenure of around 3.86.
* The bar chart visually illustrates this difference, clearly showing that customers who stay with the service tend to have a longer tenure, while those who leave (churn) typically do so after a shorter duration.


**Is there a difference between the Preferred Login Device and Churn Rate?**

In [None]:
# Row-normalised cross-tabulation: for each preferred login device, the share
# of customers in each Churn class, in percent rounded to two decimals.
device_churn_percentage = (
    pd.crosstab(df['PreferredLoginDevice'], df['Churn'], normalize='index')
    .mul(100)
    .round(2)
)
device_churn_percentage

# These percentages suggest that:


* Customers who prefer using a computer for login have a slightly higher churn rate (19.83%) compared to those who prefer mobile phones (15.62%).
* Mobile users have a higher retention rate.


In [None]:
# Within each city tier, compute the fraction of customers per Churn class
# (each row of the table sums to 1 before scaling).
city_tier_churn_percentage = pd.crosstab(
    index=df['CityTier'],
    columns=df['Churn'],
    normalize='index',
)
# Scale the fractions to percentages and keep two decimals for readability.
city_tier_churn_percentage = city_tier_churn_percentage.mul(100).round(2)
city_tier_churn_percentage

# Churn varies depending on city tier.
 * Highest churn rates relate to tier 2 & 3
 * Tier 1 has the highest retention rate

In [None]:
# Correlation is only computed for numeric columns, so select those first.
# (seaborn is available as `sns` from the notebook's import cell; it does not
# need to be imported again here.)
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

# Draw on an explicit axes so the heatmap and its title share one figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',  # uniform two-decimal annotations keep the cells readable
    cmap='coolwarm',
    linewidths=2,
    linecolor='black',
    ax=ax,
)
ax.set_title('Correlation Matrix for Numeric Columns')
plt.show()

# Correlations between numeric variables 

* Several interesting relationships are displayed. For demonstration purposes, only the higher correlations are reviewed.
* Order count and coupons used display a moderate correlation (r = 0.64), indicating that customers who place more orders also tend to use more coupons.


## Does the preferred payment mode have any correlation with customer churn?


In [None]:
# Share of retained vs. churned customers per preferred payment mode,
# in percent rounded to two decimals (rows sum to 100 before rounding).
payment_mode_churn_percentage = pd.crosstab(df['PreferredPaymentMode'], df['Churn'], normalize='index') * 100
payment_mode_churn_percentage = payment_mode_churn_percentage.round(2)

# Create the figure and axes once and pass the axes to pandas. Calling
# plt.figure() and then DataFrame.plot() without `ax` makes pandas open a
# second figure: the first stays empty and the requested size is ignored.
fig, ax = plt.subplots(figsize=(16, 10))
payment_mode_churn_percentage.plot(kind='bar', color=['yellow', 'salmon'], width=0.8, ax=ax)

# Adding percentage values on top of the bars
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

ax.set_title('Churn Percentage by preferred payment mode', size=18)
ax.set_xlabel('Preferred payment mode', size=14)
ax.set_ylabel('Percentage (%)', size=14)
ax.tick_params(axis='x', labelrotation=45)
# NOTE(review): the legend labels assume the Churn columns are ordered
# 0 (not churned), 1 (churned) - confirm against the data.
ax.legend(title='churn', labels=['Not Churned', 'Churned'], loc='upper left', bbox_to_anchor=(1, 1), fontsize='medium')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

* Cash on Delivery users have the highest churn rate (24.90%), followed by E wallet users (22.80%), and UPI users (17.39%).
* Credit Card and Debit Card users exhibit lower churn rates, at 14.21% and 15.38%, respectively.


# Does gender affect churn rates?

In [None]:
# Share of retained vs. churned customers per gender, in percent rounded to
# two decimals (rows sum to 100 before rounding).
gender_churn_percentage = pd.crosstab(df['Gender'], df['Churn'], normalize='index') * 100
gender_churn_percentage = gender_churn_percentage.round(2)

# Create the figure and axes once and pass the axes to pandas. Calling
# plt.figure() and then DataFrame.plot() without `ax` makes pandas open a
# second figure: the first stays empty and the requested size is ignored.
fig, ax = plt.subplots(figsize=(20, 16))
gender_churn_percentage.plot(kind='bar', color=['blue', 'salmon'], width=0.8, ax=ax)

# Adding percentage values on top of the bars
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}%', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

ax.set_title('Churn Percentage by Gender', size=18)
ax.set_xlabel('Gender', size=14)
ax.set_ylabel('Percentage (%)', size=14)
ax.tick_params(axis='x', labelrotation=0)
# NOTE(review): the legend labels assume the Churn columns are ordered
# 0 (not churned), 1 (churned) - confirm against the data.
ax.legend(title='churn', labels=['Not Churned', 'Churned'], loc='upper left', bbox_to_anchor=(1, 1), fontsize='medium')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

These percentages indicate that there are indeed gender-based patterns in customer churn:

* Male customers have a slightly higher churn rate (17.73%) compared to female customers (15.49%).
* Female customers exhibit a higher retention rate (84.51%) than male customers (82.27%).

In [None]:
# Churn split (in percent, two decimals) for customers with and without a
# recorded complaint; each row of the table is normalised on its own.
complaint_churn_rate = (
    pd.crosstab(df['Complain'], df['Churn'], normalize='index')
    .mul(100)
    .round(2)
)

# Grouped bar chart drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 12))
complaint_churn_rate.plot(kind='bar', stacked=False, color=['teal', 'salmon'], ax=ax)
ax.set_title('Churn Rate by Customers Complaints', size=18)
ax.set_xlabel('Complained', size=14)
ax.set_ylabel('Percentage', size=14)
# NOTE(review): tick and legend labels assume Complain and Churn are both
# coded 0/1 in that order - confirm against the data.
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Complaint', 'complaint'], rotation=0)
ax.legend(title='Churn', labels=['Not Churned', 'Churned'], loc='upper left', bbox_to_anchor=(1, 1))
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

* As expected, customers who submit complaints tend to churn to a higher degree than customers who don't submit a complaint.


In [None]:
# Percentage of customers in each Churn class per marital status
# (row-normalised, two decimals).
marital_status_churn_rate = (
    pd.crosstab(df['MaritalStatus'], df['Churn'], normalize='index')
    .mul(100)
    .round(2)
)

# Grouped bars: one group per marital status, one bar per Churn class.
fig, ax = plt.subplots(figsize=(12, 9))
marital_status_churn_rate.plot(kind='bar', stacked=False, color=['teal', 'salmon'], ax=ax)
ax.set_title('Impact of Marital Status on churn likelihood', size=16)
ax.set_xlabel('marital status')
ax.set_ylabel('churn (%)')
plt.show()

* Clients who are single display the highest rates of churn.


In [None]:
# Churn split for every (Gender, MaritalStatus) combination: each row of the
# table is one combination, normalised to percentages with two decimals.
gender_marital_status_churn = (
    pd.crosstab(
        index=[df['Gender'], df['MaritalStatus']],
        columns=df['Churn'],
        normalize='index',
    )
    .mul(100)
    .round(2)
)
gender_marital_status_churn