# CUSTOMER CHURN

The project aims to analyze customer churn in a
telecommunications company and develop
predictive models to identify at-risk customers. The
ultimate goal is to provide actionable insights and
recommendations to reduce churn and improve
customer retention.

# TASK 1 : Data Preparation

In [1]:
import pandas as pd
import numpy as np
import sklearn
import warnings
warnings.filterwarnings("ignore")

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [4]:
df=pd.read_csv('Telco_Customer_Churn_Dataset.csv')

In [5]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

Finding the Missing values 

In [7]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

Unnamed: 0,Total,Percent
customerID,0,0.0
DeviceProtection,0,0.0
TotalCharges,0,0.0
MonthlyCharges,0,0.0
PaymentMethod,0,0.0
PaperlessBilling,0,0.0
Contract,0,0.0
StreamingMovies,0,0.0
StreamingTV,0,0.0
TechSupport,0,0.0


NO Missing values

In [8]:
df.duplicated().sum()

0

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [10]:
df.drop(columns=['customerID'],inplace=True)

In [11]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [12]:
x=df.drop(columns=['Churn'])

In [13]:
y=df['Churn']

                                         Encoding the Categorical Data
         Label Encoder for the Label(output)
         OneHotEncoder for the features

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
df_cat=x.select_dtypes(exclude=np.number)

In [16]:
df_cat_cols=df_cat.columns

In [17]:
df_cat_cols

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'TotalCharges'],
      dtype='object')

In [18]:
ohe=OneHotEncoder(drop='first')

In [19]:
trans=ColumnTransformer(transformers=[('ohe',ohe,df_cat_cols)],remainder="passthrough")

In [20]:
x_new=trans.fit_transform(x)

In [21]:
le=LabelEncoder()

In [22]:
y_new=le.fit_transform(y)

In [23]:
x_new

<7043x6559 sparse matrix of type '<class 'numpy.float64'>'
	with 82299 stored elements in Compressed Sparse Row format>

In [24]:
y_new

array([0, 0, 1, ..., 0, 1, 0])

# Tasks 2: Split Data for Training and Testing:

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
x_train,x_test,y_train,y_test=train_test_split(x_new,y_new,test_size=0.2,random_state=42)

# Tasks 4: Model Selection

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [28]:
model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = DecisionTreeClassifier()

In [29]:
from sklearn.metrics import accuracy_score

# Define a function for each metric
# R²
def acc_score(test, pred):
    
    acc_ = accuracy_score(test, pred)
    return acc_


# Print the scores
def print_score(test, pred, model):

    print(f"**** Classifier: {model} ****")
    print(f"ACCURACY: {accuracy_score(test, pred)}")

In [30]:
model1.fit(x_train,y_train)

In [31]:
y_pred=model1.predict(x_test)

In [32]:
print_score(y_test, y_pred, "Logistic")

**** Classifier: Logistic ****
ACCURACY: 0.8062455642299503


In [33]:
model_list = []
acc_list = []

model_list.append(model1.__class__.__name__)
acc_list.append(round(acc_score(y_test, y_pred), 3))

In [34]:
model2.fit(x_train,y_train)

In [35]:
Y_pred1=model2.predict(x_test)

In [36]:
print_score(y_test, Y_pred1, "RandomForest")

**** Classifier: RandomForest ****
ACCURACY: 0.794180269694819


In [37]:
model_list.append(model2.__class__.__name__)
acc_list.append(round(acc_score(y_test, Y_pred1), 3))

In [38]:
model3.fit(x_train,y_train)

In [39]:
Y_Pred=model3.predict(x_test)

In [40]:
print_score(y_test, Y_Pred, "DecisionTree")

**** Classifier: DecisionTree ****
ACCURACY: 0.7743080198722498


In [41]:
model_list.append(model3.__class__.__name__)
acc_list.append(round(acc_score(y_test, Y_Pred), 3))

In [42]:
model_results = pd.DataFrame({"Model": model_list,
                              "Accuracy_Score": acc_list,
                              })

In [43]:
model_results

Unnamed: 0,Model,Accuracy_Score
0,LogisticRegression,0.806
1,RandomForestClassifier,0.794
2,DecisionTreeClassifier,0.774


Logistic Regression performs best

# Tasks 5: Model Training

In [44]:
lr=LogisticRegression()
lr.fit(x_train,y_train)

In [45]:
Y_PRED=lr.predict(x_test)

# Tasks 6: Model Evaluation

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
from sklearn.metrics import precision_score

In [48]:
from sklearn.metrics import recall_score

In [49]:
from sklearn.metrics import f1_score

In [50]:
accuracy=accuracy_score(y_test,Y_PRED)

In [51]:
precision = precision_score(y_test, Y_PRED, average='binary')

In [52]:
recall = recall_score(y_test, Y_PRED, average='binary')

In [53]:
f1=f1_score(y_test,Y_PRED,average='binary')

In [54]:
accuracy

0.8062455642299503

In [55]:
data=[['ACCURACY_SCORE',accuracy],['PRESICION_SCORE',precision],['RECALL_SCORE',recall],['F!_SCORE',f1]]

In [56]:
data_f=pd.DataFrame(data,columns=['Preformance measure','Score'])

In [57]:
data_f

Unnamed: 0,Preformance measure,Score
0,ACCURACY_SCORE,0.806246
1,PRESICION_SCORE,0.650602
2,RECALL_SCORE,0.579088
3,F!_SCORE,0.612766
