## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Read data

In [2]:
data = pd.read_csv("churn.csv")
data.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
6724,7359-SSBJK,Female,1,No,No,64,Yes,No,DSL,Yes,...,No,Yes,Yes,No,Two year,Yes,Credit card (automatic),70.2,4481.0,Yes
1064,3800-LYTRK,Female,0,No,No,14,Yes,No,DSL,Yes,...,No,Yes,No,No,One year,No,Mailed check,55.7,795.15,No
2714,6838-YAUVY,Female,0,No,No,54,Yes,Yes,Fiber optic,No,...,Yes,Yes,No,Yes,Two year,Yes,Bank transfer (automatic),95.1,5064.85,No
3452,8950-MTZNV,Male,0,No,No,1,Yes,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,44.95,44.95,No
6512,1353-LJWEM,Male,0,No,No,11,Yes,No,DSL,No,...,Yes,Yes,No,No,Month-to-month,Yes,Electronic check,60.9,688.5,No


### Exploratory data Analysis

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
data.shape

(7043, 21)

In [5]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [7]:
# distribution of customers who churn

data['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [12]:
# check the nulls
data.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [9]:
# Compare the values for both churns
data.groupby('Churn').mean()

Unnamed: 0_level_0,SeniorCitizen,tenure,MonthlyCharges
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,0.128721,37.569965,61.265124
Yes,0.254682,17.979133,74.441332


Frauds transactions are more than the legit transactions

### Data Preprocessing

In [10]:
df = data.copy()

In [25]:
## Encoding the data
from sklearn.preprocessing import LabelEncoder

cat_cols =[x for x in df.columns if df[x].dtype=="object"]

le=LabelEncoder()

df = df.apply(lambda cat_cols: le.fit_transform(cat_cols.astype(str)), axis=0, result_type='expand')

In [26]:
df.drop(columns="customerID",inplace=True)

KeyError: "['customerID'] not found in axis"

In [27]:
df.sample(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
498,0,1,0,0,45,0,2,1,0,1,0,0,0,0,0,0,0,428,568,1
1287,0,0,1,1,24,1,1,2,0,1,1,1,1,1,0,1,0,142,2822,0
4663,0,1,0,0,34,1,1,2,0,0,0,0,0,0,0,1,0,1109,2793,1
1372,1,0,0,0,16,1,0,2,0,1,0,0,1,1,0,0,1,1498,1812,0
2837,0,0,0,0,1,1,0,0,2,2,2,2,2,2,0,0,1,343,1616,1


In [29]:
X = df.drop(columns = 'Churn',axis = 1)
y = df['Churn']

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   gender            7043 non-null   int32
 1   SeniorCitizen     7043 non-null   int32
 2   Partner           7043 non-null   int32
 3   Dependents        7043 non-null   int32
 4   tenure            7043 non-null   int32
 5   PhoneService      7043 non-null   int32
 6   MultipleLines     7043 non-null   int32
 7   InternetService   7043 non-null   int32
 8   OnlineSecurity    7043 non-null   int32
 9   OnlineBackup      7043 non-null   int32
 10  DeviceProtection  7043 non-null   int32
 11  TechSupport       7043 non-null   int32
 12  StreamingTV       7043 non-null   int32
 13  StreamingMovies   7043 non-null   int32
 14  Contract          7043 non-null   int32
 15  PaperlessBilling  7043 non-null   int32
 16  PaymentMethod     7043 non-null   int32
 17  MonthlyCharges    7043 non-null  

### Modelling

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

In [30]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, stratify = y, random_state = 23)
print(X_train.shape, X_test.shape, y_train.shape,y_test.shape)

(5634, 19) (1409, 19) (5634,) (1409,)


In [31]:
lr =  LogisticRegression()
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [32]:
preds = lr.predict(X_test)

In [33]:
pred = accuracy_score(preds, y_test)
pred

0.7821149751596878

In [34]:
# Create new data out of the samples
df.to_csv("data1.csv")
df.to_csv("data2.csv")

In [38]:

df1 = df[df['tenure'] < 12] # Customers less than a year
df2 = df[df['tenure'] > 12] # Customers less than a year
print(df1.shape)
print(df2.shape)

(1577, 20)
(5228, 20)


In [39]:
# Create new data out of the samples
df1.to_csv("data1.csv")
df2.to_csv("data2.csv")