# Step 1 : Importing packages

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix , classification_report

# Step 2 : Load Dataset

In [34]:
df = pd.read_csv("7 churn.csv")

In [13]:
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [14]:
df["Contract"].value_counts()  #It returns a pandas Series showing the count of each unique value in the "Contract" column, sorted in descending order of frequency by default.



Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

# Step 3 : Data Processing

### Peform data preprocessing tasks such as..

#### 1. Handling missing values.
#### 2. Encoding categorical values.
#### 3. Feature scaling

In [15]:
df.shape

(7043, 21)

In [17]:
df.isnull().sum()  #It returns a Series with the count of missing values (NaN) in each column of the DataFrame.

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [18]:
#With .sum() → column-wise counts

#Without .sum() → full True/False matrix

df.isnull()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7039,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7040,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7041,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Keeping important columns 

##### ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'Contract', 'TotalCharges', 'Churn']

In [35]:
# Define the columns to keep
columns_to_keep = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'Contract', 'TotalCharges', 'Churn']
# Select only the specified columns
df = df[columns_to_keep]

In [20]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,Month-to-month,29.85,No
1,Male,0,No,No,34,Yes,No,One year,1889.5,No
2,Male,0,No,No,2,Yes,No,Month-to-month,108.15,Yes
3,Male,0,No,No,45,No,No phone service,One year,1840.75,No
4,Female,0,No,No,2,Yes,No,Month-to-month,151.65,Yes


In [27]:
df["MultipleLines"].value_counts()

MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64

# Jugaru Techniques

In [None]:
#Jugaru technique 
binary_columns = ["Partner","Dependents","PhoneService","Churn"]

df[binary_columns] = df[binary_columns].replace({"Yes":1,"No":0})

In [None]:
df[["MultipleLines","Contract"]] = df[["MultipleLines","Contract"]].replace({"Yes":1,"No":0,"No phone service":2,"Month-to-month":1,"One year":2,"Two year":3})

In [None]:
df["gender"] = df["gender"].replace({"Male":1,"Female":0})

# Encode binary varaibles ( e.g., Yes/No columns)

#### binary_columns = ["Partner","Dependents","PhoneService","MultipleLines","Contract","Churn"]

In [39]:
#use LabelEncoder

from sklearn.preprocessing import LabelEncoder

#Initialize the LabelEncoder

le = LabelEncoder()

#List the columns to label encode

categorical_cols = ["Partner","Dependents","PhoneService","MultipleLines","Contract","Churn"]

#Apply label encoding to each column

for i in categorical_cols:
    df[i] = le.fit_transform(df[i])

In [40]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,0,0,1,0,1,0,2,0,29.85,0
1,1,0,0,0,34,1,0,1,1889.5,0
2,1,0,0,0,2,1,0,0,108.15,1
3,1,0,0,0,45,0,2,1,1840.75,0
4,0,0,0,0,2,1,0,0,151.65,1


In [41]:
df["MultipleLines"].value_counts()

MultipleLines
0    3390
1    2971
2     682
Name: count, dtype: int64

In [44]:
df["TotalCharges"].value_counts()

TotalCharges
20.2      11
          11
19.75      9
19.65      8
19.9       8
          ..
1990.5     1
7362.9     1
346.45     1
306.6      1
108.15     1
Name: count, Length: 6531, dtype: int64

In [48]:
df.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     0
Churn            0
dtype: int64

# Split the data into training and testing sets

In [42]:
#Split the data into features (X) and target(y)

X = df.drop("Churn",axis = 1)
y = df["Churn"]

In [63]:
#Split data into training and testing sets

X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [64]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5634 entries, 2142 to 860
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   gender         5634 non-null   int64 
 1   SeniorCitizen  5634 non-null   int64 
 2   Partner        5634 non-null   int64 
 3   Dependents     5634 non-null   int64 
 4   tenure         5634 non-null   int64 
 5   PhoneService   5634 non-null   int64 
 6   MultipleLines  5634 non-null   int64 
 7   Contract       5634 non-null   int64 
 8   TotalCharges   5634 non-null   object
dtypes: int64(8), object(1)
memory usage: 440.2+ KB


In [53]:
# Here the Dtype of TotalCharges is object which can cause problem in future so first convert it into float

In [65]:
# Convert 'TotalCharges' column to float, and handle errors='coerce' to replace non-numeric values with NaN
X_train['TotalCharges'] = pd.to_numeric(X_train['TotalCharges'], errors='coerce')
X_test['TotalCharges'] = pd.to_numeric(X_test['TotalCharges'], errors='coerce')

In [66]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5634 entries, 2142 to 860
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         5634 non-null   int64  
 1   SeniorCitizen  5634 non-null   int64  
 2   Partner        5634 non-null   int64  
 3   Dependents     5634 non-null   int64  
 4   tenure         5634 non-null   int64  
 5   PhoneService   5634 non-null   int64  
 6   MultipleLines  5634 non-null   int64  
 7   Contract       5634 non-null   int64  
 8   TotalCharges   5624 non-null   float64
dtypes: float64(1), int64(8)
memory usage: 440.2 KB


In [67]:
df.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     0
Churn            0
dtype: int64

In [68]:
# Replace missing values in the 'TotalCharges' column with the mean of the colum
X_train["TotalCharges"] = X_train["TotalCharges"].fillna(X_train["TotalCharges"].mean())
X_test["TotalCharges"] = X_test["TotalCharges"].fillna(X_test["TotalCharges"].mean())

In [69]:
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges
2142,0,0,0,1,21,1,0,1,1336.800000
1623,0,0,0,0,54,1,1,2,5129.450000
6074,1,0,1,0,1,0,2,0,23.450000
1362,1,0,0,0,4,1,0,0,237.950000
6754,1,0,0,1,0,1,1,2,2291.154605
...,...,...,...,...,...,...,...,...,...
3772,1,0,1,0,1,1,0,0,95.000000
5191,0,0,1,1,23,1,1,2,2198.300000
5226,1,0,1,1,12,1,0,0,306.050000
5390,1,1,0,0,12,1,1,0,1200.150000


# Standardize features ( optional but often beneficial for logistic regression) 

In [70]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [71]:
X_train

array([[-1.02516569, -0.4377492 , -0.96957859, ..., -0.94630024,
         0.37290835, -0.42210502],
       [-1.02516569, -0.4377492 , -0.96957859, ...,  0.57653768,
         1.5775905 ,  1.25536015],
       [ 0.97545208, -0.4377492 ,  1.03137591, ...,  2.0993756 ,
        -0.83177379, -1.00299144],
       ...,
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -0.94630024,
        -0.83177379, -0.87799925],
       [ 0.97545208,  2.28441306, -0.96957859, ...,  0.57653768,
        -0.83177379, -0.48254445],
       [ 0.97545208, -0.4377492 , -0.96957859, ..., -0.94630024,
         0.37290835, -0.81110232]], shape=(5634, 9))

# Logistic regression

In [75]:
lg = LogisticRegression()
lg.fit(X_train,y_train)

y_pred = lg.predict(X_test)

In [76]:
y_pred

array([1, 0, 0, ..., 0, 0, 1], shape=(1409,))

# Accuracy Score

### It tells you what fraction of predictions are correct.
#### What is a “good” value?

##### 100% (1.0) → Perfect (rare in real world, often means overfitting).

##### > 0.9 (90%) → Generally very good.

##### 0.7 – 0.9 (70–90%) → Acceptable depending on the problem.

##### < 0.7 (70%) → Often considered weak, unless the problem is very hard.

##### ~0.5 (50%) → Random guessing for binary classification.

In [77]:
from sklearn.metrics import accuracy_score

In [78]:
accuracy_score(y_test,y_pred)

0.7785663591199432

# save model

In [79]:
import pickle
pickle.dump(lg,open("logistic_model_basic_project.pkl","wb"))

In [80]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,0,0,1,0,1,0,2,0,29.85,0
1,1,0,0,0,34,1,0,1,1889.5,0
2,1,0,0,0,2,1,0,0,108.15,1
3,1,0,0,0,45,0,2,1,1840.75,0
4,0,0,0,0,2,1,0,0,151.65,1


# Classification system

In [89]:
def prediction(gender,Seniorcitizen,Partner,Dependents,tenure,Phoneservice,multiline,contact,totalcharge):
    data = {
            "gender" :[gender],
            "SeniorCitizen":[Seniorcitizen],
            "Partner":[Partner],
            "Dependents":[Dependents],
            "tenure":[tenure],
            "PhoneService":[Phoneservice],
            "MultipleLines":[multiline],
            "Contract":[contact],
            "TotalCharges":[totalcharge]
        }

    #Create a DataFrame from the dictionary

    df = pd.DataFrame(data)

    #Encode the categorial columns
    categorical_columns = ['gender',
                           'SeniorCitizen',
                           'Partner',
                           'Dependents',
                           'PhoneService',
                           'MultipleLines',
                           'Contract'
                          ]
    for i in categorical_columns:
        df[i] = le.fit_transform(df[i])

    df = scaler.fit_transform(df)

    result = lg.predict(df).reshape(1,-1)

    return result[0]
    

In [90]:
gender = "Female"
Seniorcitizen = "No"
Partner = "Yes"
Dependents = "No"
tenure = 1
Phoneservice="No"
multiline = "No phone service"
contact="Month-to-month"
totalcharge = 29.85
result = prediction(gender,Seniorcitizen,Partner,Dependents,tenure,Phoneservice,multiline,contact,totalcharge)

if result==1:
    print('churn')
else:
    print('Not churn')

Not churn
