In [69]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [70]:
path = "data/data.xlsx"
df = pd.read_excel(path)
df = pd.DataFrame(df)
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

In [72]:
df.shape

(7043, 33)

In [73]:
df.describe()

Unnamed: 0,Count,Zip Code,Latitude,Longitude,Tenure Months,Monthly Charges,Churn Value,Churn Score,CLTV
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,1.0,93521.964646,36.282441,-119.79888,32.371149,64.761692,0.26537,58.699418,4400.295755
std,0.0,1865.794555,2.455723,2.157889,24.559481,30.090047,0.441561,21.525131,1183.057152
min,1.0,90001.0,32.555828,-124.301372,0.0,18.25,0.0,5.0,2003.0
25%,1.0,92102.0,34.030915,-121.815412,9.0,35.5,0.0,40.0,3469.0
50%,1.0,93552.0,36.391777,-119.730885,29.0,70.35,0.0,61.0,4527.0
75%,1.0,95351.0,38.224869,-118.043237,55.0,89.85,1.0,75.0,5380.5
max,1.0,96161.0,41.962127,-114.192901,72.0,118.75,1.0,100.0,6500.0


In [74]:
df.isnull().sum()

CustomerID              0
Count                   0
Country                 0
State                   0
City                    0
Zip Code                0
Lat Long                0
Latitude                0
Longitude               0
Gender                  0
Senior Citizen          0
Partner                 0
Dependents              0
Tenure Months           0
Phone Service           0
Multiple Lines          0
Internet Service        0
Online Security         0
Online Backup           0
Device Protection       0
Tech Support            0
Streaming TV            0
Streaming Movies        0
Contract                0
Paperless Billing       0
Payment Method          0
Monthly Charges         0
Total Charges           0
Churn Label             0
Churn Value             0
Churn Score             0
CLTV                    0
Churn Reason         5174
dtype: int64

In [75]:
# Drop columns that are not relevant for model training
df = df.drop(columns=['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code', 'Lat Long', 'Churn Reason'])


In [76]:
df.shape

(7043, 25)

In [77]:
""" 
- now we need to use the principle of "DRY" and create a function that will return all the numerical columns in the data
- the function receie the data Frame as an argument
- the function will return a list of the numerical columns
- the function will be called "get_numerical_columns"

"""
def get_numerical_columns(data_frame):
    """
    This function returns a list of the numerical columns in the data frame
    """
    return data_frame.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_columns=get_numerical_columns(df)
numerical_columns

['Latitude',
 'Longitude',
 'Tenure Months',
 'Monthly Charges',
 'Churn Value',
 'Churn Score',
 'CLTV']

In [78]:
""" 
Q- the Total Charges columns is numerical in fact but dtype is object
    - so we need to convert it to numerical
"""
df["Total Charges"]=pd.to_numeric(df["Total Charges"],errors="coerce")
df["Total Charges"].dtype

dtype('float64')

In [79]:
from sklearn.preprocessing import StandardScaler

def scale_numerical_columns(df):
    scaler = StandardScaler()
    numerical_columns = get_numerical_columns(df)
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df



df_scaled = scale_numerical_columns(df)

print(df_scaled[numerical_columns].head())


   Latitude  Longitude  Tenure Months  Monthly Charges  Churn Value  \
0 -0.944111   0.707268      -1.236724        -0.362660     1.663829   
1 -0.905362   0.691215      -1.236724         0.197365     1.663829   
2 -0.909951   0.697457      -0.992402         1.159546     1.663829   
3 -0.904204   0.687374      -0.177995         1.330711     1.663829   
4 -0.913530   0.710276       0.677133         1.294151     1.663829   

   Churn Score      CLTV  
0     1.268402 -0.981675  
1     0.385650 -1.436462  
2     1.268402  0.821409  
3     1.175481  0.509483  
4     1.407784  0.794358  


In [85]:
from typing import List
def get_categorical_columns(df: pd.DataFrame) -> List[str]:
    return df.select_dtypes(include=['object', 'category']).columns.tolist()

categorical_columns=get_categorical_columns(df)
categorical_columns.remove('Churn Label')
categorical_columns

['Gender',
 'Senior Citizen',
 'Partner',
 'Dependents',
 'Phone Service',
 'Multiple Lines',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method']

In [90]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df_encoded.head()


Unnamed: 0,Latitude,Longitude,Tenure Months,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Gender_Male,...,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Paperless Billing_Yes,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,-0.944111,0.707268,-1.236724,-0.36266,-0.959649,Yes,1.663829,1.268402,-0.981675,True,...,False,False,False,False,False,False,True,False,False,True
1,-0.905362,0.691215,-1.236724,0.197365,-0.940457,Yes,1.663829,0.38565,-1.436462,False,...,False,False,False,False,False,False,True,False,True,False
2,-0.909951,0.697457,-0.992402,1.159546,-0.645369,Yes,1.663829,1.268402,0.821409,False,...,False,True,False,True,False,False,True,False,True,False
3,-0.904204,0.687374,-0.177995,1.330711,0.336516,Yes,1.663829,1.175481,0.509483,False,...,False,True,False,True,False,False,True,False,True,False
4,-0.91353,0.710276,0.677133,1.294151,1.214589,Yes,1.663829,1.407784,0.794358,True,...,False,True,False,True,False,False,True,False,False,False


In [91]:
df_encoded['Churn Label'] = df_encoded['Churn Label'].apply(lambda x: 1 if x == 'Yes' else 0)

In [92]:
df_encoded.shape


(7043, 36)

In [93]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 36 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Latitude                                7043 non-null   float64
 1   Longitude                               7043 non-null   float64
 2   Tenure Months                           7043 non-null   float64
 3   Monthly Charges                         7043 non-null   float64
 4   Total Charges                           7032 non-null   float64
 5   Churn Label                             7043 non-null   int64  
 6   Churn Value                             7043 non-null   float64
 7   Churn Score                             7043 non-null   float64
 8   CLTV                                    7043 non-null   float64
 9   Gender_Male                             7043 non-null   bool   
 10  Senior Citizen_Yes                      7043 non-null   bool

In [103]:
df_encoded["Churn Value"].info()



<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: Churn Value
Non-Null Count  Dtype  
--------------  -----  
7043 non-null   float64
dtypes: float64(1)
memory usage: 55.2 KB


In [106]:
df_encoded['Total Charges']=df_encoded['Total Charges'].fillna(df_encoded['Total Charges'].mean())

In [111]:
from sklearn.model_selection import train_test_split

# Splitting the data into features (X) and target (y)
X = df_encoded.drop('Churn Label', axis=1)
y = df_encoded['Churn Label']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Printing the shapes of each variable
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (5634, 35)
X_test shape: (1409, 35)
y_train shape: (5634,)
y_test shape: (1409,)


NameError: name 'make_pipeline' is not defined

In [109]:
X.isnull().sum() 



Latitude                                  0
Longitude                                 0
Tenure Months                             0
Monthly Charges                           0
Total Charges                             0
Churn Value                               0
Churn Score                               0
CLTV                                      0
Gender_Male                               0
Senior Citizen_Yes                        0
Partner_Yes                               0
Dependents_Yes                            0
Phone Service_Yes                         0
Multiple Lines_No phone service           0
Multiple Lines_Yes                        0
Internet Service_Fiber optic              0
Internet Service_No                       0
Online Security_No internet service       0
Online Security_Yes                       0
Online Backup_No internet service         0
Online Backup_Yes                         0
Device Protection_No internet service     0
Device Protection_Yes           

In [110]:
"""
Q- Create a train and evaluate pipeline for the model

Steps:
    - Standardize the features: Use "StandardScaler" to scale the features.
    - Model Selection: Use "LogisticRegression" as the initial model
    - Model Evaluation: Train the model
    - Make predictions: Predict on the test data
    - Return the accuracy

"""

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

def train_evaluate_pipeline(X_train, X_test, y_train, y_test):
    # Create the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression())
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Usage example:
accuracy = train_evaluate_pipeline(X_train, X_test, y_train, y_test)
print(f"Model accuracy: {accuracy}")

Model accuracy: 1.0
