#### 1.Importing Libraries

In [103]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

#### 2. Data Loading

In [65]:
df = pd.read_csv('../data/raw/telco.csv')

#### 3. Initial Data Exploration

In [66]:
df.sample(10)

Unnamed: 0,Customer ID,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,Country,State,...,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Score,CLTV,Churn Category,Churn Reason
6830,0516-OOHAR,Male,39,No,No,Yes,Yes,2,United States,California,...,0,216.44,3015.44,3,Stayed,No,50,3411,,
4894,4633-MKHYU,Female,23,Yes,No,No,No,0,United States,California,...,0,319.05,1107.65,5,Stayed,No,32,2952,,
2510,1750-CSKKM,Male,40,No,No,No,Yes,3,United States,California,...,0,369.18,818.93,1,Churned,Yes,70,2688,Competitor,Competitor had better devices
6904,6300-BWMJX,Female,45,No,No,Yes,No,0,United States,California,...,0,1226.02,6576.45,3,Stayed,No,44,4814,,
3511,3012-VFFMN,Female,61,No,No,Yes,Yes,1,United States,California,...,0,35.07,194.02,3,Stayed,No,74,4058,,
5465,5827-MWCZK,Male,39,No,No,Yes,Yes,3,United States,California,...,0,1317.12,6749.32,5,Stayed,No,66,6484,,
3545,2782-JEEBU,Male,39,No,No,No,No,0,United States,California,...,0,0.0,780.15,4,Stayed,No,64,3378,,
4350,8857-CUPFQ,Male,57,No,No,Yes,No,0,United States,California,...,0,1094.94,2332.59,3,Stayed,No,20,4538,,
2808,8734-DKSTZ,Female,20,Yes,No,Yes,Yes,2,United States,California,...,0,65.0,923.6,3,Stayed,No,20,4922,,
1563,8624-GIOUT,Female,40,No,No,No,No,0,United States,California,...,0,1757.63,4691.93,1,Churned,Yes,78,6006,Other,Don't know


In [67]:
df.shape


(7043, 50)

In [68]:
df.columns

Index(['Customer ID', 'Gender', 'Age', 'Under 30', 'Senior Citizen', 'Married',
       'Dependents', 'Number of Dependents', 'Country', 'State', 'City',
       'Zip Code', 'Latitude', 'Longitude', 'Population', 'Quarter',
       'Referred a Friend', 'Number of Referrals', 'Tenure in Months', 'Offer',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Satisfaction Score', 'Customer Status', 'Churn Label',
       'Churn Score', 'CLTV', 'Churn Category', 'Churn Reason'],
      dtype='object')

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             7043 non-null   int64  
 1   Age                                7043 non-null   int64  
 2   Married                            7043 non-null   int64  
 3   Number of Dependents               7043 non-null   int64  
 4   City                               7043 non-null   object 
 5   Referred a Friend                  7043 non-null   int64  
 6   Number of Referrals                7043 non-null   int64  
 7   Tenure in Months                   7043 non-null   int64  
 8   Offer                              7043 non-null   float64
 9   Phone Service                      7043 non-null   int64  
 10  Avg Monthly Long Distance Charges  7043 non-null   float64
 11  Multiple Lines                     7043 non-null   int64

#### 4. Data Cleaning: Dropping Unnecessary Columns

In [69]:
df.drop(columns=["Customer ID", "Latitude", "Longitude","Zip Code","Churn Reason","Churn Category","CLTV","Churn Score","Customer Status"], inplace=True)

#### 5. Handling Missing Values

In [70]:
df.isnull().sum() * 100 / df.shape[0]

Gender                                0.000000
Age                                   0.000000
Under 30                              0.000000
Senior Citizen                        0.000000
Married                               0.000000
Dependents                            0.000000
Number of Dependents                  0.000000
Country                               0.000000
State                                 0.000000
City                                  0.000000
Population                            0.000000
Quarter                               0.000000
Referred a Friend                     0.000000
Number of Referrals                   0.000000
Tenure in Months                      0.000000
Offer                                55.047565
Phone Service                         0.000000
Avg Monthly Long Distance Charges     0.000000
Multiple Lines                        0.000000
Internet Service                      0.000000
Internet Type                        21.666903
Avg Monthly G

In [72]:
df['Internet Type'] = df['Internet Type'].fillna("No Internet Service")

In [73]:
df['Offer'] = df['Offer'].fillna("No Offer")

#### 6. Dropping Redundant Columns

In [74]:
df.drop(columns=["Senior Citizen", "Under 30"], inplace=True)

##### 7. Feature Engineering: Add-On Services

Original 8 columns: Online Security, Online Backup, Device Protection Plan, Premium Tech Support, Streaming TV, Streaming Movies, Streaming Music, Unlimited Data.

Reduced to 3 columns:

Security_Backup_Protection → counts security/backup/protection add-ons.

Streaming_Services → counts streaming services.

Unlimited_Data → binary (Yes → 1, No → 0).

In [75]:
df['Security_Backup_Protection'] = df[['Online Security', 'Online Backup', 
                                       'Device Protection Plan', 'Premium Tech Support']].apply(lambda x: x.eq('Yes').sum(), axis=1)

df['Streaming_Services'] = df[['Streaming TV', 'Streaming Movies', 'Streaming Music']].apply(lambda x: x.eq('Yes').sum(), axis=1)


df['Unlimited_Data'] = df['Unlimited Data'].replace({'Yes':1, 'No':0})

# Drop the original 8 columns
df.drop(columns=['Online Security', 'Online Backup', 'Device Protection Plan', 
                 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 
                 'Streaming Music', 'Unlimited Data'], inplace=True)


df.head()


Unnamed: 0,Gender,Age,Married,Dependents,Number of Dependents,Country,State,City,Population,Quarter,...,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Churn Label,Security_Backup_Protection,Streaming_Services,Unlimited_Data
0,Male,78,No,No,0,United States,California,Los Angeles,68701,Q3,...,39.65,0.0,20,0.0,59.65,3,Yes,1,1,0
1,Female,74,Yes,Yes,1,United States,California,Los Angeles,55668,Q3,...,633.3,0.0,0,390.8,1024.1,3,Yes,1,0,1
2,Male,71,No,Yes,3,United States,California,Los Angeles,47534,Q3,...,1752.55,45.61,0,203.94,1910.88,2,Yes,0,3,1
3,Female,78,Yes,Yes,1,United States,California,Inglewood,27778,Q3,...,2514.5,13.43,0,494.0,2995.07,2,Yes,2,2,1
4,Female,80,Yes,Yes,1,United States,California,Whittier,26265,Q3,...,2868.15,0.0,0,234.21,3102.36,2,Yes,0,0,1


#### 8. Dropping More Unnecessary Columns

In [None]:
df.drop(columns=["Country", "State", "Population","Dependents","Total Long Distance Charges","Quarter"], inplace=True)

In [77]:
df.shape

(7043, 29)

In [79]:
for c in df.columns:
    print(c)
    print(df[c].unique())
    print("-" * 20)

Gender
['Male' 'Female']
--------------------
Age
[78 74 71 80 72 76 66 70 77 65 67 68 69 79 75 73 37 19 31 23 38 21 29 61
 27 20 56 51 48 32 34 41 30 26 62 64 45 53 63 42 24 54 39 43 50 22 40 47
 60 52 55 59 49 58 25 28 33 44 57 46 36 35]
--------------------
Married
['No' 'Yes']
--------------------
Number of Dependents
[0 1 3 2 5 4 6 7 8 9]
--------------------
City
['Los Angeles' 'Inglewood' 'Whittier' ... 'Topaz' 'Jacumba' 'Holtville']
--------------------
Quarter
['Q3']
--------------------
Referred a Friend
['No' 'Yes']
--------------------
Number of Referrals
[ 0  1  6  2  4  3  7  5  8  9 10 11]
--------------------
Tenure in Months
[ 1  8 18 25 37 27 58 15  7 11  3 13 16 24  4 32 54  2 21 55  5 20 65 49
 43 56 52 33 30 46  9 10 35 44 62 14 12 17 39 68 34 22 19 23 42 41 57 38
 29 66 69 40 31 59 36 45 53 70 47 50 51 72 26  6 28 48 61 71 60 64 67 63]
--------------------
Offer
['No Offer' 'Offer E' 'Offer D' 'Offer C' 'Offer B' 'Offer A']
--------------------
Phone Service
['No'

#### 9. Encoding Binary and Categorical Columns


In [80]:
df['Referred a Friend'] = df['Referred a Friend'].map({'Yes': 1, 'No': 0})
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
df['Paperless Billing'] = df['Paperless Billing'].map({'Yes': 1, 'No': 0})
df['Internet Service'] = df['Internet Service'].map({'Yes': 1, 'No': 0})
df['Multiple Lines'] = df['Multiple Lines'].map({'Yes': 1, 'No': 0})
df['Phone Service'] = df['Phone Service'].map({'Yes': 1, 'No': 0})
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

In [81]:
df.head()

Unnamed: 0,Gender,Age,Married,Number of Dependents,City,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,...,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Revenue,Satisfaction Score,Churn Label,Security_Backup_Protection,Streaming_Services,Unlimited_Data
0,1,78,0,0,Los Angeles,Q3,0,0,1,No Offer,...,39.65,39.65,0.0,20,59.65,3,Yes,1,1,0
1,0,74,1,1,Los Angeles,Q3,1,1,8,Offer E,...,80.65,633.3,0.0,0,1024.1,3,Yes,1,0,1
2,1,71,0,3,Los Angeles,Q3,0,0,18,Offer D,...,95.45,1752.55,45.61,0,1910.88,2,Yes,0,3,1
3,0,78,1,1,Inglewood,Q3,1,1,25,Offer C,...,98.5,2514.5,13.43,0,2995.07,2,Yes,2,2,1
4,0,80,1,1,Whittier,Q3,1,1,37,Offer C,...,76.5,2868.15,0.0,0,3102.36,2,Yes,0,0,1


In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 29 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             7043 non-null   int64  
 1   Age                                7043 non-null   int64  
 2   Married                            7043 non-null   int64  
 3   Number of Dependents               7043 non-null   int64  
 4   City                               7043 non-null   object 
 5   Quarter                            7043 non-null   object 
 6   Referred a Friend                  7043 non-null   int64  
 7   Number of Referrals                7043 non-null   int64  
 8   Tenure in Months                   7043 non-null   int64  
 9   Offer                              7043 non-null   object 
 10  Phone Service                      7043 non-null   int64  
 11  Avg Monthly Long Distance Charges  7043 non-null   float

#### 10. Ordinal Encoding for 'Contract' and 'Offer'

In [83]:
df['Contract'] = df['Contract'].str.strip().str.title()

od = OrdinalEncoder(categories=[['Month-To-Month', 'One Year', 'Two Year']],)
df['Contract'] = od.fit_transform(df[['Contract']])
df['Contract'].unique()

array([0., 1., 2.])

In [84]:
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_cols

Index(['City', 'Quarter', 'Offer', 'Internet Type', 'Payment Method',
       'Churn Label'],
      dtype='object')

In [85]:
od = OrdinalEncoder(categories=[['No Offer', 'Offer A', 'Offer B','Offer C','Offer D','Offer E']],)
df['Offer'] = od.fit_transform(df[['Offer']])
df['Offer'].unique()

array([0., 5., 4., 3., 2., 1.])

In [87]:
df['Internet Type'].unique()

array(['DSL', 'Fiber Optic', 'Cable', 'No Internet Service'], dtype=object)

#### 11. One-Hot Encoding for 'Internet Type' and 'Payment Method'

In [88]:
df = pd.get_dummies(df, columns=['Internet Type'], drop_first=False, dtype=int)

In [89]:
df = pd.get_dummies(df, columns=['Payment Method'], drop_first=False, dtype=int)

In [90]:
df.columns

Index(['Gender', 'Age', 'Married', 'Number of Dependents', 'City',
       'Referred a Friend', 'Number of Referrals', 'Tenure in Months', 'Offer',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Avg Monthly GB Download', 'Contract',
       'Paperless Billing', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Revenue', 'Satisfaction Score',
       'Churn Label', 'Security_Backup_Protection', 'Streaming_Services',
       'Unlimited_Data', 'Internet Type_Cable', 'Internet Type_DSL',
       'Internet Type_Fiber Optic', 'Internet Type_No Internet Service',
       'Payment Method_Bank Withdrawal', 'Payment Method_Credit Card',
       'Payment Method_Mailed Check'],
      dtype='object')

#### 13. Data Visualization

In [91]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Set the aesthetic style of the plots
# sns.set(style="whitegrid")

# # Visualize the distribution of the target variable 'Churn Label'
# plt.figure(figsize=(8, 5))
# sns.countplot(data=df, x='Churn Label', palette='pastel')
# plt.title('Distribution of Churn Label')
# plt.xlabel('Churn Label')
# plt.ylabel('Count')
# plt.show()

# # Visualize the correlation heatmap
# plt.figure(figsize=(12, 8))
# correlation_matrix = df.corr(numeric_only=True)
# sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
# plt.title('Correlation Heatmap')
# plt.show()

# # Visualize the relationship between 'Monthly Charges' and 'Churn Label'
# plt.figure(figsize=(10, 6))
# sns.boxplot(data=df, x='Churn Label', y='Monthly Charges', palette='Set2')
# plt.title('Monthly Charges vs Churn Label')
# plt.xlabel('Churn Label')
# plt.ylabel('Monthly Charges')
# plt.show()

#### 14. Model Preparation

In [93]:
X = df.drop(columns=["Churn Label","City"])
y = df['Churn Label']

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### 15. Decision Tree Classifier

In [95]:
d = DecisionTreeClassifier(max_depth=5)

In [96]:
d.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [97]:
y_pred = d.predict(X_test)

In [98]:
print(confusion_matrix(y_test,y_pred))

[[986  23]
 [ 58 342]]


In [99]:
print(accuracy_score(y_test, y_pred))

0.9425124201561391


In [100]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.94      0.98      0.96      1009
         Yes       0.94      0.85      0.89       400

    accuracy                           0.94      1409
   macro avg       0.94      0.92      0.93      1409
weighted avg       0.94      0.94      0.94      1409



In [101]:
scores = cross_val_score(d, X, y, cv=5, scoring='accuracy')
print(scores)

[0.80056778 0.9418027  0.94464159 0.94886364 0.93678977]


#### 16. Random Forest Classifier

In [102]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

X = df.drop(columns=["Churn Label","City"])
y = df['Churn Label']

rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)

scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean Accuracy:", np.mean(scores))
print("Std Dev:", np.std(scores))


Cross-validation scores: [0.9020582  0.95031938 0.95741661 0.95809659 0.95454545]
Mean Accuracy: 0.944487245144848
Std Dev: 0.021390596646971915
