In [1]:
import pandas as pd                  # For data manipulation and analysis using DataFrames (e.g., loading CSVs, filtering, grouping)\n",
import numpy as np                   # For numerical operations, and mathematical functions used in data preprocessing\n",
import seaborn as sns                # For advanced, beautiful statistical visualizations (like heatmaps, boxplots, etc.)\n",
import matplotlib.pyplot as plt      # For creating plots and graphs (line charts, bar charts, confusion matrix visualization)\n",
import joblib                        # For saving and loading trained models\n",
from sklearn.model_selection import train_test_split   # For splitting the dataset into training and testing sets\n",
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
df= pd.read_csv("telecom_churn.csv")

In [4]:
df.shape

(3333, 11)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Churn            3333 non-null   int64  
 1   AccountWeeks     3333 non-null   int64  
 2   ContractRenewal  3333 non-null   int64  
 3   DataPlan         3333 non-null   int64  
 4   DataUsage        3333 non-null   float64
 5   CustServCalls    3333 non-null   int64  
 6   DayMins          3333 non-null   float64
 7   DayCalls         3333 non-null   int64  
 8   MonthlyCharge    3333 non-null   float64
 9   OverageFee       3333 non-null   float64
 10  RoamMins         3333 non-null   float64
dtypes: float64(5), int64(6)
memory usage: 286.6 KB


In [6]:
df.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [7]:
df.tail()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
3328,0,192,1,1,2.67,2,156.2,77,71.7,10.78,9.9
3329,0,68,1,0,0.34,3,231.1,57,56.4,7.67,9.6
3330,0,28,1,0,0.0,2,180.8,109,56.0,14.44,14.1
3331,0,184,0,0,0.0,2,213.8,105,50.0,7.98,5.0
3332,0,74,1,1,3.7,0,234.4,113,100.0,13.3,13.7


In [8]:
df.tail()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
3328,0,192,1,1,2.67,2,156.2,77,71.7,10.78,9.9
3329,0,68,1,0,0.34,3,231.1,57,56.4,7.67,9.6
3330,0,28,1,0,0.0,2,180.8,109,56.0,14.44,14.1
3331,0,184,0,0,0.0,2,213.8,105,50.0,7.98,5.0
3332,0,74,1,1,3.7,0,234.4,113,100.0,13.3,13.7


In [9]:
df.isnull().sum()

Churn              0
AccountWeeks       0
ContractRenewal    0
DataPlan           0
DataUsage          0
CustServCalls      0
DayMins            0
DayCalls           0
MonthlyCharge      0
OverageFee         0
RoamMins           0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,0.144914,101.064806,0.90309,0.276628,0.816475,1.562856,179.775098,100.435644,56.305161,10.051488,10.237294
std,0.352067,39.822106,0.295879,0.447398,1.272668,1.315491,54.467389,20.069084,16.426032,2.535712,2.79184
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0
25%,0.0,74.0,1.0,0.0,0.0,1.0,143.7,87.0,45.0,8.33,8.5
50%,0.0,101.0,1.0,0.0,0.0,1.0,179.4,101.0,53.5,10.07,10.3
75%,0.0,127.0,1.0,1.0,1.78,2.0,216.4,114.0,66.2,11.77,12.1
max,1.0,243.0,1.0,1.0,5.4,9.0,350.8,165.0,111.3,18.19,20.0


In [11]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3328    False
3329    False
3330    False
3331    False
3332    False
Length: 3333, dtype: bool

In [12]:
for col in df.columns:
    print(f"\nColumn: {col}")
print(df[col].value_counts(dropna=False)) 



Column: Churn

Column: AccountWeeks

Column: ContractRenewal

Column: DataPlan

Column: DataUsage

Column: CustServCalls

Column: DayMins

Column: DayCalls

Column: MonthlyCharge

Column: OverageFee

Column: RoamMins
RoamMins
10.0    62
11.3    59
9.8     56
10.9    56
10.1    53
        ..
2.7      1
18.9     1
17.2     1
1.3      1
2.5      1
Name: count, Length: 162, dtype: int64


In [13]:
# Features and Target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 
# random_state=42 is used to control randomness in functions that involve random splitting or shuffling

In [14]:
## Model Training
# We are traing Random forest,gradient boosting and logistic regression

In [15]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Standard Scaler is used to standardize the data to be in a scale of (0,1), which means feature has mean = 0, and standard deviation = 1. This helps 
# to improve the model performance

In [16]:
X_train

Unnamed: 0,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
2016,80,1,0,0.23,3,202.4,118,59.3,13.01,9.2
1362,63,1,0,0.00,3,132.9,122,29.0,3.35,9.9
2670,116,1,1,2.43,2,221.0,108,75.3,7.55,9.0
2210,71,1,0,0.00,0,278.9,110,64.0,9.51,11.7
1846,120,1,1,3.11,1,177.9,117,77.1,8.76,11.5
...,...,...,...,...,...,...,...,...,...,...
1095,106,1,0,0.00,1,274.4,120,64.0,9.93,6.0
1130,122,1,0,0.00,1,35.1,62,22.0,9.04,12.7
1294,66,1,0,0.24,1,87.6,76,40.4,13.10,9.2
860,169,1,0,0.00,2,179.2,111,46.0,8.76,9.9


In [17]:
X_test

Unnamed: 0,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
438,113,1,0,0.00,1,155.0,93,55.0,16.53,13.5
2674,67,1,0,0.00,0,109.1,117,38.0,10.87,12.8
1345,98,1,0,0.00,4,0.0,0,14.0,7.98,6.8
1957,147,1,0,0.33,1,212.8,79,57.3,10.21,10.2
2148,96,1,0,0.30,1,144.0,102,47.0,11.24,10.0
...,...,...,...,...,...,...,...,...,...,...
3080,82,1,0,0.21,0,135.4,102,46.1,11.86,17.5
2548,117,1,1,3.21,1,153.2,112,81.1,13.17,11.9
2916,104,1,0,0.30,2,113.6,87,36.0,7.93,10.5
2655,64,1,0,0.40,1,219.2,73,56.0,8.35,10.0


In [18]:

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
dt_model= DecisionTreeClassifier(max_depth=5, random_state=42)


# Train models
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [19]:
#Model Evaluation

In [20]:
models = {
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
    'Decision Tree': dt_model
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} Results")
    print("Accuracy:", acc)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion matrix:\n",confusion_matrix(y_test,y_pred))


Random Forest Results
Accuracy: 0.937
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96       857
           1       0.91      0.62      0.74       143

    accuracy                           0.94      1000
   macro avg       0.92      0.81      0.85      1000
weighted avg       0.94      0.94      0.93      1000

Confusion matrix:
 [[848   9]
 [ 54  89]]

Gradient Boosting Results
Accuracy: 0.943
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       857
           1       0.89      0.69      0.77       143

    accuracy                           0.94      1000
   macro avg       0.92      0.84      0.87      1000
weighted avg       0.94      0.94      0.94      1000

Confusion matrix:
 [[845  12]
 [ 45  98]]

Decision Tree Results
Accuracy: 0.923
Classification Report:
               precision    recall  f1-score   support

           0  

In [21]:
print("Confusion matrix:\n",confusion_matrix(y_test,y_pred))

Confusion matrix:
 [[841  16]
 [ 61  82]]


In [22]:
y_test.shape

(1000,)

In [23]:
import joblib

In [24]:
joblib.dump(gb_model, "gb_model.pkl")
print("model saved successfully")

model saved successfully
