In [82]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# OneHotEncoder is not needed if using pd.get_dummies()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # import accuracy_score


In [83]:
# Load data
telco_demog = pd.read_csv('telecom_demographics.csv')
telco_usage = pd.read_csv('telecom_usage.csv')


In [84]:
# Join data
churn_df = telco_demog.merge(telco_usage, on='customer_id')

In [85]:
# Identify churn rate
churn_rate = churn_df['churn'].value_counts() / len(churn_df)
print(churn_rate*100)

churn
0    79.953846
1    20.046154
Name: count, dtype: float64


In [86]:
# Identify categorical variables
print(churn_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         6500 non-null   int64 
 1   telecom_partner     6500 non-null   object
 2   gender              6500 non-null   object
 3   age                 6500 non-null   int64 
 4   state               6500 non-null   object
 5   city                6500 non-null   object
 6   pincode             6500 non-null   int64 
 7   registration_event  6500 non-null   object
 8   num_dependents      6500 non-null   int64 
 9   estimated_salary    6500 non-null   int64 
 10  calls_made          6500 non-null   int64 
 11  sms_sent            6500 non-null   int64 
 12  data_used           6500 non-null   int64 
 13  churn               6500 non-null   int64 
dtypes: int64(9), object(5)
memory usage: 711.1+ KB
None


In [87]:
churn_df.duplicated().sum()

0

In [88]:
churn_df.shape

(6500, 14)

In [89]:
# One Hot Encoding for categorical variables
churn_df = pd.get_dummies(churn_df, columns=['telecom_partner', 'gender', 'state', 'city'])

In [90]:
features = churn_df.drop(['customer_id', 'churn','registration_event','pincode'], axis=1)

In [91]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 46 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   age                           6500 non-null   int64
 1   num_dependents                6500 non-null   int64
 2   estimated_salary              6500 non-null   int64
 3   calls_made                    6500 non-null   int64
 4   sms_sent                      6500 non-null   int64
 5   data_used                     6500 non-null   int64
 6   telecom_partner_Airtel        6500 non-null   bool 
 7   telecom_partner_BSNL          6500 non-null   bool 
 8   telecom_partner_Reliance Jio  6500 non-null   bool 
 9   telecom_partner_Vodafone      6500 non-null   bool 
 10  gender_F                      6500 non-null   bool 
 11  gender_M                      6500 non-null   bool 
 12  state_Andhra Pradesh          6500 non-null   bool 
 13  state_Arunachal Pradesh       650

In [92]:
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [93]:
# Target variable
target = churn_df['churn']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Instantiate the Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

# Logistic Regression predictions
logreg_pred = logreg.predict(X_test)

# Logistic Regression evaluation
print(confusion_matrix(y_test, logreg_pred))
print(classification_report(y_test, logreg_pred))
print(f"Logistic Regrssion Accuracy: {accuracy_score(y_test, logreg_pred)*100}")

[[1027    0]
 [ 273    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.40      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300

Logistic Regrssion Accuracy: 79.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [94]:
# Instantiate the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Random Forest predictions
rf_pred = rf.predict(X_test)

# Random Forest evaluation
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred)*100}")

[[1027    0]
 [ 273    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.40      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300

Random Forest Accuracy: 79.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [95]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=60) # You can adjust the number of neighbors
knn.fit(X_train, y_train)

# Make predictions
knn_pred = knn.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, knn_pred))
print(classification_report(y_test, knn_pred))
print(f"KNN Accuracy: {accuracy_score(y_test, knn_pred) * 100}")

[[1027    0]
 [ 273    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.40      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300

KNN Accuracy: 79.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [96]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42) # You can tune hyperparameters here

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
xgb_pred = xgb_model.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))
print(f"XGBoost Accuracy: {accuracy_score(y_test, xgb_pred) * 100}")



[[983  44]
 [264   9]]
              precision    recall  f1-score   support

           0       0.79      0.96      0.86      1027
           1       0.17      0.03      0.06       273

    accuracy                           0.76      1300
   macro avg       0.48      0.50      0.46      1300
weighted avg       0.66      0.76      0.69      1300

XGBoost Accuracy: 76.3076923076923
