In [190]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Location of the raw telecom dataset; a relative path, so it is resolved
# against the notebook's working directory.
file_path = 'telecom.csv'

# Parse the CSV into a DataFrame that the later cells build on.
telecom_data = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the parsed columns.
telecom_data.head(n=5)

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [191]:
# Using all fields except state, phone number, and area code as independent variables
data_to_analyze = telecom_data.drop(['state', 'phone number', 'area code'], axis=1)

# Convert the yes/no plan flags to 0/1 with an explicit mapping.
# LabelEncoder is documented for target labels, not input features, and its
# integer codes depend on the sorted set of values that happen to be present:
# a stray value such as 'Yes' would silently become a third ordinal category.
# The explicit mapping yields the same codes for clean data (no -> 0, yes -> 1)
# and rejects anything unexpected instead of encoding it.
plan_flag_codes = {'no': 0, 'yes': 1}
for plan_column in ('international plan', 'voice mail plan'):
    plan_codes = data_to_analyze[plan_column].map(plan_flag_codes)
    unmapped = plan_codes.isna()
    if unmapped.any():
        unexpected = sorted(set(data_to_analyze.loc[unmapped, plan_column].astype(str)))
        raise ValueError('Unexpected values in %r: %s' % (plan_column, unexpected))
    data_to_analyze[plan_column] = plan_codes.astype(int)

# churn is the prediction target, which is the intended use of LabelEncoder.
# `le` is left fitted on churn, exactly as before, so le.inverse_transform can
# still recover the original labels.
le = LabelEncoder()
data_to_analyze['churn'] = le.fit_transform(data_to_analyze['churn'])

# Separate the feature matrix from the target column.
independent = data_to_analyze.drop(['churn'], axis=1)
target = data_to_analyze['churn']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; if churn is imbalanced, consider
# stratify=target (this would change every downstream metric).
ind_train, ind_test, tar_train, tar_test = train_test_split(independent, target, test_size=0.3, random_state=42)

print('Independent Training Data:\n%s\n\nIndependent Test Data:\n%s\n\nTarget Training Data:\n%s\n\nTarget Test Data:\n%s' % (ind_train.shape, ind_test.shape, tar_train.shape, tar_test.shape))

Independent Training Data:
(1214, 17)

Independent Test Data:
(521, 17)

Target Training Data:
(1214,)

Target Test Data:
(521,)


In [192]:
# Baseline classifier: logistic regression fitted on the training split.
# (classification_report / accuracy_score are imported in the top import cell.)
# NOTE(review): max_iter is presumably raised this high so the solver converges
# on the unscaled features — confirm; scaling the inputs first is the usual alternative.
model_1 = LogisticRegression(max_iter=10000)
model_1.fit(ind_train, tar_train)

# Predict the hold-out set once; every test-set metric below reuses these predictions.
tar_pred = model_1.predict(ind_test)

# Same value as model_1.score(ind_test, tar_test) (which is accuracy for
# classifiers), but without running predict on the test set a second time.
accuracy = accuracy_score(tar_test, tar_pred)
# Per-class precision/recall/F1, since overall accuracy alone can hide weak
# performance on the less frequent class.
class_report = classification_report(tar_test, tar_pred)
# 10-fold cross-validation on the training split only; the test split is not used here.
cross_val = cross_val_score(model_1, ind_train, tar_train, cv=10)

print('Logistic Regression\n\nAccuracy: %s\n\n%s\nCross Validation Scores:\n%s' % (accuracy, class_report, cross_val))

Logistic Regression

Accuracy: 0.8579654510556622

              precision    recall  f1-score   support

           0       0.88      0.97      0.92       447
           1       0.50      0.18      0.26        74

    accuracy                           0.86       521
   macro avg       0.69      0.57      0.59       521
weighted avg       0.82      0.86      0.83       521

Cross Validation Scores:
[0.86065574 0.87704918 0.89344262 0.91803279 0.90082645 0.89256198
 0.8677686  0.87603306 0.85950413 0.85123967]


In [193]:
# Change classification algorithm to Random Forest to see if accuracy improves.
# (RandomForestClassifier / classification_report / accuracy_score are imported
# in the top import cell.)
# The fixed random_state makes the forest reproducible between runs.
model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
model_2.fit(ind_train, tar_train)

# Predict the hold-out set once; every test-set metric below reuses these predictions.
tar_pred = model_2.predict(ind_test)

# Same value as model_2.score(ind_test, tar_test) (which is accuracy for
# classifiers), but without pushing the test set through all trees a second time.
accuracy = accuracy_score(tar_test, tar_pred)
# Per-class precision/recall/F1, since overall accuracy alone can hide weak
# performance on the less frequent class.
class_report = classification_report(tar_test, tar_pred)
# 10-fold cross-validation on the training split only; the test split is not used here.
cross_val = cross_val_score(model_2, ind_train, tar_train, cv=10)

print('Random Forest\n\nAccuracy: %s\n\n%s\nCross Validation Scores:\n%s' % (accuracy, class_report, cross_val))

Random Forest

Accuracy: 0.9347408829174664

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       447
           1       0.92      0.59      0.72        74

    accuracy                           0.93       521
   macro avg       0.93      0.79      0.84       521
weighted avg       0.93      0.93      0.93       521

Cross Validation Scores:
[0.95081967 0.95081967 0.93442623 0.95081967 0.95867769 0.94214876
 0.91735537 0.95041322 0.91735537 0.92561983]
