### Importing Libraries

In [1]:
!pip install imbalanced-learn




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

### Reading Churn csv tabulated before

In [3]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [4]:
df=df.drop('Unnamed: 0',axis=1)

### Feature Selection

In [5]:
x=df.drop('Churn',axis=1)
x.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [6]:
y=df['Churn']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

### Train-Test Split (Model Building)

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

#### Decision Tree Classifier

In [8]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [9]:
model_dt.fit(x_train,y_train)

In [10]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [11]:
model_dt.score(x_test,y_test)

0.7874911158493249

In [12]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1057
           1       0.59      0.49      0.53       350

    accuracy                           0.79      1407
   macro avg       0.71      0.69      0.70      1407
weighted avg       0.78      0.79      0.78      1407



We can see because of the imbalanced datset, the minority have very bad score so we will use sampling techniques

In [13]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [14]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [15]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [16]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9435414884516681
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       524
           1       0.95      0.95      0.95       645

    accuracy                           0.94      1169
   macro avg       0.94      0.94      0.94      1169
weighted avg       0.94      0.94      0.94      1169



In [17]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[490  34]
 [ 32 613]]


Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

#### Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [20]:
model_rf.fit(x_train,y_train)

In [21]:
y_pred=model_rf.predict(x_test)

In [22]:
model_rf.score(x_test,y_test)

0.7910447761194029

In [23]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1057
           1       0.62      0.42      0.50       350

    accuracy                           0.79      1407
   macro avg       0.72      0.67      0.68      1407
weighted avg       0.77      0.79      0.78      1407



Upsampling this Classifier as well

In [24]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [25]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [26]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [27]:
model_rf_smote.fit(xr_train1,yr_train1)

In [28]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [29]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [30]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9251471825063078
              precision    recall  f1-score   support

           0       0.94      0.88      0.91       520
           1       0.91      0.96      0.94       669

    accuracy                           0.93      1189
   macro avg       0.93      0.92      0.92      1189
weighted avg       0.93      0.93      0.92      1189



In [31]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))


[[459  61]
 [ 28 641]]


We can see better results than Decision Tree Classifier

### Principal component Analysis

In [32]:
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [33]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [34]:
model.fit(xr_train_pca,yr_train1)

In [35]:
yr_predict_pca = model.predict(xr_test_pca)

In [36]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [37]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7451640033641715
              precision    recall  f1-score   support

           0       0.73      0.67      0.70       520
           1       0.76      0.81      0.78       669

    accuracy                           0.75      1189
   macro avg       0.74      0.74      0.74      1189
weighted avg       0.74      0.75      0.74      1189



With PCA, we couldn't see any better results, hence we finalise the model which was created by RF Classifier, and save the model

### Pickling the model

In [38]:
import pickle

In [39]:
filename = 'churn_model.sav'

In [40]:
pickle.dump(model_rf_smote, open(filename, 'wb'))
load_model = pickle.load(open(filename, 'rb'))
model_score_r1 = load_model.score(xr_test1, yr_test1)
model_score_r1

0.9251471825063078

Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.