In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score ,GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [11]:
df = pd.read_csv('../data/processed_data.csv')
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7043 non-null   int64  
 1   SeniorCitizen                          7043 non-null   int64  
 2   Partner                                7043 non-null   int64  
 3   Dependents                             7043 non-null   int64  
 4   tenure                                 7043 non-null   float64
 5   PhoneService                           7043 non-null   int64  
 6   OnlineSecurity                         7043 non-null   int64  
 7   DeviceProtection                       7043 non-null   int64  
 8   TechSupport                            7043 non-null   int64  
 9   StreamingTV                            7043 non-null   int64  
 10  StreamingMovies                        7043 non-null   int64  
 11  Pape

Firs we will do a manual split on the dtaset into a 70 to 30 ratio and see the performing rates of random forest

In [12]:
X = df.drop('Churn',axis=1)
y = df['Churn']

#We will be spliting the data into training and test subdatasets
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
rf_split = RandomForestClassifier()
rf_split.fit(X_train,y_train)

#Rewieving Scores from Random tree
y_pred_rf_split = rf_split.predict(X_test)
print(accuracy_score(y_test,y_pred_rf_split))
print(confusion_matrix(y_test,y_pred_rf_split))
print(classification_report(y_test,y_pred_rf_split))

0.791292001893043
[[1399  140]
 [ 301  273]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1539
           1       0.66      0.48      0.55       574

    accuracy                           0.79      2113
   macro avg       0.74      0.69      0.71      2113
weighted avg       0.78      0.79      0.78      2113



Now we will use random tree with a randomly splitted data still using a 70:30 ratio

In [14]:
X = df.drop('Churn',axis=1)
y = df['Churn']

#We will be spliting the data into training and test subdatasets
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [15]:
rf_Rsplit = RandomForestClassifier()
rf_Rsplit.fit(X_train,y_train)

#Rewieving Scores from Random tree
y_pred_rf_Rsplit = rf_Rsplit.predict(X_test)
print(accuracy_score(y_test,y_pred_rf_Rsplit))
print(confusion_matrix(y_test,y_pred_rf_Rsplit))
print(classification_report(y_test,y_pred_rf_Rsplit))

0.7960246095598675
[[1408  131]
 [ 300  274]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.87      1539
           1       0.68      0.48      0.56       574

    accuracy                           0.80      2113
   macro avg       0.75      0.70      0.71      2113
weighted avg       0.78      0.80      0.78      2113



And here we will use random split but with stratification

In [16]:
X = df.drop('Churn',axis=1)
y = df['Churn']

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

In [17]:
rf_RsplitS = RandomForestClassifier()
rf_RsplitS.fit(X_train,y_train)

#Rewieving Scores from Random tree
y_pred_rf_RsplitS = rf_RsplitS.predict(X_test)
print(accuracy_score(y_test,y_pred_rf_RsplitS))
print(confusion_matrix(y_test,y_pred_rf_RsplitS))
print(classification_report(y_test,y_pred_rf_RsplitS))

0.7875059157595835
[[1390  162]
 [ 287  274]]
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1552
           1       0.63      0.49      0.55       561

    accuracy                           0.79      2113
   macro avg       0.73      0.69      0.71      2113
weighted avg       0.78      0.79      0.78      2113



And finaly we will try out wiht a 10 fold cross validation

In [19]:
rf_CV = RandomForestClassifier()
cv_scores = cross_val_score(rf_CV,X,y,cv=10,scoring='accuracy')
print("\nCross-Validation Results")
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")


Cross-Validation Results
Cross-validation scores: [0.79574468 0.8070922  0.79007092 0.78409091 0.77982955 0.75284091
 0.80397727 0.79971591 0.80255682 0.80397727]
Mean cross-validation score: 0.7919896437782076


Now we will proceed with hyperparameter tuning using GridSearch and a 3 fold CV

In [28]:
from scipy.stats import randint


param_grid = {
    'n_estimators': [100],  # Fewer estimators
    'max_depth': [10],  # Fewer depths
    'min_samples_split':  [5],  # Fewer options for split
    'min_samples_leaf': [1],   # Fewer leaf options
    'max_features': ['sqrt'],  # Reduce to fewer choices
    'bootstrap': [True]  # Use one option for bootstrap
}

# RandomizedSearchCV with n_iter=50 to sample 50 combinations
random_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid,  cv=3, n_jobs=-1, verbose=2, scoring='accuracy', error_score='raise')

random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters found: ", random_search.best_params_)

# Evaluate the best model on the test data
y_pred = random_search.best_estimator_.predict(X_test)
print("Best Model Performance on Test Data")
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters found:  {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best Model Performance on Test Data
0.7950780880265026
[[1400  152]
 [ 281  280]]
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1552
           1       0.65      0.50      0.56       561

    accuracy                           0.80      2113
   macro avg       0.74      0.70      0.72      2113
weighted avg       0.78      0.80      0.79      2113

