<a href="https://colab.research.google.com/github/Eman-Adly/projects-NTI/blob/main/Choose_the_best_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.decomposition import PCA

from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
data = pd.read_csv('/content/churn-bigml-20.csv')

In [3]:
data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [4]:
data.isnull().sum()

Unnamed: 0,0
State,0
Account length,0
Area code,0
International plan,0
Voice mail plan,0
Number vmail messages,0
Total day minutes,0
Total day calls,0
Total day charge,0
Total eve minutes,0


In [5]:
data.duplicated().sum()

0

In [6]:
non_numerical_colums=data.select_dtypes(exclude=["number"]).columns.tolist()
non_numerical_colums

['State', 'International plan', 'Voice mail plan', 'Churn']

In [7]:
from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
for i in non_numerical_colums:
  data[i]=LE.fit_transform(data[i])

In [8]:
y = data["Churn"]
x = data.drop("Churn",axis=1 )

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(x)

In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [11]:
pca = PCA(n_components=10)
pca.fit(x_train)

In [30]:
svc = SVC(C=1, degree=15, kernel='sigmoid', probability = True )

logReg = LogisticRegression(C=0.1, solver='saga', max_iter=3000)

RF = RandomForestClassifier(criterion='entropy', max_depth=9, max_features=9, min_samples_split=3, n_estimators=30)

DT = DecisionTreeClassifier(max_depth=4, max_features=30, min_samples_split=5)

ada_model = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)
gd_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=1.0, eval_metric="logloss" )

NB = BernoulliNB(alpha=0.05)

VC = VotingClassifier(estimators=[('LogisticRegression', logReg), ('DecisionTreeClassifier', DT),
          ('RandomForestClassifier', RF),('BernoulliNB' , NB), ('SVC' , svc)], voting='soft')


In [31]:
Models = {'LogisticRegression':logReg,
          'DecisionTreeClassifier':DT,
          'RandomForestClassifier': RF,
          'BernoulliNB' : NB,
          'SVC' : svc,
          'VotingClassifier' : VC,
          'AdaBoostClassifier' : ada_model,
          'GradientBoostingClassifier' : gd_model,
          'XGBClassifier' : xgb_model
         }

In [33]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, zero_one_loss, recall_score, precision_score, classification_report

best_model = None
best_score = 0.0

for model in Models.keys():
    # Model Fitting
    Models[model].fit(x_train, y_train)
    print(str(model).center(30))

    # Prediction
    y_pred = Models[model].predict(x_test)

    # Metrics
    CM = confusion_matrix(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, average='micro')
    accuracy = accuracy_score(y_test, y_pred, normalize=False)
    loss = zero_one_loss(y_test, y_pred, normalize=False)
    recall = recall_score(y_test, y_pred, average='micro')
    precision = precision_score(y_test, y_pred, average='micro')

    # Classification Report
    report = classification_report(y_test, y_pred)

    # Score
    print('Training Score:', Models[model].score(x_train, y_train))
    print('Testing Score:', Models[model].score(x_test, y_test))

    # Update best model
    # if best_score < F1:
    #     best_score = F1
    #     best_model = model

    if best_score < recall:
        best_score = recall
        best_model = model

    # Print metrics
    print()
    print('Confusion Matrix\n', CM)
    print('Classification Report\n', report)  # Print classification report
    print('______________________________________________________________________________')


      LogisticRegression      
Training Score: 0.8686679174484052
Testing Score: 0.9104477611940298

Confusion Matrix
 [[119   0]
 [ 12   3]]
Classification Report
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       119
           1       1.00      0.20      0.33        15

    accuracy                           0.91       134
   macro avg       0.95      0.60      0.64       134
weighted avg       0.92      0.91      0.88       134

______________________________________________________________________________
    DecisionTreeClassifier    
Training Score: 0.9474671669793621
Testing Score: 0.9477611940298507

Confusion Matrix
 [[116   3]
 [  4  11]]
Classification Report
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       119
           1       0.79      0.73      0.76        15

    accuracy                           0.95       134
   macro avg       0.88      0.85      0.86   

In [34]:
# print('Best  Model:', best_model)
# print('Best Score:', best_score)

print('Best Model:', best_model)
print('Best Recall Score:', best_score)

Best Model: RandomForestClassifier
Best Recall Score: 0.9701492537313433
