In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import numpy as np

In [20]:
numbers = pd.read_csv('../GetPhoneInfo/wrong_and_connect.csv')
numbers.head()
numbers3 = numbers.drop(columns=['First Name Mismatch'])

In [21]:
X = numbers3.drop(columns = ['OFFICE_TELEPHONE', 'label'])
y = numbers3['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 66, stratify = y)

In [8]:
# random forest model creation
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
# predictions
rfc_predict = rfc.predict(X_test)



In [10]:
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')

In [12]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict, labels=['wrong', 'connected']))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[28 19]
 [ 8 40]]


=== Classification Report ===
              precision    recall  f1-score   support

   connected       0.68      0.83      0.75        48
       wrong       0.78      0.60      0.67        47

   micro avg       0.72      0.72      0.72        95
   macro avg       0.73      0.71      0.71        95
weighted avg       0.73      0.72      0.71        95



=== All AUC Scores ===
[0.7465374  0.60803324 0.81024931 0.5900277  0.6634349  0.69252078
 0.55124654 0.89196676 0.7867036  0.82253086]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.716325108580418


In [22]:
from sklearn.model_selection import RandomizedSearchCV
# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)
# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }
# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the model
rfc_random.fit(X_train, y_train)
# print results
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.2min finished


{'n_estimators': 1400, 'max_features': 'sqrt', 'max_depth': 260}


In [23]:
rfc = RandomForestClassifier(n_estimators=1400, max_depth=260, max_features='sqrt')
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict, labels=['wrong', 'connected']))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[28 19]
 [ 8 40]]


=== Classification Report ===
              precision    recall  f1-score   support

   connected       0.68      0.83      0.75        48
       wrong       0.78      0.60      0.67        47

   micro avg       0.72      0.72      0.72        95
   macro avg       0.73      0.71      0.71        95
weighted avg       0.73      0.72      0.71        95



=== All AUC Scores ===
[0.80193906 0.63296399 0.82825485 0.61634349 0.68559557 0.72853186
 0.55540166 0.88088643 0.79224377 0.80246914]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.7324629800622413
