In [None]:
import os
import numpy as np
import json
import pickle
import zipfile
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    precision_score,
    roc_curve,
    auc,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from google.colab import drive

In [None]:
# Mount Google Drive so the archived TF-IDF embeddings are reachable.
drive.mount('/content/drive')

# Unpack to an absolute path: the loading cell reads from
# '/content/tfidf_embeddings/...', so a relative target would only line up
# while the working directory happens to be '/content'.
with zipfile.ZipFile('/content/drive/MyDrive/CJPR/tfidf_embeddings.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/tfidf_embeddings')

Mounted at /content/drive


In [None]:
# Directory holding the extracted .npy arrays (the archive nests a folder of
# the same name inside the extraction target).
folder = '/content/tfidf_embeddings/tfidf_embeddings'

# File names listed in the same order as the variables they are unpacked into.
array_files = (
    'train_embed.npy',
    'val_embed.npy',
    'test_embed.npy',
    'train_labels.npy',
    'val_labels.npy',
    'test_labels.npy',
)

(
    train_embed,
    val_embed,
    test_embed,
    train_labels,
    val_labels,
    test_labels,
) = [np.load(os.path.join(folder, array_file)) for array_file in array_files]

In [None]:
# Candidate values for each random-forest hyperparameter explored by the
# grid search (4 * 4 * 3 * 3 = 144 combinations).
tree_counts = [10, 50, 100, 200]      # number of trees in the forest
depth_limits = [None, 10, 20, 30]     # maximum tree depth (None = unlimited)
split_minimums = [2, 5, 10]           # minimum samples required to split a node
leaf_minimums = [1, 2, 4]             # minimum samples required at a leaf

param_grid = {
    'n_estimators': tree_counts,
    'max_depth': depth_limits,
    'min_samples_split': split_minimums,
    'min_samples_leaf': leaf_minimums,
}

In [None]:
# Fix the seed so the cross-validated search and any later refit of `rf` are
# reproducible; a random forest's bootstrap and feature sub-sampling are
# otherwise different on every run.
SEED = 42
rf = RandomForestClassifier(random_state=SEED)

# Exhaustive 5-fold search ranked by F1. n_jobs=-1 spreads the independent
# fits across all available cores; it does not change which model is chosen.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)

grid_search.fit(train_embed, train_labels)

In [None]:
# Re-fit every grid-search candidate on the full training set and score it on
# the test set, keeping the cross-validation statistics next to the test
# metrics so both can be compared from the saved results.
# NOTE(review): the per-candidate test metrics are diagnostic only. Model
# selection must keep relying on the cross-validation score
# (grid_search.best_params_); picking a candidate by its test score would
# leak the test set into the choice of hyperparameters.
all_results = []

cv_results = grid_search.cv_results_
for params, mean_cv_score, std_cv_score in zip(
    cv_results['params'],
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
):
    # Train the model with the current parameters
    rf.set_params(**params)
    rf.fit(train_embed, train_labels)

    test_predictions = rf.predict(test_embed)
    # Column 1 holds the probability of the second entry of rf.classes_.
    test_probs = rf.predict_proba(test_embed)[:, 1]

    fpr, tpr, _ = roc_curve(test_labels, test_probs)
    conf_matrix = confusion_matrix(test_labels, test_predictions)

    # Plain floats / lists keep the record JSON-serialisable.
    all_results.append({
        'params': params,
        'cv_mean_score': float(mean_cv_score),
        'cv_std_score': float(std_cv_score),
        'AUC': float(auc(fpr, tpr)),
        'accuracy': float(accuracy_score(test_labels, test_predictions)),
        'f1_score': float(f1_score(test_labels, test_predictions)),
        'recall': float(recall_score(test_labels, test_predictions)),
        'precision': float(precision_score(test_labels, test_predictions)),
        'confusion_matrix': conf_matrix.tolist(),
    })

In [None]:
# Estimator that GridSearchCV refit with the best-scoring parameters.
best_rf_model = grid_search.best_estimator_

# Class-probability matrix for the test set; keep only column 1, i.e. the
# probability assigned to the second entry of best_rf_model.classes_.
test_class_probs = best_rf_model.predict_proba(test_embed)
best_rf_model_probs = test_class_probs[:, 1]

In [None]:
# Calculate the ROC curve of the best model on the test set
fpr, tpr, _ = roc_curve(test_labels, best_rf_model_probs)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve (explicit Figure/Axes instead of pyplot global state)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')

# Save the ROC curve plot as an image. Create the target directory first so a
# missing folder on Drive does not abort the notebook after the long search.
roc_curve_filename = '/content/drive/MyDrive/CJPR/Classical/Random_Forest/TFIDF/best_roc_curve_tfidf_RF.png'
os.makedirs(os.path.dirname(roc_curve_filename), exist_ok=True)
fig.savefig(roc_curve_filename)
plt.show()

result = {
    'Best AUC': float(roc_auc),
    'Best TPR': tpr.tolist(),
    'Best FPR': fpr.tolist(),
}

# Drop any summary left by a previous run of this cell before appending, so
# re-executing the cell does not pile up duplicate 'Best AUC' entries.
all_results[:] = [entry for entry in all_results if 'Best AUC' not in entry]
all_results.append(result)

In [None]:
# Confusion matrix of the best model's hard predictions on the test set.
conf_matrix = confusion_matrix(test_labels, best_rf_model.predict(test_embed))

# Draw on an explicit Axes so the heatmap and its labels target the same figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')

# Save the confusion matrix plot as an image. Create the target directory
# first so a missing folder on Drive does not raise at save time.
confusion_matrix_filename = '/content/drive/MyDrive/CJPR/Classical/Random_Forest/TFIDF/best_confusion_matrix_tfidf_RF.png'
os.makedirs(os.path.dirname(confusion_matrix_filename), exist_ok=True)
fig.savefig(confusion_matrix_filename)
plt.show()

In [None]:
# Save all results, including parameter combinations, to a JSON file
results_filename = '/content/drive/MyDrive/CJPR/Classical/Random_Forest/TFIDF/random_forest_tfidf_all_results.json'
# open(..., 'w') does not create parent folders; make sure they exist first.
os.makedirs(os.path.dirname(results_filename), exist_ok=True)
with open(results_filename, 'w') as json_file:
    json.dump(all_results, json_file, indent=4)

# Save the best parameters to a JSON file
best_params_filename = '/content/drive/MyDrive/CJPR/Classical/Random_Forest/TFIDF/best_random_forest_params_tfidf.json'
os.makedirs(os.path.dirname(best_params_filename), exist_ok=True)
with open(best_params_filename, 'w') as json_file:
    json.dump(grid_search.best_params_, json_file, indent=4)

In [None]:
# Evaluate the winning configuration on the test set.
# Use the estimator GridSearchCV already refit with the best parameters
# rather than training a fresh forest: a new fit draws new random bootstraps,
# so its metrics could differ from the model that is plotted and pickled
# elsewhere in this notebook.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

test_predictions = best_model.predict(test_embed)
# Column 1 holds the probability of the second entry of best_model.classes_.
test_probs = best_model.predict_proba(test_embed)[:, 1]
fpr, tpr, _ = roc_curve(test_labels, test_probs)
roc_auc = auc(fpr, tpr)
conf_matrix = confusion_matrix(test_labels, test_predictions)

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels, test_predictions)
f1 = f1_score(test_labels, test_predictions)
recall = recall_score(test_labels, test_predictions)
precision = precision_score(test_labels, test_predictions)

# Store results in a dictionary (plain floats / lists for JSON)
best_result = {
    'params': best_params,
    'AUC': float(roc_auc),
    'accuracy': float(accuracy),
    'f1_score': float(f1),
    'recall': float(recall),
    'precision': float(precision),
    'confusion_matrix': conf_matrix.tolist()
}

# Save the best results to a JSON file
# NOTE(review): this folder ('CJPR/RandomForest/TFIDF') differs from the
# 'CJPR/Classical/Random_Forest/TFIDF' folder used for the plots and the
# all-results JSON — confirm which location is intended. The directory is
# created if missing so the write cannot fail on a non-existent path.
best_result_filename = '/content/drive/MyDrive/CJPR/RandomForest/TFIDF/best_random_forest_result_tfidf.json'
os.makedirs(os.path.dirname(best_result_filename), exist_ok=True)
with open(best_result_filename, 'w') as json_file:
    json.dump(best_result, json_file, indent=4)

In [None]:
# Persist the best estimator found by the grid search.
# NOTE(review): this folder ('CJPR/RandomForest/TFIDF') differs from the
# 'CJPR/Classical/Random_Forest/TFIDF' folder used for the plots — confirm
# which location is intended. The directory is created if missing so the
# trained model is not lost to a FileNotFoundError at the very last step.
best_model_filename = '/content/drive/MyDrive/CJPR/RandomForest/TFIDF/best_random_forest_model_tfidf.pkl'
os.makedirs(os.path.dirname(best_model_filename), exist_ok=True)
with open(best_model_filename, 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)