In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle

In [2]:
dataset_name = "prostate_cancer"
#dataset_name = "animal_faces"
#model_from = "task1"
model_from = "pretrained"

In [3]:
dataset_url = "/content/drive/MyDrive/MLProject/Codes/Task2/Features/features_labels_"+model_from+"_model_"+dataset_name+".csv"
dataset = pd.read_csv(dataset_url)

In [4]:
print(dataset.shape)
print(dataset.head)

(6000, 513)
<bound method NDFrame.head of              0         1         2         3         4         5         6  \
0     1.438999  0.078806  1.141035  1.938414  0.488909  0.426002  0.445668   
1     1.546778  0.052872  0.288157  0.307745  2.364518  0.513626  0.104222   
2     0.585797  0.199779  0.008568  2.106367  0.872578  0.000000  0.083597   
3     0.608120  0.112737  0.016985  0.914887  1.424414  0.166883  0.160807   
4     0.502600  0.051433  0.294563  0.527870  1.450859  0.105591  0.148526   
...        ...       ...       ...       ...       ...       ...       ...   
5995  0.122367  0.072377  0.385997  0.924579  0.890233  0.001040  0.265616   
5996  0.208139  0.314666  0.006673  0.770476  1.734351  0.201996  0.008273   
5997  1.909944  0.328913  0.238146  1.136451  1.832376  1.241467  0.187652   
5998  0.933171  0.057718  0.000000  0.788418  0.719934  0.530506  0.048958   
5999  1.210402  0.024202  0.475783  1.112730  0.748531  0.589873  0.116697   

             7       

In [5]:
# Separate features and labels
X = dataset.iloc[:, :-1]  # Features (all columns except the last one)
y = dataset.iloc[:, -1]   # Labels (last column)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
rf_classifier = RandomForestClassifier()

In [8]:
# Define hyperparameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

In [9]:
grid_search = GridSearchCV(rf_classifier, param_grid, cv=10, scoring='accuracy',return_train_score=True)
grid_search.fit(X_train, y_train)

In [10]:
train_accuracy = grid_search.cv_results_['mean_train_score']
test_accuracy = grid_search.cv_results_['mean_test_score']

In [11]:
grid_search_cv_results = pd.DataFrame(columns=['hyperparameters', 'train_accuracy', 'test_accuracy'])

In [14]:
for params, train_acc, test_acc in zip(grid_search.cv_results_['params'], train_accuracy, test_accuracy):
    print(f"Model with Hyperparameters: {params}")
    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Testing Accuracy: {test_acc:.4f}\n")
    new_row = pd.Series({'hyperparameters': params, 'train_accuracy': train_acc, 'test_accuracy': test_acc})
    grid_search_cv_results = grid_search_cv_results.add(new_row)

Model with Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Training Accuracy: 1.0000
Testing Accuracy: 0.9519

Model with Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Training Accuracy: 1.0000
Testing Accuracy: 0.9564

Model with Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 150}
Training Accuracy: 1.0000
Testing Accuracy: 0.9583

Model with Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Training Accuracy: 1.0000
Testing Accuracy: 0.9562

Model with Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Training Accuracy: 1.0000
Testing Accuracy: 0.9579

Model with Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}
Training Accuracy: 1.0000
Testing Accuracy: 0.9581

Model with Hyperparameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Training Accuracy: 0.9998
Testing Accur

In [None]:
grid_search_cv_results.to_csv("/content/drive/MyDrive/MLProject/Codes/Task2/Results/grid_search_cv_results_RF_"+model_from+"_model_"+dataset_name+".csv",index=False)

In [None]:
with open('/content/drive/MyDrive/MLProject/Codes/Task2/Results/grid_search_results_RF_'+model_from+'_model_'+dataset_name+'.txt', 'w') as file:
    file.write("Hyperparameters\tTrain_Accuracy\tTest_Accuracy\n")
    for params, train_acc, test_acc in zip(grid_search.cv_results_['params'], train_accuracy, test_accuracy):
        file.write(f"{params}\t{train_acc:.4f}\t{test_acc:.4f}\n")

In [15]:
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 150}


In [17]:
best_model = grid_search.best_estimator_
pickle.dump(best_model, open('/content/drive/MyDrive/MLProject/Codes/Task2/Models/RF_best_model_config_'+model_from+'_model_'+dataset_name+'.sav', 'wb'))

In [None]:
#loaded_model = pickle.load(open('/content/drive/MyDrive/MLProject/Codes/Task2/Models/RF_best_model_config_'+model_from+'_model_'+dataset_name+'.sav', 'rb'))

In [18]:
print(best_model)

RandomForestClassifier(min_samples_split=10, n_estimators=150)


In [19]:
# Assess the performance on training data
y_train_pred = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_conf_matrix = confusion_matrix(y_train, y_train_pred)
train_classification_rep = classification_report(y_train, y_train_pred)

# Print training results
print("Training Data Results:")
print("Accuracy:", train_accuracy)
print("Confusion Matrix:")
print(train_conf_matrix)
print("Classification Report:")
print(train_classification_rep)

Training Data Results:
Accuracy: 0.9997619047619047
Confusion Matrix:
[[1407    0    0]
 [   1 1388    0]
 [   0    0 1404]]
Classification Report:
              precision    recall  f1-score   support

       gland       1.00      1.00      1.00      1407
    nongland       1.00      1.00      1.00      1389
       tumor       1.00      1.00      1.00      1404

    accuracy                           1.00      4200
   macro avg       1.00      1.00      1.00      4200
weighted avg       1.00      1.00      1.00      4200



In [None]:
with open('/content/drive/MyDrive/MLProject/Codes/Task2/Results/training_results_Best_RF_'+model_from+'_model_'+dataset_name+'.txt', 'w') as file:
    file.write("Training Data Results:\n")
    file.write(f"Accuracy: {train_accuracy}\n")
    file.write("Confusion Matrix:\n")
    file.write(f"{train_conf_matrix}\n")
    file.write("Classification Report:\n")
    file.write(f"{train_classification_rep}\n")

In [20]:
# Assess the performance on testing data
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_rep = classification_report(y_test, y_test_pred)

# Print testing results
print("\nTesting Data Results:")
print("Accuracy:", test_accuracy)
print("Confusion Matrix:")
print(test_conf_matrix)
print("Classification Report:")
print(test_classification_rep)


Testing Data Results:
Accuracy: 0.9533333333333334
Confusion Matrix:
[[570  18   5]
 [ 42 569   0]
 [ 19   0 577]]
Classification Report:
              precision    recall  f1-score   support

       gland       0.90      0.96      0.93       593
    nongland       0.97      0.93      0.95       611
       tumor       0.99      0.97      0.98       596

    accuracy                           0.95      1800
   macro avg       0.95      0.95      0.95      1800
weighted avg       0.95      0.95      0.95      1800



In [None]:
with open('/content/drive/MyDrive/MLProject/Data/Task2/Results/testing_results_Best_RF_'+model_from+'_model_'+dataset_name+'.txt', 'w') as file:
    file.write("Testing Data Results:\n")
    file.write(f"Accuracy: {test_accuracy}\n")
    file.write("Confusion Matrix:\n")
    file.write(f"{test_conf_matrix}\n")
    file.write("Classification Report:\n")
    file.write(f"{test_classification_rep}\n")