In [1]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
from python_scripts import csv_processing as dp
#import dtreeviz as dt
import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, recall_score, \
        confusion_matrix, classification_report, precision_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("../data/3year.csv")
X_train, X_test, y_train, y_test = dp.pre_process(df)

In [3]:
rdt = RandomForestClassifier(n_estimators=100, max_depth=5)

rdt.fit(X_train, y_train)
y_pred = rdt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Accuracy: 0.7143763884481117
F1 Score: 0.1727941176470588
Recall Score: 0.7014925373134329
Confusion Matrix:
 [[2157  860]
 [  40   94]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.71      0.83      3017
           1       0.10      0.70      0.17       134

    accuracy                           0.71      3151
   macro avg       0.54      0.71      0.50      3151
weighted avg       0.94      0.71      0.80      3151



In [4]:
rdt = RandomForestClassifier(n_estimators=100, max_depth=5)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
#scores = cross_val_score(rdt, X_train, y_train, cv=kf, scoring=scoring)

print("Cross Validation Scores:")
for metric in scoring:
    scores = cross_val_score(rdt, X_train, y_train, cv=kf, scoring=metric)
    print(f"{metric}: {np.mean(scores)}")


Cross Validation Scores:
accuracy: 0.7755768654940228
f1: 0.7922982170853927
recall: 0.8516668051323313
precision: 0.7418519433922264
roc_auc: 0.8534924465566391


In [5]:
# Define the model
rdt = RandomForestClassifier()

# Define the hyperparameters and their values for grid search
param_grid = {
    'n_estimators': [100, 200, 300],  # Example values
    'max_depth': [5, 10, 15, None],   # Added None to consider unlimited depth
    'min_samples_split': [2, 5, 10],  # Example values
    'min_samples_leaf': [1, 2, 4],    # Example values
    'max_features': ['auto', 'sqrt']  # Example values
}

# Initialize the GridSearchCV object with 'f1' scoring
grid_search = GridSearchCV(estimator=rdt, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best F1 score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Use the best estimator to make predictions
y_pred = grid_search.predict(X_test)

# Evaluate the model with the selected metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best F1 Score: 0.963407302851375
Accuracy: 0.9251031418597271
F1 Score: 0.2027027027027027
Recall Score: 0.22388059701492538
Confusion Matrix:
 [[2885  132]
 [ 104   30]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      3017
           1       0.19      0.22      0.20       134

    accuracy                           0.93      3151
   macro avg       0.58      0.59      0.58      3151
weighted avg       0.93      0.93      0.93      3151



In [7]:
# Define the model
rdt = RandomForestClassifier()

# Define the hyperparameters and their values for grid search
param_grid = {
    'n_estimators': [200],  # Example values
    'max_depth': [5, 10, 15,20,40],   # Added None to consider unlimited depth
    'min_samples_split': [2],  # Example values
    'min_samples_leaf': [1],    # Example values
    'max_features': ['sqrt']  # Example values
}

# Initialize the GridSearchCV object with 'f1' scoring
grid_search = GridSearchCV(estimator=rdt, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best F1 score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Use the best estimator to make predictions
y_pred = grid_search.predict(X_test)

# Evaluate the model with the selected metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best F1 Score: 0.9634809154505923
Accuracy: 0.9235163440177722
F1 Score: 0.17747440273037543
Recall Score: 0.19402985074626866
Confusion Matrix:
 [[2884  133]
 [ 108   26]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      3017
           1       0.16      0.19      0.18       134

    accuracy                           0.92      3151
   macro avg       0.56      0.57      0.57      3151
weighted avg       0.93      0.92      0.93      3151



In [9]:
# Define the model
rdt = RandomForestClassifier()

# Define the hyperparameters and their values for grid search
param_grid = {
    'n_estimators': [200],  # Example values
    'max_depth': [None],   # Added None to consider unlimited depth
    'min_samples_split': [2],  # Example values
    'min_samples_leaf': [1],    # Example values
    'max_features': ['sqrt']  # Example values
}

# Initialize the GridSearchCV object with 'f1' scoring
grid_search = GridSearchCV(estimator=rdt, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best F1 score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Use the best estimator to make predictions
y_pred = grid_search.predict(X_test)

# Evaluate the model with the selected metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best F1 Score: 0.9622852782619022
Accuracy: 0.9247857822913361
F1 Score: 0.20202020202020202
Recall Score: 0.22388059701492538
Confusion Matrix:
 [[2884  133]
 [ 104   30]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96      3017
           1       0.18      0.22      0.20       134

    accuracy                           0.92      3151
   macro avg       0.57      0.59      0.58      3151
weighted avg       0.93      0.92      0.93      3151



In [10]:
# Assuming X_train, y_train, X_test, y_test are predefined

# Start with no features for forward selection, or all features for backward elimination
selected_features = []  # For forward selection
# selected_features = list(X_train.columns)  # For backward elimination

best_f1 = 0  # Keep track of the best performance
features = list(X_train.columns)

# Forward Selection
for _ in range(len(features)):
    f1_scores = []
    for feature in features:
        if feature not in selected_features:
            # Add the feature temporarily
            temp_features = selected_features + [feature]
            # Train model and predict
            rdt = RandomForestClassifier(n_estimators=200, )
            rdt.fit(X_train[temp_features], y_train)
            y_pred = rdt.predict(X_test[temp_features])
            # Evaluate and store F1 score
            f1 = f1_score(y_test, y_pred)
            f1_scores.append((feature, f1))
    
    # Find the best feature of this round
    f1_scores.sort(key=lambda x: x[1], reverse=True)
    best_feature, best_feature_f1 = f1_scores[0]

    # If performance improved, update the model
    if best_feature_f1 > best_f1:
        print(f"Adding {best_feature} improved F1 to {best_feature_f1}")
        best_f1 = best_feature_f1
        selected_features.append(best_feature)
    else:
        break  # Stop if no improvement

# Your final set of features
print("Selected features:", selected_features)


Adding Attr24 improved F1 to 0.14074074074074075
Adding Attr48 improved F1 to 0.1588628762541806
Adding Attr13 improved F1 to 0.1632996632996633
Adding Attr32 improved F1 to 0.1661129568106312
Selected features: ['Attr24', 'Attr48', 'Attr13', 'Attr32']
