In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import json
import pandas as pd
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the feature rows from disk.
with open('x_vector.json', 'r') as features_file:
    x_vector = json.loads(features_file.read())

# Load the labels; they are indexed in parallel with x_vector further down.
with open('y_vector.json', 'r') as labels_file:
    y_vector = json.loads(labels_file.read())


# Largest finite float32 magnitude; numeric values beyond it disqualify a row.
# Computed once here rather than once per inspected value.
FLOAT32_MAX = float(np.finfo(np.float32).max)


def _is_unusable(value):
    """Return True if ``value`` should disqualify the row that contains it.

    A value is unusable when it is the string 'nan', or a number that is NaN,
    infinite, or larger in magnitude than the float32 maximum. Every other
    value (including other non-numeric values) is accepted.
    """
    if isinstance(value, str):
        # Previously the 'nan' comparison sat behind the numeric isinstance
        # guard, so it could never match and such rows slipped through.
        return value == 'nan'
    if isinstance(value, (float, int)):
        # `value != value` is true only for NaN. The magnitude comparison also
        # catches +/-inf and stays valid for arbitrarily large Python ints.
        return value != value or abs(value) > FLOAT32_MAX
    return False


# Rows and labels are selected by shared index below, so they must line up.
if len(x_vector) != len(y_vector):
    raise ValueError(
        f"x_vector has {len(x_vector)} rows but y_vector has {len(y_vector)} labels"
    )

# Indices of the rows in which every value is usable.
filtered_indices = [
    i for i, sublist in enumerate(x_vector)
    if not any(_is_unusable(value) for value in sublist)
]

# Keep features and labels aligned by selecting both with the same indices.
x_vector_filtered = [x_vector[i] for i in filtered_indices]
y_vector_filtered = [y_vector[i] for i in filtered_indices]


In [5]:
# Hold out 20% of the filtered samples for evaluation; the fixed seed makes
# the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_vector_filtered,
    y_vector_filtered,
    test_size=0.20,
    random_state=42,
)

In [6]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
# `fit` returns the estimator itself, so construction and fitting can be chained.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = rf_clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

# Each header is followed by its metric on the next line.
print("Test Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Test Accuracy: 0.7154471544715447

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.79      0.79       164
           1       0.57      0.56      0.57        82

    accuracy                           0.72       246
   macro avg       0.68      0.68      0.68       246
weighted avg       0.71      0.72      0.71       246

Confusion Matrix:
[[130  34]
 [ 36  46]]
