In [26]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
import data_processing as dp
import warnings

warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

In [27]:
data = arff.loadarff("../data/3year.arff")
df = pd.DataFrame(data[0])
X_train, X_test, y_train, y_test = dp.pre_process(df)

In [28]:
# X_train, X_test, y_train, y_test = dp.get_train_test(df)

In [29]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report)

Accuracy: 0.8508410028562361
F1 Score: 0.14855072463768115
Recall Score: 0.30597014925373134
Confusion Matrix:
 [[2640  377]
 [  93   41]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.88      0.92      3017
           1       0.10      0.31      0.15       134

    accuracy                           0.85      3151
   macro avg       0.53      0.59      0.53      3151
weighted avg       0.93      0.85      0.89      3151



In [30]:
# Reset the index of the DataFrame
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
num_folds = 5

# Initialize KFold cross-validation iterator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Create an empty list to store confusion matrices for each fold
confusion_matrices = []
reports = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Fit the classifier on the training fold
    dt.fit(X_train_fold, y_train_fold)

    # Predict on the validation fold
    y_pred_fold = dt.predict(X_val_fold)

    # Calculate and store the confusion matrix for the fold
    cm_fold = confusion_matrix(y_val_fold, y_pred_fold)
    confusion_matrices.append(cm_fold)
    report = classification_report(y_val_fold, y_pred_fold)

    reports.append(report)

# Print confusion matrices for each fold
""" for i, cm in enumerate(confusion_matrices):
    print(f"Confusion matrix for fold {i + 1}:")
    print(cm)
    print() """
print(len(reports))
for i, r in enumerate(reports):
    print(f"Classification report for fold {i + 1}:")
    print(r)
    print()

# hey

TypeError: 'str' object is not callable