In [1]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
import data_processing as dp

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix

In [3]:
data = arff.loadarff("../data/3year.arff")
df = pd.DataFrame(data[0])
X_train, X_test, y_train, y_test = dp.pre_process(df)

In [None]:
#X_train, X_test, y_train, y_test = dp.get_train_test(df)

In [4]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8559187559504919
F1 Score: 0.1684981684981685
Recall Score: 0.34328358208955223
Confusion Matrix:
 [[2651  366]
 [  88   46]]


In [12]:
from sklearn.model_selection import KFold

# Define the number of folds
num_folds = 5

# Initialize KFold cross-validation iterator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Create an empty list to store confusion matrices for each fold
confusion_matrices = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]
    
    # Fit the classifier on the training fold
    dt.fit(X_train_fold, y_train_fold)
    
    # Predict on the validation fold
    y_pred_fold = dt.predict(X_val_fold)
    
    # Calculate and store the confusion matrix for the fold
    cm_fold = confusion_matrix(y_val_fold, y_pred_fold)
    confusion_matrices.append(cm_fold)

# Print confusion matrices for each fold
for i, cm in enumerate(confusion_matrices):
    print(f"Confusion matrix for fold {i + 1}:")
    print(cm)
    print()


KeyError: "None of [Index([    0,     1,     2,     3,     4,     5,     6,     7,     9,    10,\n       ...\n       13967, 13968, 13969, 13970, 13971, 13972, 13973, 13974, 13975, 13977],\n      dtype='int32', length=11182)] are in the [columns]"

In [None]:
x_unp, y_unp = dp.df_null_removal(df)
X_train_unp, X_test_unp, y_train_unp, y_test_unp = train_test_split(
    x_unp, y_unp, test_size=0.2, random_state=42
)

In [None]:
dt.fit(X_train_unp, y_train_unp)
y_pred_unp = dt.predict(X_test_unp)

accuracy_unp = sklearn.metrics.accuracy_score(y_test_unp, y_pred_unp)
print("Accuracy: ", accuracy_unp)

Accuracy:  1.0
