In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from scipy.io import arff
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.tree import DecisionTreeClassifier

import data_processing as dp

In [12]:
# Read the ARFF file; `data[0]` is the record array that becomes the DataFrame.
data = arff.loadarff("../data/3year.arff")
arff_records = data[0]
df = pd.DataFrame(arff_records)

# Project helper that turns the raw frame into features `X` and target `y`.
# NOTE(review): the exact transformations live in data_processing.pre_process.
X, y = dp.pre_process(df)

In [8]:
# Build the train/test partitions with the project helper.
# NOTE(review): this is called on the raw `df`, not on the `X`, `y` returned by
# dp.pre_process in the previous cell — presumably get_train_test pre-processes
# internally; confirm, and confirm the label column is excluded from the features.
X_train, X_test, y_train, y_test = dp.get_train_test(df)

In [13]:
# Hold-out evaluation of a single decision tree.
# The tree is seeded so that a Restart & Run All reproduces the same model;
# 42 matches the seed already used for train_test_split in this notebook.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# NOTE(review): the recorded run scored exactly 1.0 on every metric. A perfect
# score on a hold-out set usually indicates target leakage (e.g. the label
# column still present among the features) — verify dp.get_train_test before
# trusting these numbers.
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 1.0
F1 Score: 1.0
Recall Score: 1.0
Confusion Matrix:
 [[2003    0]
 [   0   98]]


In [13]:
# k-fold cross-validation of a decision tree on the full pre-processed data.
# cross_val_score / cross_val_predict are imported in the notebook's import cell.
N_FOLDS = 5

# Seeded so that repeated runs build the same trees and report the same scores.
dt = DecisionTreeClassifier(random_state=42)

# Per-fold accuracy and its mean.
scores = cross_val_score(dt, X, y, cv=N_FOLDS, scoring="accuracy")
print("Accuracy scores for each fold:", scores)

mean_accuracy = scores.mean()
print("Mean accuracy:", mean_accuracy)

# Out-of-fold predictions for every sample, scored once over the whole dataset.
y_pred_cv = cross_val_predict(dt, X, y, cv=N_FOLDS)

# These rebind `f1`, `recall` and `accuracy`, replacing any values computed
# under the same names earlier in the session.
f1 = f1_score(y, y_pred_cv)
recall = recall_score(y, y_pred_cv)
accuracy = accuracy_score(y, y_pred_cv)

# NOTE(review): the recorded run reports 1.0 for every fold and every metric,
# which usually points to target leakage in `X` — verify dp.pre_process.
print("F1 Score:", f1)
print("Recall Score:", recall)
print("Accuracy:", accuracy)

Accuracy scores for each fold: [1. 1. 1. 1. 1.]
Mean accuracy: 1.0
F1 Score: 1.0
Recall Score: 1.0
Accuracy: 1.0


In [9]:
# Features/target from the project's null-removal helper, then an 80/20
# seeded split.
# NOTE(review): what df_null_removal does beyond its name is not visible here.
x_unp, y_unp = dp.df_null_removal(df)
unp_split = train_test_split(x_unp, y_unp, test_size=0.2, random_state=42)
X_train_unp, X_test_unp, y_train_unp, y_test_unp = unp_split

In [14]:
# Fit and score a decision tree on the null-removed split.
# A fresh, seeded classifier is created here so the cell does not depend on
# whichever `dt` another cell last left in the kernel; it is bound to the same
# name so `dt` still ends up fitted on the null-removed training data.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_unp, y_train_unp)
y_pred_unp = dt.predict(X_test_unp)

# Use the directly imported accuracy_score, consistent with the other cells.
accuracy_unp = accuracy_score(y_test_unp, y_pred_unp)
# NOTE(review): the recorded run printed 1.0 here as well — check for the label
# column leaking into `x_unp`.
print("Accuracy: ", accuracy_unp)

Accuracy:  1.0
