In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, 'k10_full_plain_retrans')
files_in_folder = [filename for filename in os.listdir(folder_path)]

In [3]:
accuracy_score_list = []
for file in files_in_folder:
    path_to_file = os.path.join(folder_path, file)
    df_k2l = pd.read_csv(path_to_file)
    
    X = df_k2l[df_k2l.columns[:-1]].values
    y = df_k2l[df_k2l.columns[-1]].values

    # Running the algorithm multiple times for each file
    avg_accuracy_per_file = 0
    num_iterations = 100  # Adjust as needed
    
    for _ in range(num_iterations):
        # Splitting the set into training and test set following an 80% - 20% split (train_size=0.8)
        # Using seed (random state), shuffle the dataset (shuffle=True) and drawing equal number of samples for each label (stratify=y)
        X_train_2l, X_test_2l, y_train_2l, y_test_2l = train_test_split(X, y, train_size=0.8, shuffle=True, stratify=y)
        
        # Gaussian Naive Bayes
        nb_model = GaussianNB()
        nb_model_2l = nb_model.fit(X_train_2l, y_train_2l)
        y_pred_2l = nb_model_2l.predict(X_test_2l)
        
        avg_accuracy_per_file += accuracy_score(y_test_2l, y_pred_2l, normalize=True)
    
    avg_accuracy_per_file /= num_iterations
    accuracy_score_list.append(avg_accuracy_per_file)

# Calculating the overall average accuracy across all files
overall_avg_accuracy = sum(accuracy_score_list) / len(accuracy_score_list)

print("Average accuracy for each file:")
print(accuracy_score_list)
print("---------------------------------------------------")
print("Number of files processed:", len(accuracy_score_list))
print("---------------------------------------------------")
print("Overall average accuracy:", overall_avg_accuracy)

Average accuracy for each file:
[0.5302500000000001, 0.4682500000000001, 0.4660000000000001, 0.5990000000000002, 0.44900000000000007, 0.7225, 0.586, 0.42999999999999994, 0.43174999999999997, 0.6304999999999998, 0.6280000000000001, 0.7152500000000002, 0.5707499999999999, 0.46125, 0.33899999999999997, 0.49725000000000025, 0.48950000000000027, 0.5187499999999999, 0.5307500000000003, 0.47350000000000014]
---------------------------------------------------
Number of files processed: 20
---------------------------------------------------
Overall average accuracy: 0.5268625000000001
