In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Locate the folder holding the input CSV files, relative to the working
# directory the notebook kernel was started in.
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, 'k2_full_plain_retrans')

# os.listdir returns entries in arbitrary order and also lists sub-directories
# (e.g. Jupyter's `.ipynb_checkpoints`). Keep regular files only and sort them
# so the position of each file in later per-file result lists is reproducible.
# NOTE(review): every regular file is assumed to be a CSV dataset; add an
# extension filter if the folder can contain other files.
files_in_folder = sorted(
    filename
    for filename in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, filename))
)

In [3]:
# Evaluate Gaussian Naive Bayes on every input file: for each file, average the
# test accuracy over `num_iterations` independent stratified 80/20 splits and
# collect one averaged score per file in `accuracy_score_list`.

# Base seed for the train/test splits. Iteration i uses SEED + i, so the splits
# differ between iterations but are identical on every re-run of the notebook.
SEED = 42
# Number of random splits averaged per file.
num_iterations = 100  # Adjust as needed

accuracy_score_list = []
for file in files_in_folder:
    path_to_file = os.path.join(folder_path, file)
    df_k2f = pd.read_csv(path_to_file)

    # Every column except the last is a feature; the last column is the label.
    X = df_k2f.iloc[:, :-1].values
    y = df_k2f.iloc[:, -1].values

    # Running the algorithm multiple times for each file
    avg_accuracy_per_file = 0

    for iteration in range(num_iterations):
        # 80% / 20% train-test split (train_size=0.8), shuffled (shuffle=True)
        # with a fixed per-iteration seed (random_state) for reproducibility.
        # stratify=y keeps the label proportions of the full dataset in both
        # the training and the test subset.
        X_train_2l, X_test_2l, y_train_2l, y_test_2l = train_test_split(
            X,
            y,
            train_size=0.8,
            shuffle=True,
            stratify=y,
            random_state=SEED + iteration,
        )

        # Gaussian Naive Bayes: a fresh, unfitted model for every split.
        nb_model = GaussianNB()
        nb_model_2l = nb_model.fit(X_train_2l, y_train_2l)
        y_pred_2l = nb_model_2l.predict(X_test_2l)

        # normalize=True yields the fraction of correctly classified samples.
        avg_accuracy_per_file += accuracy_score(y_test_2l, y_pred_2l, normalize=True)

    avg_accuracy_per_file /= num_iterations
    accuracy_score_list.append(avg_accuracy_per_file)

# Calculating the overall average accuracy across all files.
# If no file was processed the list is empty; report NaN instead of failing
# with ZeroDivisionError so the summary below is still printed.
num_files_processed = len(accuracy_score_list)
if num_files_processed:
    overall_avg_accuracy = sum(accuracy_score_list) / num_files_processed
else:
    overall_avg_accuracy = float("nan")

# Summary report: per-file averages, file count, and the overall mean.
print("Average accuracy for each file:")
print(accuracy_score_list)
print("---------------------------------------------------")
print("Number of files processed:", num_files_processed)
print("---------------------------------------------------")
print("Overall average accuracy:", overall_avg_accuracy)

Average accuracy for each file:
[0.88875, 0.92875, 1.0, 0.93875, 0.9175, 0.99375, 0.92, 0.8775, 0.98375, 0.8875, 0.98, 0.92625, 1.0, 0.875, 0.97375, 0.95, 1.0, 0.9175, 0.55125, 1.0, 1.0, 1.0, 0.955, 0.93625, 0.9825, 0.94875, 0.9425, 0.89625, 0.51125, 0.94625, 0.9375, 0.98, 0.6125, 0.85375, 0.58375, 0.95875, 0.89625, 0.9175, 0.9675, 1.0, 1.0, 1.0, 0.83375, 1.0, 0.77375, 0.72125, 0.5275, 0.92125, 0.92, 0.94625, 1.0, 0.80375, 0.9775, 1.0, 0.93875, 0.92375, 0.895, 0.9625, 0.64625, 1.0, 0.9425, 1.0, 0.945, 1.0, 0.98, 0.84875, 1.0, 1.0, 0.645, 0.965, 0.87875, 0.95125, 0.95125, 1.0, 1.0, 0.60625, 0.93875, 0.69875, 0.92125, 0.95125, 1.0, 0.95875, 1.0, 0.975, 0.8375, 0.95, 1.0, 0.98875, 1.0, 0.86, 0.90375, 0.975, 0.95875, 1.0, 0.9825, 1.0, 0.6125, 0.87625, 1.0, 0.97]
---------------------------------------------------
Number of files processed: 100
---------------------------------------------------
Overall average accuracy: 0.912
