In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
# Locate the dataset folder relative to the notebook's working directory.
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, 'k2_length_plain_retrans')

# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. .ipynb_checkpoints, .DS_Store). Sort the names so the
# per-file results come out in a stable order, and keep only regular,
# non-hidden files so that every entry can be handed to pd.read_csv.
files_in_folder = sorted(
    filename
    for filename in os.listdir(folder_path)
    if not filename.startswith('.')
    and os.path.isfile(os.path.join(folder_path, filename))
)

In [3]:
# Evaluate a Gaussian Naive Bayes classifier on every file in the folder.
# For each file the model is trained and scored on `num_iterations` different
# stratified 80/20 splits; the mean accuracy per file is collected in
# `accuracy_score_list`, and the mean of those means is reported at the end.

# Evaluation settings, defined once instead of being reassigned for every file.
num_iterations = 100  # Number of train/test splits per file; adjust as needed
random_seed = 42      # Base seed: split i uses random_seed + i, so reruns give identical results

if not files_in_folder:
    # Fail early with a clear message rather than a ZeroDivisionError
    # when the overall average is computed below.
    raise ValueError("No data files found in {!r}".format(folder_path))

accuracy_score_list = []
for file in files_in_folder:
    path_to_file = os.path.join(folder_path, file)
    df_k2l = pd.read_csv(path_to_file)

    # The last column is used as the label; all preceding columns are the features.
    X = df_k2l[df_k2l.columns[:-1]].values
    y = df_k2l[df_k2l.columns[-1]].values

    # Running the algorithm multiple times for each file
    avg_accuracy_per_file = 0

    for iteration in range(num_iterations):
        # Splitting the set into training and test set following an 80% - 20% split (train_size=0.8).
        # The data is shuffled (shuffle=True) with an explicit, per-iteration seed (random_state)
        # so every split differs from the others yet is reproducible across runs.
        # stratify=y keeps the class proportions of y in both the training and the test set.
        X_train_2l, X_test_2l, y_train_2l, y_test_2l = train_test_split(
            X,
            y,
            train_size=0.8,
            shuffle=True,
            stratify=y,
            random_state=random_seed + iteration,
        )

        # Gaussian Naive Bayes
        nb_model = GaussianNB()
        nb_model_2l = nb_model.fit(X_train_2l, y_train_2l)
        y_pred_2l = nb_model_2l.predict(X_test_2l)

        avg_accuracy_per_file += accuracy_score(y_test_2l, y_pred_2l, normalize=True)

    avg_accuracy_per_file /= num_iterations
    # Store a plain Python float so the printed list stays readable
    # regardless of which scalar type the metric function returns.
    accuracy_score_list.append(float(avg_accuracy_per_file))

# Calculating the overall average accuracy across all files
overall_avg_accuracy = sum(accuracy_score_list) / len(accuracy_score_list)

print("Average accuracy for each file:")
print(accuracy_score_list)
print("---------------------------------------------------")
print("Number of files processed:", len(accuracy_score_list))
print("---------------------------------------------------")
print("Overall average accuracy:", overall_avg_accuracy)

Average accuracy for each file:
[0.995, 1.0, 1.0, 0.97, 0.9775, 0.99625, 1.0, 0.98125, 0.8625, 0.9725, 0.99125, 0.98125, 0.9925, 0.96, 0.95125, 1.0, 1.0, 0.9975, 0.985, 1.0, 0.99375, 0.9975, 0.9875, 1.0, 0.99625, 0.98, 0.9425, 0.9975, 0.82625, 1.0, 0.98125, 0.99375, 0.9725, 0.97375, 0.96, 0.99875, 0.99375, 0.99875, 1.0, 0.98375, 1.0, 1.0, 1.0, 0.995, 0.98, 0.955, 0.93625, 1.0, 0.99, 1.0, 0.96625, 1.0, 1.0, 1.0, 0.97, 1.0, 0.9975, 0.995, 0.98625, 0.9975, 0.9875, 0.98875, 0.95125, 1.0, 0.95, 0.9975, 0.9975, 1.0, 0.9775, 0.94625, 0.9825, 1.0, 1.0, 0.9775, 1.0, 0.9525, 0.9625, 0.9875, 0.98, 0.99625, 0.99375, 0.9775, 0.9525, 0.9975, 0.9675, 1.0, 0.9375, 0.9525, 0.995, 0.995, 0.995, 0.98125, 0.99875, 0.9875, 0.99625, 0.99375, 0.9875, 0.9875, 0.99625, 0.9775]
---------------------------------------------------
Number of files processed: 100
---------------------------------------------------
Overall average accuracy: 0.9826500000000007
