In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import LabelEncoder
from io import StringIO
from sklearn.model_selection import KFold
from oldslidingWindow import read_data, segment_data_by_day, sliding_window

REAL_DATA = '../Processed Data/Aruba_17/processed_data.csv'
FAKE_DATA = '../Predictions/Aruba_17_prediction_419.txt'
MAX_ROWS = 10000  # only the first MAX_ROWS data rows of each file are used


def _load_numeric_csv(path, skip_header_rows=0, max_rows=MAX_ROWS):
    """Load the leading rows of a comma-separated, all-numeric file.

    Parameters
    ----------
    path : str
        Location of the file to read.
    skip_header_rows : int, default 0
        Number of lines at the top of the file to discard before parsing.
    max_rows : int or None, default MAX_ROWS
        Maximum number of data rows to parse; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        Frame with integer column labels (no header row) and float columns.
    """
    # Let pandas stream the file directly and stop after `max_rows` rows
    # instead of materialising the whole file as a string first.
    return pd.read_csv(
        path,
        delimiter=',',
        header=None,
        dtype=float,
        skiprows=skip_header_rows,
        nrows=max_rows,
    )


def _split_features_labels(frame):
    """Return ``(features, labels)`` NumPy arrays built from ``frame``.

    Features are the first six columns. Each label is the string form of
    columns 4 onward joined with ``'_'``.

    NOTE(review): columns 4 and 5 end up in both the features and the labels,
    which looks like label leakage -- confirm the intended column split.
    """
    features = frame.iloc[:, :6].to_numpy()
    labels = frame.iloc[:, 4:].astype(str).apply(lambda row: '_'.join(row), axis=1).to_numpy()
    return features, labels


# The real data file has one leading line that is dropped before parsing;
# the prediction file is parsed from its first line.
data_df = _load_numeric_csv(REAL_DATA, skip_header_rows=1)
fake_data_df = _load_numeric_csv(FAKE_DATA)

# Day-wise segmentation followed by overlapping windows, using the project's
# oldslidingWindow helpers. The resulting windows are not used by the
# evaluation code visible in this notebook.
daily_segments_real = segment_data_by_day(data_df)
daily_segments_fake = segment_data_by_day(fake_data_df)
window_size = 7816
overlap_ratio = 0.2
windows_real = sliding_window(daily_segments_real, window_size=window_size, overlap_ratio=overlap_ratio)
windows_fake = sliding_window(daily_segments_fake, window_size=window_size, overlap_ratio=overlap_ratio)

# Feature matrices and composite string labels for both datasets.
data, labels = _split_features_labels(data_df)
fake_data, fake_labels = _split_features_labels(fake_data_df)


def evaluate_classifier(data, labels, cv=None, random_state=None):
    """Cross-validate a random forest and report pooled classification metrics.

    A fresh ``RandomForestClassifier`` (50 trees, ``min_samples_leaf=20``) is
    fitted on every training split. The prediction for each held-out sample is
    stored, and the metrics are computed once over all held-out predictions.

    Scoring each fold separately and averaging does not work with
    leave-one-out: a one-sample fold can only score 0 or 1, so every averaged
    metric (including "balanced accuracy") collapses to plain accuracy.
    Pooling the predictions first keeps the metrics meaningful.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix.
    labels : array-like of shape (n_samples,)
        Class labels; they are integer-encoded internally with ``LabelEncoder``.
    cv : cross-validation splitter, optional
        Any object with a ``split(X)`` method yielding ``(train, test)`` index
        arrays, e.g. ``KFold(n_splits=5, shuffle=True, random_state=42)``.
        Defaults to ``LeaveOneOut()``, which fits one forest per sample and is
        therefore very slow on large inputs. If a sample appears in several
        test folds, its last prediction is the one that is scored.
    random_state : int or None, optional
        Seed forwarded to every forest for reproducible results. ``None``
        (the default) leaves the forests unseeded.

    Returns
    -------
    mean_report : pandas.Series
        Entries ``weighted_precision``, ``weighted_recall`` and
        ``weighted_fscore`` computed on the pooled held-out predictions.
    mean_balanced_accuracy : float
        Balanced accuracy on the pooled held-out predictions.
    """
    features = np.asarray(data)
    encoded_labels = LabelEncoder().fit_transform(labels)
    splitter = LeaveOneOut() if cv is None else cv

    # Held-out prediction per sample, plus a mask of which samples were ever
    # part of a test fold (splitters are not required to cover every sample).
    held_out_pred = np.empty_like(encoded_labels)
    was_scored = np.zeros(len(encoded_labels), dtype=bool)

    for train_index, test_index in splitter.split(features):
        clf = RandomForestClassifier(
            n_jobs=-1,
            n_estimators=50,
            min_samples_leaf=20,
            random_state=random_state,
        )
        clf.fit(features[train_index], encoded_labels[train_index])
        held_out_pred[test_index] = clf.predict(features[test_index])
        was_scored[test_index] = True

    y_true = encoded_labels[was_scored]
    y_pred = held_out_pred[was_scored]

    # zero_division=0 yields the same value as the default for classes that
    # are never predicted, without emitting a warning.
    weighted_precision, weighted_recall, weighted_fscore, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    mean_report = pd.Series({
        "weighted_precision": weighted_precision,
        "weighted_recall": weighted_recall,
        "weighted_fscore": weighted_fscore,
    })
    mean_balanced_accuracy = balanced_accuracy_score(y_true, y_pred)

    return mean_report, mean_balanced_accuracy

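# Alternative (disabled): 5-fold KFold variant of evaluate_classifier, kept for reference.
# Unlike the active version, it builds the forest without min_samples_leaf=20.
# def evaluate_classifier(data, labels):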
#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     label_encoder = LabelEncoder()
#     encoded_labels = label_encoder.fit_transform(labels)
    
#     all_reports = []
#     balanced_accuracies = []

#     for train_index, test_index in kf.split(data):
#         X_train, X_test = data[train_index], data[test_index]
#         y_train, y_test = encoded_labels[train_index], encoded_labels[test_index]

#         clf = RandomForestClassifier(n_jobs=-1, n_estimators=50)
#         clf.fit(X_train, y_train)

#         y_pred = clf.predict(X_test)
#         weighted_precision, weighted_recall, weighted_fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
#         bal_acc = balanced_accuracy_score(y_test, y_pred)
#         report = {"weighted_precision": weighted_precision, "weighted_recall": weighted_recall, "weighted_fscore": weighted_fscore}
#         all_reports.append(report)
#         balanced_accuracies.append(bal_acc)

#     mean_report = pd.DataFrame(all_reports).mean()
#     mean_balanced_accuracy = np.mean(balanced_accuracies)

#     return mean_report, mean_balanced_accuracy

# Prepare your data for evaluation

In [None]:

# Score the real and the generated dataset with the same evaluation routine.
mean_report_original, mean_balanced_accuracy_original = evaluate_classifier(data, labels)
mean_report_fake, mean_balanced_accuracy_fake = evaluate_classifier(fake_data, fake_labels)


def _print_metrics(heading, balanced_accuracy, weighted_report):
    """Print one dataset's heading followed by its four metrics, two decimals each."""
    print(heading)
    print(f"Balanced accuracy: {balanced_accuracy:.2f}")
    print(f"Weighted precision: {weighted_report['weighted_precision']:.2f}")
    print(f"Weighted recall: {weighted_report['weighted_recall']:.2f}")
    print(f"Weighted F-score: {weighted_report['weighted_fscore']:.2f}")


# Report both datasets; the second heading starts with a blank separator line.
_print_metrics("Original dataset:", mean_balanced_accuracy_original, mean_report_original)
_print_metrics("\nFake dataset:", mean_balanced_accuracy_fake, mean_report_fake)