In [5]:
import joblib
import pickle
import const as ct
import logging
import argparse
import configparser
import numpy as np
import multiprocessing
from math import sqrt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

FEATURE_NAMES = [
    "Interarrival Time Max In", "Interarrival Time Max Out", "Interarrival Time Max Total",
    "Interarrival Time Mean In", "Interarrival Time Mean Out", "Interarrival Time Mean Total",
    "Interarrival Time SD In", "Interarrival Time SD Out", "Interarrival Time SD Total",
    "Interarrival Time 75th percentile In", "Interarrival Time 75th percentile Out", "Interarrival Time 75th percentile Total",
    "Time Percentile 25 In", "Time Percentile 50 In", "Time Percentile 75 In", "Time Percentile 100 In",
    "Time Percentile 25 Out", "Time Percentile 50 Out", "Time Percentile 75 Out", "Time Percentile 100 Out",
    "Time Percentile 25 Total", "Time Percentile 50 Total", "Time Percentile 75 Total", "Time Percentile 100 Total",
    "Number of Inbound Packets", "Number of Outbound Packets", "Total Number of Packets",
    "First 30 Packets Inbound", "First 30 Packets Outbound",
    "Last 30 Packets Inbound", "Last 30 Packets Outbound",
    "Packet Concentration Std Dev", "Packet Concentration Average",
    "Packets Per Second Average", "Packets Per Second Std Dev", "Packets Per Second Median", "Packets Per Second Min", "Packets Per Second Max",
    "Average Packet Ordering Inbound", "Average Packet Ordering Outbound",
    "Std Dev Packet Ordering Inbound", "Std Dev Packet Ordering Outbound",
    "Percentage Inbound Packets", "Percentage Outbound Packets",
]+ [
    "Packet Concentration Array {}".format(i) for i in range(1, 72)
] + [
    "Packets Per Second Array {}".format(i) for i in range(1, 21)
]


def kfingerprinting(X, y):
    model = RandomForestClassifier(n_jobs=-1, n_estimators=1000, oob_score=True)
    
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1123)
    acc_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    
    mean_accuracy = np.mean(acc_scores)
    std_dev = np.std(acc_scores)
    print('kfingerprinting')
    print('10-fold Cross Validation Accuracy Scores:', acc_scores)
    print('Mean Accuracy:', mean_accuracy)
    print('Standard Deviation of Accuracy:', std_dev)
    print(acc_scores)
    z_value = 1.96  # For a 95% confidence interval
    margin_of_error = z_value * (std_dev / sqrt(10))
    lower_bound = mean_accuracy - margin_of_error
    upper_bound = mean_accuracy + margin_of_error
    confidence_interval = (lower_bound, upper_bound)
    print("95% Confidence Interval: {:.4f} to {:.4f}".format(*confidence_interval))    
    # Fitting the model on the whole dataset
    model.fit(X, y)
    joblib.dump(model, 'ranpad2_0610_2057_norm.pkl')
    feature_importances = model.feature_importances_

    # You might still want to return something here depending on your requirements
    return model, feature_importances

if __name__ == '__main__':
    # ... (The rest of your code remains the same)

    # Loading data
    dic = np.load("/Users/ct/Library/Mobile Documents/com~apple~CloudDocs/cybersecurity_robotics/WebsiteFingerprinting/attacks/kfingerprinting/results/torque_data.npy", allow_pickle=True).item()
    
    X = np.array(dic['feature'])
    Y = np.array(dic['label'])
    y = np.array([label[0] for label in Y])
    
    # Applying k-Fold Cross Validation
    model, importances = kfingerprinting(X, y)
    importances = np.array(importances)
    
    # Get the indices that would sort the array in descending order
    sorted_indices = np.argsort(importances)[::-1]

    # Now, you can print the feature names based on the sorted indices
    for index in sorted_indices:
        print(sorted_indices)
        print(FEATURE_NAMES[index], importances[index])

    # Plot feature importance
    indices = np.argsort(importances)[::-1]


kfingerprinting
10-fold Cross Validation Accuracy Scores: [0.7   0.85  0.675 0.75  0.825 0.725 0.725 0.725 0.625 0.65 ]
Mean Accuracy: 0.725
Standard Deviation of Accuracy: 0.06708203932499368
[0.7   0.85  0.675 0.75  0.825 0.725 0.725 0.725 0.625 0.65 ]
95% Confidence Interval: 0.6834 to 0.7666
[ 42  40   5   4  14   3  18  33  45  34  44  32  22   8   6  11 128 127
 112   9   7 126  60 108  67 140  72  97  94 106  61  31  37  70  24 100
  15 129  49 124  23  47  53  64  78 141  99  56 115 105  21  96  57  38
  84  13 120  35  10   1  20  90  91   0  19 136 134  58 130 111  79  55
  83 122  17  26  81 103   2  36  62  74 121  50  63  68  48  12  73  85
 114 118 131  16 101  82  41  98  46 109  76  89  93 132 107 125  75 145
 113 133  65  51  25  77  87  66 104 102  86 150  52 138 119 135  92  54
 123  59  71 142  80 139  69  88 157 110 172 156 165  95  29 146 166 158
 169 144 143 147 167 153 170 163 149  28 168 162  30 116 164  27 174 117
 171 154 151 148 155 159 152 137 173 160 161  

IndexError: list index out of range

In [7]:
acc_scores = [0.19, 0.31, 0.28, 0.33, 0.19, 0.31, 0.19, 0.31, 0.31, 0.33]
mean_accuracy = np.mean(acc_scores)
std_dev = np.std(acc_scores)
print('KNN')
print('10-fold Cross Validation Accuracy Scores:', acc_scores)
print('Mean Accuracy:', mean_accuracy)
print('Standard Deviation of Accuracy:', std_dev)
print(acc_scores)
z_value = 1.96  # For a 95% confidence interval
margin_of_error = z_value * (std_dev / sqrt(10))
lower_bound = mean_accuracy - margin_of_error
upper_bound = mean_accuracy + margin_of_error
confidence_interval = (lower_bound, upper_bound)
print("95% Confidence Interval: {:.4f} to {:.4f}".format(*confidence_interval))    

KNN
10-fold Cross Validation Accuracy Scores: [0.19, 0.31, 0.28, 0.33, 0.19, 0.31, 0.19, 0.31, 0.31, 0.33]
Mean Accuracy: 0.275
Standard Deviation of Accuracy: 0.05714017850864661
[0.19, 0.31, 0.28, 0.33, 0.19, 0.31, 0.19, 0.31, 0.31, 0.33]
95% Confidence Interval: 0.2396 to 0.3104
