## In questo file andiamo ad applicare la PCA per il dataset totale

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from matplotlib.lines import Line2D
import os

In [None]:
# Load the pickled DataFrame that the rest of the notebook works on.
# NOTE(review): pickle executes arbitrary code on load; only open files
# produced by this project.
with open('pickle/scaled_total_document.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [None]:
# Remove duplicated rows, keeping the first occurrence of each one.

df = df[~df.duplicated()]

In [None]:
# Draw a random fraction of the rows separately inside every group of the
# chosen column (stratified sampling); here 50% of each class is kept.

colonna_interessata = 'Label'
frazione_da_selezionare = 0.5  # fraction of rows kept from every group

# Each group is sampled explicitly instead of through groupby().apply():
# recent pandas versions deprecate apply() operating on the grouping column,
# and once that column is excluded the final reset_index(drop=True) would
# silently discard 'Label'. Iterating the groups yields complete sub-frames,
# visited in sorted key order, so the selected rows and their order are the
# same as before. The fixed seed keeps the draw reproducible.
df = pd.concat(
    [
        gruppo.sample(frac=frazione_da_selezionare, random_state=42)
        for _, gruppo in df.groupby(colonna_interessata)
    ]
).reset_index(drop=True)

In [None]:
# Count how many rows belong to each value of 'Label'.

df_prova = df.groupby(['Label'])['Label'].count().to_frame()
print(df_prova)

# The counts column inherits the name 'Label' from the grouped column; rename
# it so it is not confused with the index, which is also called 'Label'.
# (The former `df_prova.set_index('Label')` call was removed: its result was
# never assigned, so it had no effect.)
df_prova = df_prova.rename(columns={'Label': 'Count'})

In [None]:
# Read the names of the most important features (first CSV column); they are
# used afterwards to filter the DataFrame columns.

import csv

with open('top_feature/top5_totale.csv', newline='') as inputfile:
    reader = csv.reader(inputfile)
    next(reader, None)  # skip the header row (no error if the file is empty)
    # csv.reader yields an empty list for blank lines; skip them instead of
    # failing with IndexError on row[0].
    results = [row[0] for row in reader if row]

print(results)

In [None]:
# Build a DataFrame holding only the selected feature columns (kept in the
# column order of df) plus the 'Label' column.

colonne_selezionate = [colonna for colonna in df.columns if colonna in results]
df_filtrato = df[colonne_selezionate].copy()

df_filtrato['Label'] = df['Label']
display(df_filtrato)

In [None]:
# Split the filtered data by class: rows with Label == 1 go to `malign`,
# rows with Label == 0 go to `benign`.
etichette = df_filtrato['Label']
malign = df_filtrato[etichette == 1]
benign = df_filtrato[etichette == 0]

print("Benign: ", len(benign), "Bot: ", len(malign))

In [None]:
# Balance the dataset by undersampling the larger class down to the size of
# the smaller one. The draw is seeded (same seed as the earlier stratified
# sampling) so that the balanced subset, and every result computed from it,
# is reproducible between runs.
if len(benign) >= len(malign):
    balanced_d = pd.concat([malign, benign.sample(len(malign), random_state=42)])
else:
    # sample() without replacement cannot draw more rows than are available,
    # so when benign is the smaller class it is malign that gets undersampled.
    balanced_d = pd.concat([malign.sample(len(benign), random_state=42), benign])

# Select the target by name rather than by position so the split does not
# depend on where 'Label' sits among the columns.
bal_x = balanced_d.drop(columns='Label')
bal_y = balanced_d[['Label']]  # kept as a one-column DataFrame
balanced_d.shape

### Applicazione della PCA

In [None]:
from sklearn.decomposition import PCA

# Project the balanced features onto the first two principal components.
# random_state is fixed because svd_solver="auto" may select the randomized
# solver, whose output otherwise changes from run to run; the seed has no
# effect when a deterministic solver is chosen.
pca = PCA(n_components=2, svd_solver="auto", random_state=0).fit(bal_x)
pca_x = pca.transform(bal_x)

In [None]:
# One cluster per distinct value found in the 'Label' column.
n_clusters = balanced_d['Label'].nunique(dropna=False)

In [None]:
# Run K-means on the PCA-projected data. The number of clusters comes from
# n_clusters (instead of a hard-coded 2) so that the model and the mapping
# loop below always agree.
km = KMeans(n_clusters = n_clusters, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_pred = km.fit_predict(pca_x)

# Map every cluster id to the class that is most frequent among its members.
etichette_reali = np.asarray(bal_y).ravel()  # flat array of the true labels
cluster_class_mapping = {}
for i in range(n_clusters):
    unique_classes, class_counts = np.unique(etichette_reali[y_pred == i], return_counts=True)
    cluster_class_mapping[i] = unique_classes[np.argmax(class_counts)]

In [None]:
# Plot the K-means result in the PCA plane and save the figure to disk.

plt.figure(figsize=(15,8))
plt.title('Cluster of PCAs K-means', fontsize = 30)

plt.scatter(pca_x[y_pred == 0, 0], pca_x[y_pred == 0, 1], s = 100, c = 'purple')
plt.scatter(pca_x[y_pred == 1, 0], pca_x[y_pred == 1, 1], s = 100, c = 'yellow')

# K-means was fitted on pca_x, so its centroids are already expressed in PCA
# coordinates. Passing them through pca.transform() again is wrong: it fails
# on the feature-count mismatch (or projects them a second time).
centers = km.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s = 50, c = 'black', label="center")

# Record the clusters whose id differs from their dominant class; a later
# cell uses this table to relabel the predictions. Building the frame from a
# list of rows avoids concatenating onto an empty DataFrame, which recent
# pandas versions deprecate.
mismatches = [
    {'num_cluster': i, 'num_class': cluster_class_mapping[i]}
    for i in range(n_clusters)
    if i != cluster_class_mapping[i]
]
difference = pd.DataFrame(mismatches, columns=['num_cluster','num_class'])

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend()

title = "KMeans_PCA"
# exist_ok=True creates the folder only when missing, without a separate
# existence check.
os.makedirs('image/clustering_totale/', exist_ok=True)
plt.savefig('image/clustering_totale/'+ title +'.png')

plt.show()

In [None]:
# When a cluster id does not coincide with its dominant class, rewrite the
# predictions of that cluster with the class value.

# Collect the positions of every mismatching cluster first, so that a later
# reassignment cannot affect which samples belong to which cluster.
indici = [
    np.flatnonzero(y_pred == numero_cluster)
    for numero_cluster in difference['num_cluster']
]

for posizioni, classe in zip(indici, difference['num_class']):
    y_pred[posizioni] = classe

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments reversed precision and recall are
# swapped, so the true labels must come first.
y_true = np.asarray(bal_y).ravel()

accuracy_kmeans = str(accuracy_score(y_true, y_pred))
precision_kmeans = str(precision_score(y_true, y_pred))
recall_kmeans = str(recall_score(y_true, y_pred))

# K-means accuracy
print("accuratezza" + accuracy_kmeans)
# K-means precision
print("precision" + precision_kmeans)
# K-means recall
print("recall" + recall_kmeans)


# Collect the metrics in a small table and store it as CSV.
metriche = pd.DataFrame({
    'metriche': ["accuracy","precision","recall"],
    'valori': [accuracy_kmeans,precision_kmeans,recall_kmeans]
})

# exist_ok=True creates the folder only when it is missing.
os.makedirs('metriche/', exist_ok=True)

metriche.to_csv('metriche/metriche_kmeans_pca_totale.csv',index=False)