# Node identifier: Machine Learning approach

## Preparation

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from imblearn.over_sampling import SMOTE

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Load train and test set
path = 'C:/Users/User/Desktop/Assignatures/Synthesis project/GraphAnomaly/dades_arnau/'

In [4]:
# Use balanced split
with open(path + 'balanced_train.pkl', 'rb') as f:
    train_set = pickle.load(f)
with open(path + 'balanced_test.pkl', 'rb') as f:
    test_set = pickle.load(f)

In [25]:
# Separate labels
train_set = train_set.loc[train_set['class'].isin([0, 1])]
y_train = train_set['class']
X_train = train_set.drop(columns=['class'])

# y_test = test_set.loc[test_set['class'].isin([0, 1])]['class']
# X_test = test_set.loc[test_set['class'].isin([0, 1])].drop(columns=['class'])

In [26]:
y_train

1         1
7         0
12        1
13        1
17        1
         ..
160587    1
160591    1
160596    1
160597    1
160601    1
Name: class, Length: 37326, dtype: object

In [None]:
# # Dimensionality reduction
# def apply_PCA(ncomponents):
#     # Standardize the features
#     scaler = StandardScaler()
#     df_pca = train_set.drop(columns=['class'])
#     scaled_data = scaler.fit_transform(df_pca)

#     # Apply PCA
#     pca = PCA(n_components=ncomponents)  # You can choose the number of components you want to keep
#     principal_components = pca.fit_transform(scaled_data)

#     # Create a DataFrame for the principal components
#     columns = [f"PC{i+1}" for i in range(principal_components.shape[1])]
#     principal_df = pd.DataFrame(data=principal_components, columns=columns)


#     explained_variance_ratio = pca.explained_variance_ratio_
#     cumulative_variance_ratio = explained_variance_ratio.sum()

#     print(f"\nExplained variance ratio: {cumulative_variance_ratio}")
#     print(f"Data reduction, from shape {df_pca.shape} to {principal_df.shape}")
    
#     # Add two columns to be able to apply ML models later on
#     principal_df['node'] = df_pca.index
#     principal_df['class'] = list(train_set['class'])
    
#     return principal_df

In [None]:
class machineLearning2:
    def __init__(self, train_set, test_set):
        self.train_set = train_set
        self.test_set = test_set
        self.metrics = pd.DataFrame(columns=["Classifier", "Accuracy", "Precision", "Recall", "F1 score"])
        self.predictions = {}
   

    
    def train_and_test(self, algorithm, display_conf_matrix=False):
        X_train = self.train_set.loc[self.train_set['class'].isin([1, 2])].drop(columns=['class'])
        y_train = self.train_set.loc[self.train_set['class'].isin([1, 2])]['class']
        
        X_test = self.test_set.loc[self.test_set['class'].isin([1, 2])].drop(columns=['class'])
        y_test = self.test_set.loc[self.test_set['class'].isin([1, 2])]['class']
        
        print(f"\nTraining {algorithm}...\n")
        try:
            if algorithm == "Logistic regression":
                model = LogisticRegression()
                
            elif algorithm == "Random forest":
                model = RandomForestClassifier()
                
            elif algorithm == "SVM":
                model = SVC()
            
            elif algorithm == "Decision tree":
                model = DecisionTreeClassifier()
            
        except:
            return "Error! No machine learning model chosen."
        
        
        model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        self.predictions[algorithm] = y_pred
        
        print(f"Testing {algorithm}...\n")
        accuracy = round(accuracy_score(y_test, y_pred), 2)
        print("Accuracy: {:.2f}%".format(accuracy * 100))
        
        precision = round(precision_score(y_test, y_pred, pos_label=1), 2)
        print("Precision: {:.2f}%".format(precision * 100))
        
        recall = round(recall_score(y_test, y_pred, pos_label=1), 2)
        print("Recall: {:.2f}%".format(recall * 100))
        
        f1 = round(f1_score(y_test, y_pred, pos_label=1),2)
        print("F1 Score: {:.2f}%".format(f1 * 100))
        
        self.metrics.loc[len(self.metrics)] = [algorithm, accuracy, precision, recall, f1]
        
        if display_conf_matrix:
            cm = confusion_matrix(y_test, y_pred, labels=[1, 2])
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Ilicit', 'Licit'])
            disp.plot()
            plt.title(algorithm)
            plt.show()


            # conf_matrix = confusion_matrix(y_test, y_pred)
            # plt.figure(figsize=(8, 6))
            # sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', 
            #             xticklabels=model.classes_, 
            #             yticklabels=model.classes_)
            # plt.xlabel('Predicted Labels')
            # plt.ylabel('True Labels')
            # plt.title(f'Confusion Matrix for {algorithm}')
            # plt.show()
    
    def get_metrics(self):
        return self.metrics
    
    
    def get_predictions(self):
        return self.predictions

## Models

In [24]:
model = DecisionTreeClassifier()

model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

print(f"Testing {model}...\n")
accuracy = round(accuracy_score(y_test, y_pred), 2)
print("Accuracy: {:.2f}%".format(accuracy * 100))

precision = round(precision_score(y_test, y_pred, pos_label=1), 2)
print("Precision: {:.2f}%".format(precision * 100))

recall = round(recall_score(y_test, y_pred, pos_label=1), 2)
print("Recall: {:.2f}%".format(recall * 100))

f1 = round(f1_score(y_test, y_pred, pos_label=1),2)
print("F1 Score: {:.2f}%".format(f1 * 100))

cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Ilicit', 'Licit'])
disp.plot()
plt.title(model)
plt.show()

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.