# Group 4 Assignment 2 

### Authors: 
-  Chin Yee Wan 
-  Darrel Koh
-  Nguyen Gia Khanh 
-  Ngo Vu Anh	

### Main Steps
1.  Data Preprocessing 
-   Read in as a pandas dataframe for data preprocessing and exploration
-   Convert to a Spark dataframe for modelling
2.  Data Exploration
3.  Data Modelling
4.  Data Evaluation

# Discover and Visualise the Data

## Import Libraries

In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from sklearn.metrics import confusion_matrix, classification_report

## Function definitions

### Read in file

In [None]:
# Load the UNSW-NB15 training and testing splits as pandas DataFrames.
train_df = pd.read_csv('GA2Datasets/UNSW_NB15_training-set.csv')
test_df = pd.read_csv('GA2Datasets/UNSW_NB15_testing-set.csv')

In [None]:
# Start (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("CSCI316GP2")
    .config("spark.sql.files.maxPartitionBytes", "1000000")
    .getOrCreate()
)

In [None]:
# Convert the raw pandas training frame to a Spark DataFrame and preview it.
spark_df = spark.createDataFrame(train_df)
spark_df.show()

#### Custom pipeline for data pre-processing

In [None]:
class PreProcessPipeline:
    """Pandas-side cleaning and target preparation for the UNSW-NB15 frames.

    ``transform`` drops the ``id`` column, optionally integer-encodes the
    categorical columns, and leaves a single target column named ``label``.

    The integer codes are remembered on the instance. The first frame that is
    fitted or transformed defines the codes (in order of first appearance) and
    every later frame reuses them, so the training and testing sets share one
    encoding. Values that were not seen before are appended with fresh codes
    instead of being re-numbered from zero.

    Parameters
    ----------
    label_encode : bool
        When True, replace the values of ``proto``, ``service``, ``state`` and
        ``attack_cat`` with integer codes.
    process_label : str
        ``'Binary'`` keeps the existing ``label`` column and drops
        ``attack_cat``. Any other value overwrites ``label`` with
        ``attack_cat`` (multi-class target) and then drops ``attack_cat``.
    """

    # Columns that are integer-encoded when ``label_encode`` is True.
    CATEGORICAL_COLUMNS = ('proto', 'service', 'state', 'attack_cat')

    def __init__(self, label_encode=True, process_label='Binary'):
        self.label_encode = label_encode
        self.process_label = process_label
        # column name -> {category value: integer code}, shared by all calls.
        self._mappings = {}

    def fit(self, df=None):
        """Optionally learn the category codes from ``df`` and return ``self``.

        Calling ``fit()`` without a frame is a no-op kept for backward
        compatibility; ``transform`` learns any missing codes lazily.
        """
        if df is not None and self.label_encode:
            for column in self.CATEGORICAL_COLUMNS:
                self._extend_mapping(column, df[column])
        return self

    def _extend_mapping(self, column, values):
        """Add codes for unseen values of ``column`` and return its mapping."""
        mapping = self._mappings.setdefault(column, {})
        for value in values.unique():
            if value not in mapping:
                # Codes are contiguous from 0, so the next free code is the size.
                mapping[value] = len(mapping)
        return mapping

    def transform(self, df):
        """Return a processed copy of ``df``; the input frame is not modified.

        Raises
        ------
        KeyError
            If ``id`` or ``attack_cat`` (or, with ``label_encode``, any of the
            categorical columns) is missing from ``df``.
        """
        # ``drop`` already returns a new frame, so no extra copy is needed.
        df = df.drop('id', axis=1)

        if self.label_encode:
            for column in self.CATEGORICAL_COLUMNS:
                mapping = self._extend_mapping(column, df[column])
                df[column] = df[column].map(mapping)

        if self.process_label != 'Binary':
            # Multi-class task: the attack category becomes the target.
            print('change name')
            df['label'] = df['attack_cat']

        return df.drop('attack_cat', axis=1)

#### Dataframe Pipeline

In [None]:
from pyspark.sql.functions import col


class SparkDFPipeline:
    """Turn the pre-processed Spark DataFrames into model-ready feature vectors.

    Author: Khanh Nguyen

    Steps applied by ``transform``:

    (always)
    - Repartition: spread the training frame over ``num_partitions`` partitions.
    - Vectorize: assemble every column except ``label`` into a ``features``
      vector column.

    (enabled through the constructor flags)
    - ``smote``: randomly oversample, with replacement, the smaller class of a
      binary (0/1) ``label``. NOTE: this is plain random oversampling, not the
      synthetic-sample SMOTE algorithm; the flag name is kept for
      compatibility. Rows whose label is neither 0 nor 1 are not carried over.
    - ``standardize``: z-score every feature except ``proto``, ``service`` and
      ``state``, using statistics computed on the training frame only.

    Parameters
    ----------
    smote : bool
        Enable the random oversampling step.
    standardize : bool
        Enable the z-score standardisation step.
    num_partitions : int
        Number of partitions the training frame is repartitioned into.
    """

    # Encoded categorical columns that are appended unscaled to the features.
    CATEGORICAL_COLUMNS = ('proto', 'service', 'state')

    def __init__(self, smote=False, standardize=False, num_partitions=500):
        self.smote = smote
        self.standardize = standardize
        self.num_partitions = num_partitions

    def fit(self):
        """No-op kept for API symmetry; returns ``self``."""
        return self

    def transform(self, train_df, test_df):
        """Return ``(train_df, test_df)`` with an assembled ``features`` column.

        The training frame is reduced to ``features`` and ``label``. The test
        frame is reduced the same way when ``standardize`` is set; otherwise it
        keeps its original columns and gains ``features``.
        """
        if self.smote:
            train_df = self._oversample_minority(train_df)

        # Select features by name so the label can never leak into the vector,
        # whatever its position in the frame.
        feature_columns = [name for name in train_df.columns if name != 'label']
        train_df = train_df.repartition(self.num_partitions)

        if self.standardize:
            return self._vectorize_standardized(train_df, test_df, feature_columns)
        return self._vectorize(train_df, test_df, feature_columns)

    def _oversample_minority(self, train_df):
        """Randomly oversample the smaller of the two binary classes."""
        negatives = train_df.filter(col('label') == 0)
        positives = train_df.filter(col('label') == 1)
        negative_count = negatives.count()
        positive_count = positives.count()

        # Work out which class is really the smaller one instead of assuming
        # that label 1 is the minority.
        if negative_count >= positive_count:
            majority, minority = negatives, positives
            majority_count, minority_count = negative_count, positive_count
        else:
            majority, minority = positives, negatives
            majority_count, minority_count = positive_count, negative_count

        combined = majority.union(minority)
        if minority_count == 0 or minority_count == majority_count:
            # Nothing to resample from, or already balanced.
            return combined

        # With replacement, ``fraction`` is the expected number of extra copies
        # of each minority row; this brings the minority up to the majority.
        fraction = (majority_count - minority_count) / minority_count
        extra = minority.sample(True, fraction, seed=42)
        return combined.union(extra)

    def _vectorize(self, train_df, test_df, feature_columns):
        """Assemble all feature columns into ``features`` without scaling."""
        assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
        train_df = assembler.transform(train_df).select('features', 'label')
        test_df = assembler.transform(test_df)
        return train_df, test_df

    def _vectorize_standardized(self, train_df, test_df, feature_columns):
        """Z-score the non-categorical features, then append the categoricals."""
        numeric_columns = [
            name for name in feature_columns
            if name not in self.CATEGORICAL_COLUMNS
        ]
        numeric_assembler = VectorAssembler(inputCols=numeric_columns, outputCol='features')
        train_df = numeric_assembler.transform(train_df)
        test_df = numeric_assembler.transform(test_df)

        # Fit the scaler on the training data only and reuse that model for the
        # test data, so both splits are scaled with identical statistics.
        scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withStd=True, withMean=True)
        scaler_model = scaler.fit(train_df)
        train_df = scaler_model.transform(train_df).drop('features')
        test_df = scaler_model.transform(test_df).drop('features')

        # Put back the categorical columns next to the scaled vector.
        final_assembler = VectorAssembler(
            inputCols=['scaled_features', 'proto', 'service', 'state'],
            outputCol='features',
        )
        train_df = final_assembler.transform(train_df).select('features', 'label')
        test_df = final_assembler.transform(test_df).select('features', 'label')
        return train_df, test_df

#### For Visualisation

In [None]:
# Enable eager evaluation so a Spark DataFrame left as the last expression of a
# cell is rendered directly in the notebook.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

#### For Evaluation

In [None]:
# Class names indexed by encoded label: position i names encoded label i.
# NOTE(review): presumably this matches the order in which the attack
# categories first appear in the training data — verify against the encoder.
class_labels = ['Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS', 'Reconnaissance', 'Analysis', 'Backdoor', 'Shellcode', 'Worms']


def evaluate_model(model, val_data, model_name, class_names=None):
    """Score a fitted model on ``val_data`` and plot its confusion matrix.

    Parameters
    ----------
    model
        Fitted Spark ML model (or PipelineModel) exposing ``transform``.
    val_data
        Spark DataFrame holding the model's input features and a ``label``
        column.
    model_name : str
        Title shown above the confusion-matrix heatmap.
    class_names : list of str, optional
        Display names indexed by encoded label. Defaults to the module-level
        ``class_labels``.

    Returns
    -------
    dict
        ``'Accuracy'``, ``'F1 Score'``, ``'Precision'`` (weighted) and
        ``'Recall'`` (weighted), as computed by
        ``MulticlassClassificationEvaluator``.
    """
    if class_names is None:
        class_names = class_labels

    # Make predictions on the validation data
    predictions = model.transform(val_data)

    # One evaluator is enough; only the metric name changes between runs.
    metric_names = {
        'Accuracy': 'accuracy',
        'F1 Score': 'f1',
        'Precision': 'weightedPrecision',
        'Recall': 'weightedRecall',
    }
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    eval_metrics = {
        display_name: evaluator.setMetricName(metric).evaluate(predictions)
        for display_name, metric in metric_names.items()
    }

    # Collect labels and predictions in a single Spark job.
    results = predictions.select('label', 'prediction').toPandas()

    # Pin the label order so the matrix always has one row/column per class,
    # even when a class is absent from this split, and the tick labels line up.
    # Encoded labels outside this range are ignored by confusion_matrix.
    label_ids = list(range(len(class_names)))
    cm = confusion_matrix(
        results['label'].astype(int),
        results['prediction'].astype(int),
        labels=label_ids,
    )

    # Rows are true classes and columns are predicted classes, both in the
    # same (ascending label) order.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(model_name)
    plt.show()

    return eval_metrics


## Data Exploration

### Explore train_df 

In [None]:
# Count the missing values in each column of the training frame.
train_df.isnull().sum()

In [None]:
# Pre-process both splits for the multi-class task: a process_label other than
# 'Binary' makes the pipeline use attack_cat as the 'label' column.
pipeline = PreProcessPipeline(label_encode=True, process_label='Multi')
train_df = pipeline.transform(train_df)
test_df = pipeline.transform(test_df)

## Data Visualisation

In [None]:
# Plot histograms of the pre-processed training columns on one large figure.
rcParams["figure.figsize"]=(20,22)
train_df.hist()
plt.grid()

# Prepare the data

### Convert Pandas DF to Spark DF

In [None]:
# Hand the pre-processed pandas frames over to Spark for modelling.
sparktrain_df = spark.createDataFrame(train_df)
sparktest_df = spark.createDataFrame(test_df)

### Feature enabler

##### for Default

In [None]:
# Baseline features: plain vector assembly, no oversampling, no standardisation.
pipeline = SparkDFPipeline(smote=False, standardize=False)
train, test = pipeline.transform(sparktrain_df, sparktest_df)

In [None]:
# Display the baseline training frame.
train

In [None]:
# Display the baseline test frame.
test

##### for Fine Tuning

In [None]:
# Features for the fine-tuning experiments: same data, with standardisation enabled.
pipeline = SparkDFPipeline(smote=False, standardize=True)
FT_train, FT_test = pipeline.transform(sparktrain_df, sparktest_df)

In [None]:
# Display the standardised training frame.
FT_train

In [None]:
# Display the standardised test frame.
FT_test

# Model selection and training
- Select machine learning models (Logistic Regression, Decision Tree, Random Forest, Multilayer Perceptron).
- Split the data into training and validation sets.
- Train the selected models using the training data.

## Logistic Regression model

### Default Model

In [None]:
# Logistic regression with explicit regularisation settings
# (regParam=0.01, elasticNetParam=0.8), trained on the baseline features.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10, regParam=0.01, elasticNetParam=0.8)

# Train the model
model = lr.fit(train)

# Make predictions on the test data
predictions = model.transform(test)

# Evaluate the model's performance (accuracy on the baseline test frame)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

In [None]:
# Default model: logistic regression (only maxIter set) wrapped in a
# single-stage Pipeline and fitted on the baseline features.
lr_default = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
pipeline_default = Pipeline(stages=[lr_default])
model_default = pipeline_default.fit(train)  # Use the 'train' dataset

###  Fine-Tuning 

In [None]:
# "Fine-tuned" model: same estimator settings as the default model, but fitted
# on the standardised features (FT_train) instead of the baseline ones.
lr_tuned = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
pipeline_tuned = Pipeline(stages=[lr_tuned])
model_tuned = pipeline_tuned.fit(FT_train)  # Use the 'FT_train' dataset

In [None]:
# Grid-search logistic regression on the standardised features with 3-fold
# cross-validation.
lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)

# FT_train already carries an assembled 'features' column, so the pipeline
# consists of the estimator only; no VectorAssembler stage is needed (and no
# `assembler` object exists at notebook scope).
pipeline = Pipeline(stages=[lr])

# Define the ParamGrid: 3 x 2 = 6 candidate settings
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [10, 20, 30])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

# Accuracy-based evaluator, defined here so this cell does not depend on an
# earlier cell having been run.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Instantiate CrossValidator with the pipeline and paramGrid
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

# Fit the CrossValidator on the training data and keep the best model
best_tuned_model = cv.fit(FT_train).bestModel



### Evaluation

In [None]:
# Score each logistic-regression variant with evaluate_model, pairing every
# model with the test frame that matches the features it was trained on.
evaluation_results_default = evaluate_model(model_default, test, 'Default Model')
evaluation_results_tuned = evaluate_model(model_tuned, FT_test, 'Fine-Tuned Model')
evaluation_results_best_tuned = evaluate_model(best_tuned_model, FT_test, 'Best-Tuned Model')

# Print the metrics side by side: one row per metric, one column per model.
print("Evaluation Results:")
print(f"{'Metric':<20}{'Default Model':<20}{'Fine-Tuned Model':<20}{'Best-Tuned Model':<20}")
print("=" * 80)

for metric, default_value in evaluation_results_default.items():
    tuned_value = evaluation_results_tuned[metric]
    best_tuned_value = evaluation_results_best_tuned[metric]
    print(f"{metric:<20}{default_value:<20.6f}{tuned_value:<20.6f}{best_tuned_value:<20.6f}")


##### scikit-learn

In [None]:
# from sklearn.metrics import confusion_matrix, classification_report

# def evaluate_model_with_visualization(model, val_data, model_name):
#     # Make predictions on the validation data
    # predictions = model.transform(val_data)

#     # Evaluate the model using a BinaryClassificationEvaluator for AUC
#     auc_evaluator = BinaryClassificationEvaluator(labelCol='label')
#     auc = auc_evaluator.evaluate(predictions)

#     # Calculate additional metrics
#     sensitivity = predictions.filter("label = 1 and prediction = 1").count() / predictions.filter("label = 1").count()
#     specificity = predictions.filter("label = 0 and prediction = 0").count() / predictions.filter("label = 0").count()
#     precision = predictions.filter("prediction = 1").count() / predictions.filter("prediction = 1 or prediction = 0").count()
#     recall = sensitivity
#     f1_score = 2 * (precision * recall) / (precision + recall)
#     accuracy = (predictions.filter("label = prediction").count()) / predictions.count()

#     # Convert Spark DataFrames to Pandas DataFrames for visualization
#     y_true_pd = predictions.select('label').toPandas()
#     y_pred_pd = predictions.select('prediction', 'probability').toPandas()
    
#     # Convert prediction probabilities to binary predictions
#     y_pred_binary = [1 if prob[1] >= 0.5 else 0 for prob in y_pred_pd['probability']]

#     # Generate the confusion matrix
#     cm = confusion_matrix(y_true_pd['label'], y_pred_binary)

#     # Display the confusion matrix as a heatmap
#     plt.figure(figsize=(6, 4))
#     sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-default', 'Default'], 
#                 yticklabels=['Non-default', 'Default'])
#     plt.xlabel('Predicted')
#     plt.ylabel('True')
#     plt.title(model_name)
#     plt.show()

#     # Generate the classification report
#     report = classification_report(y_true_pd['label'], y_pred_binary, target_names=['Non-default', 'Default'], output_dict=True)

#     # Create a summary table
#     summary_table = pd.DataFrame({
#         'Model': [model_name],
#         'AUC': [auc],
#         'Sensitivity': [sensitivity],
#         'Specificity': [specificity],
#         'Precision': [precision],
#         'Recall': [recall],
#         'F1-Score': [f1_score],
#         'Accuracy': [accuracy],
#         'Precision (Non-default)': [report['Non-default']['precision']],
#         'Recall (Non-default)': [report['Non-default']['recall']],
#         'F1-score (Non-default)': [report['Non-default']['f1-score']],
#         'Precision (Default)': [report['Default']['precision']],
#         'Recall (Default)': [report['Default']['recall']],
#         'F1-score (Default)': [report['Default']['f1-score']],
#     })

#     return summary_table

# # Assuming you have 'model_default' and 'test' DataFrame from your pipeline
# evaluation_results_default = evaluate_model_with_visualization(model_default, test, 'Default Model')
# print(evaluation_results_default)


In [None]:
# from sklearn.metrics import confusion_matrix

# # Assuming you have the true labels and predicted labels for your test data
# true_labels = test.select('label').rdd.flatMap(lambda x: x).collect()
# predicted_labels = model_default.transform(test).select('prediction').rdd.flatMap(lambda x: x).collect()

# # Calculate the confusion matrix
# cm = confusion_matrix(true_labels, predicted_labels)

# # Print the confusion matrix
# print("Confusion Matrix:")
# print(cm)

## Decision Tree Model

### Default Model

###  Fine-Tuning 

### Evaluation

## Random Forest Model

### Default Model

###  Fine-Tuning 

### Evaluation

## Multilayer perceptron Model

### Default Model

###  Fine-Tuning 

### Evaluation

# Final comparison between Models