# Group 4 Assignment 2 

### Authors: 
-  Chin Yee Wan 
-  Darrel Koh
-  Nguyen Gia Khanh 
-  Ngo Vu Anh	

### Main Steps
1.  Data Preprocessing 
-   Read in as SPARK dataframe for data preprocessing
-   Convert to Pandas dataframe for data exploration
2.  Data Exploration
3.  Data Modelling
4.  Data Evaluation

# Discover and Visualise the Data

## Import Libraries

In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from imblearn.over_sampling import SMOTE


from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

from sklearn.metrics import confusion_matrix, classification_report

## Function definitions

### Read in file

In [None]:
# Load the training and testing CSV partitions as pandas DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'GA2Datasets/UNSW_NB15_training-set.csv',
        'GA2Datasets/UNSW_NB15_testing-set.csv',
    )
)

In [None]:
# Create (or reuse) the notebook's Spark session.
# NOTE(review): maxPartitionBytes caps partition size for file-based reads;
# the frames below are built from pandas, so it presumably has no effect
# on them - confirm whether it is still needed.
spark = (
    SparkSession.builder
    .appName("CSCI316GP2")
    .config("spark.sql.files.maxPartitionBytes", "1000000")
    .getOrCreate()
)

In [None]:
# Mirror the raw pandas training frame as a Spark DataFrame and preview it
# (20 rows, long cell values truncated - the documented defaults of show()).
spark_df = spark.createDataFrame(data=train_df)
spark_df.show(n=20, truncate=True)

#### Custom pipeline for data pre-processing

In [None]:
class PreProcessPipeline:
    """Pandas-side preprocessing: drop ``id``, encode categories, pick the target.

    Parameters
    ----------
    label_encode : bool
        Replace the string columns in ``CATEGORICAL_COLUMNS`` by integer codes.
    process_label : str
        ``'Binary'`` keeps the existing ``label`` column as the target; any
        other value (the notebook uses ``'Multi'``) overwrites ``label`` with
        the ``attack_cat`` column. ``attack_cat`` is dropped in both cases.
    smote : bool
        Oversample with SMOTE after encoding so the returned frame is balanced
        according to SMOTE's ``'auto'`` strategy.

    Notes
    -----
    Without a fitted mapping, every ``transform`` call derives the integer
    codes from the order in which values first appear in *that* frame, so two
    frames can receive different codes for the same category. Call
    ``fit(train_df)`` once and reuse the same instance for train and test to
    get one shared encoding.
    """

    # String-valued columns that are replaced by integer codes.
    CATEGORICAL_COLUMNS = ('proto', 'service', 'state', 'attack_cat')

    def __init__(self, label_encode = True, process_label = 'Binary', smote = False):
        self.label_encode = label_encode
        self.process_label = process_label
        self.smote = smote
        # column -> {category: code}; stays None until fit() is given data.
        self.mappings_ = None

    @staticmethod
    def _build_mapping(series):
        """Map each distinct value to its first-appearance index."""
        return {value: index for index, value in enumerate(series.unique())}

    def fit(self, df=None):
        """Optionally learn the category encodings from ``df``.

        Calling ``fit()`` without arguments keeps the legacy behaviour in which
        every ``transform`` call builds its own per-frame encoding.

        Returns
        -------
        PreProcessPipeline
            ``self``, to allow chaining.
        """
        if df is not None and self.label_encode:
            self.mappings_ = {
                column: self._build_mapping(df[column])
                for column in self.CATEGORICAL_COLUMNS
            }
        return self

    def transform(self, df):
        """Return a preprocessed copy of ``df``; the input frame is not modified.

        Raises
        ------
        KeyError
            If ``id`` or one of the expected columns is missing.
        """
        df = df.drop('id', axis=1).copy()

        if self.label_encode:
            fitted = getattr(self, 'mappings_', None)
            for column in self.CATEGORICAL_COLUMNS:
                if fitted is None:
                    # Legacy path: the encoding is derived from this frame only.
                    df[column] = df[column].map(self._build_mapping(df[column]))
                else:
                    # Shared encoding; categories unseen during fit become -1.
                    df[column] = df[column].map(fitted[column]).fillna(-1).astype(int)

        if self.process_label != 'Binary':
            # Multi-class: the attack category becomes the target. Assigning
            # into the existing 'label' column keeps its position in the frame.
            df['label'] = df['attack_cat']
        df = df.drop('attack_cat', axis=1)

        if self.smote:
            # Separate features and target
            X = df.drop('label', axis=1)
            y = df['label']

            # Oversample the minority class(es)
            smote = SMOTE(sampling_strategy='auto', random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X, y)

            # Rebuild a single frame with 'label' as the last column
            X_resampled_df = pd.DataFrame(data=X_resampled, columns=X.columns)
            y_resampled_df = pd.DataFrame(data=y_resampled, columns=['label'])
            df = pd.concat([X_resampled_df, y_resampled_df], axis=1)

        return df

#### Dataframe Pipeline

In [None]:
class SparkDFPipeline:
    """Turn the Spark train/test DataFrames into ``features``/``label`` form.

    Author: Khanh Nguyen

    Steps
    -----
    * Repartition the training frame into ``num_partitions`` partitions.
    * Assemble every column except the last one (expected to be ``label``)
      into a single ``features`` vector.
    * When ``standardize`` is True, z-score the non-categorical columns with
      a scaler fitted on the training data, then append the integer-encoded
      categorical columns (``proto``, ``service``, ``state``) unscaled.

    Parameters
    ----------
    standardize : bool
        Enable z-score standardisation of the numeric columns.
    num_partitions : int
        Number of partitions for the training frame (defaults to the
        previously hard-coded 500).
    """

    # Integer-encoded categorical columns that must not be z-scored.
    CATEGORICAL_COLUMNS = ('proto', 'service', 'state')

    def __init__(self, standardize=False, num_partitions=500):
        self.standardize = standardize
        self.num_partitions = num_partitions

    def fit(self):
        """No-op kept for interface symmetry; returns ``self``."""
        return self

    def transform(self, train_df, test_df):
        """Vectorise (and optionally standardise) both Spark DataFrames.

        Returns
        -------
        tuple
            ``(train_df, test_df)``. The training frame holds only
            ``features`` and ``label``. The test frame is reduced to the same
            two columns when standardising; otherwise it keeps its original
            columns plus ``features``.
        """
        # Every column but the last is a model input; 'label' must be last.
        feature_columns = train_df.columns[:-1]
        train_df = train_df.repartition(self.num_partitions)

        if not self.standardize:
            assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
            train_df = assembler.transform(train_df).select('features', 'label')
            test_df = assembler.transform(test_df)
            return train_df, test_df

        numeric_columns = [
            name for name in feature_columns
            if name not in self.CATEGORICAL_COLUMNS
        ]

        # Gather the numeric columns into one vector so they can be scaled.
        numeric_assembler = VectorAssembler(inputCols=numeric_columns,
                                            outputCol='numeric_features')
        train_df = numeric_assembler.transform(train_df)
        test_df = numeric_assembler.transform(test_df)

        # Fit the scaler on the training data ONLY and apply that same model
        # to both frames. Re-fitting on the test set would scale it with its
        # own mean/std, so train and test features would not be comparable.
        scaler = StandardScaler(inputCol='numeric_features', outputCol='scaled_features',
                                withStd=True, withMean=True)
        scaler_model = scaler.fit(train_df)
        train_df = scaler_model.transform(train_df)
        test_df = scaler_model.transform(test_df)

        # Put the categorical columns back next to the scaled numeric vector.
        final_assembler = VectorAssembler(
            inputCols=['scaled_features', *self.CATEGORICAL_COLUMNS],
            outputCol='features',
        )
        train_df = final_assembler.transform(train_df).select('features', 'label')
        test_df = final_assembler.transform(test_df).select('features', 'label')

        return train_df, test_df

#### For Logistic Regression

In [None]:
def train_weighted_logistic_regression(train_df):
    """Fit a logistic regression whose rows are weighted inversely to class size.

    Each row receives ``weight = N / (n_classes * n_c)``, where ``N`` is the
    number of training rows, ``n_classes`` the number of distinct labels and
    ``n_c`` the number of rows sharing the row's label, so rarer classes
    contribute more to the loss.

    Parameters
    ----------
    train_df : pyspark.sql.DataFrame
        Training data with a ``features`` vector column and a ``label`` column.

    Returns
    -------
    pyspark.ml.PipelineModel
        Fitted pipeline (pass-through assembler + weighted LogisticRegression).
    """
    # Per-class row counts and the two totals used by the weight formula.
    class_counts = train_df.groupBy("label").count()
    total_samples = train_df.count()
    num_classes = class_counts.count()

    # One weight per class; only 'label' and 'weight' are joined back so the
    # helper 'count' column does not leak into the training frame.
    class_weights = class_counts.select(
        "label",
        (total_samples / (class_counts["count"] * num_classes)).alias("weight"),
    )
    train_with_weights = train_df.join(class_weights, on="label")

    # Kept so the fitted pipeline has the same stages and output columns as
    # before; it simply re-wraps the existing 'features' vector.
    assembler = VectorAssembler(inputCols=['features'], outputCol='assembled_features')

    lr_weighted = LogisticRegression(featuresCol='assembled_features', labelCol='label',
                                     maxIter=10, weightCol='weight')

    pipeline_weighted = Pipeline(stages=[assembler, lr_weighted])
    return pipeline_weighted.fit(train_with_weights)

#### For visualisation

In [None]:
# Turn on eager evaluation so Spark DataFrames render their top rows when
# displayed in a REPL/notebook that supports it.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

#### For Evaluation

In [None]:
# Attack-category names used as heatmap tick labels for multi-class runs:
# position i names the class whose integer code in 'label' is i.
class_labels = ['Normal', 'Generic', 'Exploits', 'Fuzzers', 'DoS', 'Reconnaissance', 'Analysis', 'Backdoor', 'Shellcode', 'Worms']


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _plot_confusion_matrix(cm, tick_labels, title, figsize):
    """Draw ``cm`` as an annotated heatmap with the same labels on both axes."""
    plt.figure(figsize=figsize)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()


def _evaluate_multiclass(predictions, model_name):
    """Compute weighted multi-class metrics and plot the confusion matrix."""
    metric_names = {
        'Accuracy': 'accuracy',
        'F1 Score': 'f1',
        'Precision': 'weightedPrecision',
        'Recall': 'weightedRecall',
    }
    eval_metrics = {
        display_name: MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction", metricName=metric_name
        ).evaluate(predictions)
        for display_name, metric_name in metric_names.items()
    }

    # Spark emits predictions as floats (e.g. 3.0); cast to the integer codes.
    outcome_pd = predictions.select('label', 'prediction').toPandas().astype(int)

    # Fix the label set so the matrix always has one row/column per entry of
    # class_labels, even when a class is absent from this data split.
    cm = confusion_matrix(outcome_pd['label'], outcome_pd['prediction'],
                          labels=list(range(len(class_labels))))

    # Rows and columns share the same (ascending code) order, so both axes
    # use class_labels un-reversed.
    _plot_confusion_matrix(cm, class_labels, model_name, figsize=(8, 6))
    return eval_metrics


def _evaluate_binary(predictions, model_name):
    """Compute binary metrics from one confusion matrix and plot it."""
    auc = BinaryClassificationEvaluator(labelCol='label').evaluate(predictions)
    area_under_pr = BinaryClassificationEvaluator(
        labelCol='label', metricName='areaUnderPR'
    ).evaluate(predictions)

    # Use the model's own 'prediction' column for both the heatmap and the
    # metrics so they always agree (and no 'probability' column is required).
    outcome_pd = predictions.select('label', 'prediction').toPandas().astype(int)
    cm = confusion_matrix(outcome_pd['label'], outcome_pd['prediction'], labels=[0, 1])
    tn, fp, fn, tp = (int(cell) for cell in cm.ravel())

    sensitivity = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    # Precision = TP / (TP + FP): share of predicted positives that are correct.
    precision = _safe_ratio(tp, tp + fp)
    recall = sensitivity
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)

    # NOTE(review): tick labels assume label 0 = normal traffic and
    # 1 = attack, as in the UNSW-NB15 'label' column - confirm.
    _plot_confusion_matrix(cm, ['Normal', 'Attack'], model_name, figsize=(6, 4))

    return {
        'AUC': auc,
        'AreaUnderPR': area_under_pr,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1_score,
        'Accuracy': accuracy,
    }


def evaluate_model(model, val_data, model_name, process_label='Binary'):
    """Score a fitted model on ``val_data`` and plot its confusion matrix.

    Parameters
    ----------
    model : fitted Spark ML model exposing ``transform``.
    val_data : Spark DataFrame with ``features`` and ``label`` columns.
    model_name : str
        Title of the confusion-matrix plot.
    process_label : str
        ``'Multi'`` for multi-class metrics or ``'Binary'`` (default) for
        binary metrics.

    Returns
    -------
    dict
        Metric name -> value. Keys are ``Accuracy``, ``F1 Score``,
        ``Precision``, ``Recall`` for ``'Multi'``; ``AUC``, ``AreaUnderPR``,
        ``Sensitivity``, ``Specificity``, ``Precision``, ``Recall``,
        ``F1-Score``, ``Accuracy`` for ``'Binary'``.

    Raises
    ------
    ValueError
        If ``process_label`` is neither ``'Multi'`` nor ``'Binary'``.
    """
    predictions = model.transform(val_data)

    if process_label == 'Multi':
        return _evaluate_multiclass(predictions, model_name)
    if process_label == 'Binary':
        return _evaluate_binary(predictions, model_name)
    raise ValueError(
        f"process_label must be 'Multi' or 'Binary', got {process_label!r}"
    )


# Sample usage
# # Evaluate models using the evaluate_model function
# evaluation_results_default = evaluate_model(model_default, test, 'Default Model',process_label)  # Use the 'test' dataset
# evaluation_results_best_tuned = evaluate_model(best_tuned_model, FT_test, 'Best-Tuned Model', process_label)  # Use the 'FT_test' dataset
# evaluation_results_weighted = evaluate_model(weighted_lr_model, FT_test, 'Weighted Model', process_label)  # Use the 'FT_test' dataset

# # Print evaluation results for all models side by side
# print("Evaluation Results:")
# print(f"{'Metric':<20}{'Default Model':<20}{'Best-Tuned Model':<20}{'Weighted Model':<20}")
# print("=" * 80)

# for metric in evaluation_results_default.keys():
#     default_value = evaluation_results_default[metric]
#     best_tuned_value = evaluation_results_best_tuned[metric]
#     weighted_value = evaluation_results_weighted[metric]
#     print(f"{metric:<20}{default_value:<20.6f}{best_tuned_value:<20.6f}{weighted_value:<20.6f}")


## Data Exploration

### Explore train_df 

In [None]:
# Count the missing values in each column of the training frame.
train_df.isna().sum()

#### Set `process_label` to either 'Binary' or 'Multi' according to the classification use case

In [None]:
process_label = 'Binary'  # set to 'Multi' for attack-category classification

# Fail early with a clear message instead of a NameError further down.
if process_label not in ('Binary', 'Multi'):
    raise ValueError(f"process_label must be 'Binary' or 'Multi', got {process_label!r}")

# Both modes use the same settings apart from the target selection, so the
# mode is passed straight through.
pipeline_train = PreProcessPipeline(label_encode=True, process_label=process_label)
pipeline_test = PreProcessPipeline(label_encode=True, process_label=process_label)

# NOTE(review): each transform() call below derives its integer category codes
# from the frame it is given, so the same category may be encoded differently
# in train and test - verify that the two encodings line up.
train_df = pipeline_train.transform(train_df)
test_df = pipeline_test.transform(test_df)


## Data Visualisation

In [None]:
# Set up the figure with subplots
# Side-by-side class-distribution plots for the train and test frames.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# (frame, palette, title, x-axis label) for each panel, left to right.
panels = (
    (train_df, "Set1", "Label Plot for train df", "Column 1"),
    (test_df, "Set2", "Label Plot for test df", "Column 2"),
)
for ax, (frame, palette, title, xlabel) in zip(axes, panels):
    sns.countplot(data=frame, x='label', palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")

# Keep the two panels from overlapping, then render.
plt.tight_layout()
plt.show()

In [None]:
# Enlarge the default figure so the per-column histograms stay legible.
rcParams.update({"figure.figsize": (20, 22)})
train_df.hist()
plt.grid()

#### Check Correlation between the features

In [None]:
# Pairwise correlation of every column in the preprocessed training frame.
corr_matrix = train_df.corr()

# Correlation of each column with the target ('label' holds attack_cat in
# multi-class mode).
target_corr = corr_matrix.loc[:, 'label']

# Heatmap on a fixed [-1, 1] colour scale.
plt.figure(figsize=(12, 12))
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=False,
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Rank the features by the strength of their correlation with 'label'.

# Absolute correlation of every column with the target
target_corr_abs = corr_matrix['label'].abs()

# Strongest correlation first
sorted_corr = target_corr_abs.sort_values(ascending=False)

# Print the ranking; iterate the SORTED series so the output really is ordered.
for attribute, correlation in sorted_corr.items():
    print(f"{attribute}: {correlation:.4f}")

# Prepare the data

### Convert Pandas DF to Spark DF

In [None]:
# Hand the preprocessed pandas frames over to Spark for modelling.
sparktrain_df, sparktest_df = (
    spark.createDataFrame(frame) for frame in (train_df, test_df)
)

### Feature enabler

##### for Default

In [None]:
# Standardised features for the default model.
pipeline = SparkDFPipeline(standardize=True)
train, test = pipeline.transform(train_df=sparktrain_df, test_df=sparktest_df)

##### for Fine Tuning

In [None]:
# Standardised features for the fine-tuned models.
pipeline = SparkDFPipeline(standardize=True)
FT_train, FT_test = pipeline.transform(train_df=sparktrain_df, test_df=sparktest_df)

# Model selection and training
- Select machine learning models (Logistic Regression , Decision Tree, Random Forest, Multilayer perceptron).
- Split the data into training and validation sets.
- Train the selected models using the training data.

## Logistic Regression model

### Default Model

In [None]:
# for default model
lr_default = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
pipeline_default = Pipeline(stages=[lr_default])
model_default = pipeline_default.fit(train)  # Use the 'train' dataset

###  Fine-Tuning 

#### Best Tuned Model

In [None]:
# Single-stage pipeline around the baseline estimator
pipeline = Pipeline(stages=[lr_default])

# Hyper-parameter grid: 3 iteration budgets x 2 regularisation strengths
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr_default.maxIter, [10, 20, 30])
    .addGrid(lr_default.regParam, [0.1, 0.01])
    .build()
)

# Pick the selection metric that matches the classification mode
if process_label == 'Multi':
    evaluator = MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="label", predictionCol="prediction"
    )
elif process_label == 'Binary':
    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol="rawPrediction", labelCol="label"
    )

# 3-fold cross-validation over the grid
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
)

# Keep the best model found on the training data
best_tuned_model = cv.fit(FT_train).bestModel

#### Class-weighted model

In [None]:
# Fit the class-weighted logistic regression on the fine-tuning training set.
weighted_lr_model = train_weighted_logistic_regression(FT_train)

### Evaluation

In [None]:
# Evaluate models using the evaluate_model function
evaluation_results_default = evaluate_model(model_default, test, 'Default Model',process_label)  # Use the 'test' dataset
evaluation_results_best_tuned = evaluate_model(best_tuned_model, FT_test, 'Best-Tuned Model', process_label)  # Use the 'FT_test' dataset
evaluation_results_weighted = evaluate_model(weighted_lr_model, FT_test, 'Weighted Model', process_label)  # Use the 'FT_test' dataset

# Print evaluation results for all models side by side
print("Evaluation Results:")
print(f"{'Metric':<20}{'Default Model':<20}{'Best-Tuned Model':<20}{'Weighted Model':<20}")
print("=" * 80)

for metric in evaluation_results_default.keys():
    default_value = evaluation_results_default[metric]
    best_tuned_value = evaluation_results_best_tuned[metric]
    weighted_value = evaluation_results_weighted[metric]
    print(f"{metric:<20}{default_value:<20.6f}{best_tuned_value:<20.6f}{weighted_value:<20.6f}")


## Decision Tree Model

### Default Model

###  Fine-Tuning 

### Evaluation

## Random Forest Model

### Default Model

In [None]:
# Vectorise without standardisation. SparkDFPipeline works on Spark
# DataFrames (it calls .repartition), so pass the Spark copies rather than
# the pandas frames train_df / test_df.
pipeline = SparkDFPipeline(standardize=False)
train, test = pipeline.transform(sparktrain_df, sparktest_df)

# Train a RandomForest model with default hyper-parameters
default_rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)

# Create a pipeline
default_pipeline = Pipeline(stages=[default_rf])

# Train the pipeline
default_model = default_pipeline.fit(train)

# Make predictions on the test data
default_predictions = default_model.transform(test)

# Create an evaluator for accuracy
default_accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
default_accuracy = default_accuracy_evaluator.evaluate(default_predictions)
print("Accuracy = %g" % default_accuracy)

# Create an evaluator for f1 score
default_f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
default_f1_score = default_f1_evaluator.evaluate(default_predictions)
print("F1 Score = %g" % default_f1_score)

# Create an evaluator for (weighted) precision
default_precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
default_precision = default_precision_evaluator.evaluate(default_predictions)
print("Precision = %g" % default_precision)

###  Fine-Tuning 

In [None]:
# Standardised features for tuning. SparkDFPipeline works on Spark DataFrames
# (it calls .repartition), so pass the Spark copies rather than the pandas
# frames train_df / test_df.
pipeline = SparkDFPipeline(standardize=True)
FT_train, FT_test = pipeline.transform(sparktrain_df, sparktest_df)

# Estimator to tune and the metric used to select the best candidate
tuned_rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
tuned_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')


# Define the parameter grid
param_grid = ParamGridBuilder() \
            .addGrid(tuned_rf.numTrees, [10,20,30]) \
            .addGrid(tuned_rf.maxDepth, [5,6,8]) \
            .addGrid(tuned_rf.impurity, ['gini']) \
            .build()

# Create a CrossValidator instance
crossval = CrossValidator(estimator=tuned_rf,
                          estimatorParamMaps=param_grid,
                          evaluator=tuned_evaluator,
                          numFolds=3)  # Number of cross-validation folds

# Run cross-validation, and choose the best set of parameters
cv_model = crossval.fit(FT_train)
best_model = cv_model.bestModel

# Get the best parameters
best_numTrees = best_model.getOrDefault('numTrees')
best_maxDepth = best_model.getOrDefault('maxDepth')
best_impurity = best_model.getOrDefault('impurity')

# Print the best parameters
print("Best numTrees:", best_numTrees)
print("Best maxDepth:", best_maxDepth)
print("Best impurity:", best_impurity)

# Make predictions on the test data
best_predictions = best_model.transform(FT_test)

# Create an evaluator for accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
best_accuracy = accuracy_evaluator.evaluate(best_predictions)
print("Accuracy = %g" % best_accuracy)

# Create an evaluator for f1 score
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
best_f1_score = f1_evaluator.evaluate(best_predictions)
print("F1 Score = %g" % best_f1_score)

# Create an evaluator for (weighted) precision
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
best_precision = precision_evaluator.evaluate(best_predictions)
print("Precision = %g" % best_precision)

### Evaluation

In [None]:
# Evaluate models using the evaluate_model function
evaluation_results_default = evaluate_model(default_model, test, 'Default Model')  # Use the 'test' dataset

# Evaluate models using the evaluate_model function
evaluation_results_tuned = evaluate_model(best_model, FT_test, 'Fine-Tuned Model')  # Use the 'test' dataset

# Print evaluation results for both models side by side

print("Evaluation Results:")
print(f"{'Metric':<20}{'Default Model':<20}{'Fine-Tuned Model':<20}")
print("=" * 60)

for metric in evaluation_results_default.keys():
    default_value = evaluation_results_default[metric]
    tuned_value = evaluation_results_tuned[metric]
    print(f"{metric:<20}{default_value:<20}{tuned_value:<20}")

## Multilayer perceptron Model

### Default Model

###  Fine-Tuning 

### Evaluation

# Final comparison between Models