# Import the Required Packages

In [None]:
# Standard Library and Utility Imports (tqdm is a third-party package)
import os
import re
from tqdm import tqdm

# Numerical and Data Manipulation Libraries
import numpy as np
import pandas as pd

# Visualization Libraries
import matplotlib.pyplot as plt

# PDF Handling
import PyPDF2

# Scikit-Learn for Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Machine Learning Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report, confusion_matrix

# Sentence Transformers for Sentence Embeddings
from sentence_transformers import SentenceTransformer

# Data Preparation and Feature Extraction for Text Classification

In [None]:
# Location of the extracted dataset on disk.
DATA_CSV_PATH = '/Users/l/Desktop/Edinburgh/Dissertation/Session2_SB/Code/data_extracted.csv'

# Read the CSV, keeping only the 'decision' (label) and 'background' (text) columns.
# The callable form of `usecols` keeps whichever of these two names are present.
data_df = pd.read_csv(DATA_CSV_PATH,
                      usecols=lambda column: column in ['decision', 'background'])

# Drop rows with a missing label or missing text. Without this, `astype(str)` below
# would turn a missing background into the literal string 'nan', which would then be
# vectorized and embedded as if it were a real document, and a missing decision would
# be handed to the label encoder as if it were a valid label.
data_df = data_df.dropna(subset=['decision', 'background'])

# Load SentenceTransformer model for generating sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract the 'background' column as feature set X and 'decision' column as target labels y
X = data_df['background'].astype(str)
y = data_df['decision']

# Encode target labels to numerical values using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets (80/20) with stratified sampling,
# so both splits keep the label proportions of the full dataset. The fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# TF-IDF vectorization: the vocabulary and IDF weights are fitted on the training
# text only and then applied to the test text, so no test information leaks into
# the features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Generate sentence embeddings for the training and testing data using SentenceTransformer
X_train_bert = np.array(model.encode(X_train.tolist()))
X_test_bert = np.array(model.encode(X_test.tolist()))

# Save the TF-IDF vectorized data to .npy files.
# NOTE: `.toarray()` densifies the TF-IDF matrices so they can be written with
# np.save; this can be memory-hungry for a large vocabulary.
np.save('X_train_tfidf.npy', X_train_tfidf.toarray())
np.save('X_test_tfidf.npy', X_test_tfidf.toarray())

# Save the Sentence-BERT embeddings to .npy files
np.save('X_train_bert.npy', X_train_bert)
np.save('X_test_bert.npy', X_test_bert)

# Save the encoded labels to .npy files
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

# Model Training and Evaluation Using Grid Search on TF-IDF and SBERT Features

In [None]:
# Load encoded data
# Restore the TF-IDF and Sentence-BERT (SBERT) feature matrices from their .npy files,
# in the order: TF-IDF train, TF-IDF test, SBERT train, SBERT test.
X_train_tfidf, X_test_tfidf, X_train_bert, X_test_bert = (
    np.load(path)
    for path in (
        'X_train_tfidf.npy',
        'X_test_tfidf.npy',
        'X_train_bert.npy',
        'X_test_bert.npy',
    )
)

# Restore the encoded training and testing labels.
y_train, y_test = (np.load(path) for path in ('y_train.npy', 'y_test.npy'))

# Seed shared by every estimator below. Random forests, MLPs and the shuffling
# LogisticRegression solvers are stochastic; without a fixed seed the selected
# hyperparameters and all reported metrics change from run to run.
RANDOM_STATE = 42

# Define a dictionary of models for training
# Three machine learning models are chosen: Logistic Regression, Random Forest, and Neural Network (MLPClassifier).
# max_iter is raised to 10000 on the iterative models to give them room to converge.
models = {
    'LogisticRegression': LogisticRegression(max_iter=10000, random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'NeuralNetwork': MLPClassifier(max_iter=10000, random_state=RANDOM_STATE),
}

# Define parameters for grid search
# A dictionary specifying hyperparameters for each model to be optimized using grid search.
# Keys must match the keys of `models`; every combination of the listed values is tried.
params = {
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'saga']
    },
    'RandomForest': {
        'n_estimators': [10, 15, 20],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, None]
    },
    'NeuralNetwork': {
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'lbfgs']
    }
}

# Grid search and evaluation
# For every (feature representation, model) pair: tune hyperparameters with 5-fold
# grid search, evaluate the refitted best model on the train and test sets, record
# one result row, and plot the test-set ROC curve.
results = []

# Pair each feature representation's training matrix with its matching test matrix
# so the two can never get out of sync.
feature_sets = {
    'TFIDF': (X_train_tfidf, X_test_tfidf),
    'SBERT': (X_train_bert, X_test_bert),
}

# Loop through each feature vectorization method (TF-IDF and SBERT)
for vec_name, (X_train_vec, X_test_vec) in feature_sets.items():

    # Loop through each model. The loop variable is named `estimator` rather than
    # `model` so it does not overwrite the notebook-level `model` that holds the
    # SentenceTransformer.
    for model_name, estimator in models.items():
        # Grid search with 5-fold cross-validation. F1 selects the winning
        # hyperparameters (refit='f1'); accuracy is scored on the same folds so the
        # CV accuracy of the winner can be read from cv_results_ without refitting
        # the model five more times.
        grid_search = GridSearchCV(
            estimator,
            params.get(model_name, {}),
            cv=5,
            scoring={'f1': 'f1', 'accuracy': 'accuracy'},
            refit='f1',
        )
        grid_search.fit(X_train_vec, y_train)

        # Best model (refitted on the full training set) and its parameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Mean cross-validated accuracy of the selected hyperparameter combination
        cv_accuracy = grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]

        # Make predictions on both training and testing sets
        y_train_pred = best_model.predict(X_train_vec)
        y_test_pred = best_model.predict(X_test_vec)
        # Column 1 of predict_proba is used as the positive-class score
        # (assumes a binary problem, as the 'f1' scoring above also does).
        y_test_proba = best_model.predict_proba(X_test_vec)[:, 1]

        # Calculate performance metrics for both training and testing sets.
        # zero_division=0 everywhere: a model that never predicts the positive class
        # is reported with precision 0 rather than a misleading 1.0, and precision,
        # recall and F1 all follow the same convention.
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        precision_train = precision_score(y_train, y_train_pred, zero_division=0)
        recall_train = recall_score(y_train, y_train_pred, zero_division=0)
        precision_test = precision_score(y_test, y_test_pred, zero_division=0)
        recall_test = recall_score(y_test, y_test_pred, zero_division=0)
        f1_train = f1_score(y_train, y_train_pred, zero_division=0)
        f1_test = f1_score(y_test, y_test_pred, zero_division=0)
        roc_auc = roc_auc_score(y_test, y_test_proba)

        # Store the results in the list
        results.append({
            'Vectorizer': vec_name,
            'Model': model_name,
            'Best Params': best_params,
            'CV Accuracy': cv_accuracy,
            'Train Accuracy': train_accuracy,
            'Train Precision': precision_train,
            'Train Recall': recall_train,
            'Train F1 Score': f1_train,
            'Test Accuracy': test_accuracy,
            'Test Precision': precision_test,
            'Test Recall': recall_test,
            'Test F1 Score': f1_test,
            'ROC AUC': roc_auc
        })

        # Plot and display the ROC curve for the best model; the dashed diagonal is
        # the chance-level reference line.
        fpr, tpr, _ = roc_curve(y_test, y_test_proba)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'Receiver Operating Characteristic - {vec_name} + {model_name}')
        plt.legend(loc="lower right")
        plt.show()

# Convert results list into a DataFrame for easier analysis and visualization
results_df = pd.DataFrame(results)

# Display the results DataFrame (`display` is provided by the notebook environment)
display(results_df)