In [1]:
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import pandas as pd
import joblib

import preprocessor

### Evaluating the Original Jerbarnes Dataset

In [2]:
DATA_DIR = Path('./data')


print("--- Starting Evaluation on Original 'jerbarnes' Dataset Only ---")

try:
    # Load only the original 'jerbarnes' dataset
    jerbarnes_dataset_path = DATA_DIR/'jerbarnes_dataset_selective_lowercased_lemmatized.csv'
    df_jerbarnes = pd.read_csv(jerbarnes_dataset_path, header=None, names=['label', 'text', 'processed_text'])
    print(f"Original 'jerbarnes' dataset loaded successfully. Samples: {len(df_jerbarnes)}")

except FileNotFoundError as e:
    print(f"Error loading 'jerbarnes' data: {e}. Please ensure the file exists at {jerbarnes_dataset_path}")
    exit()

# Splitting the 'jerbarnes' dataset into training and testing sets (80/20 split)
X_old_train, X_old_test, y_old_train, y_old_test = train_test_split(
    df_jerbarnes['processed_text'], df_jerbarnes['label'],
    test_size=0.2, random_state=42, stratify=df_jerbarnes['label']
)

print(f"Jerbarnes Training set size: {len(X_old_train)} samples")
print(f"Jerbarnes Test set size: {len(X_old_test)} samples")


# Defining the SVM pipeline for hyperparameter tuning (using CountVectorizer)
svm_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()), # Placeholder for vectorizer, will be replaced by GridSearchCV
    ('classifier', SVC(probability=True, random_state=42)), # probability=True is needed for predict_proba
])

# Defining the hyperparameter grid for SVM
param_grid = [
    {
        'vectorizer': [CountVectorizer(min_df=2, lowercase=False, strip_accents=None, stop_words=None)],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'classifier__C': [0.1, 1, 10, 100], # Regularization parameter
        'classifier__kernel': ['linear', 'rbf'], # Specifies the kernel type
        'classifier__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1], # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    },
    {
        'vectorizer': [TfidfVectorizer(min_df=2, lowercase=False, strip_accents=None, stop_words=None)],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vectorizer__use_idf': [True, False],
        'vectorizer__norm': ['l1', 'l2'],
        'classifier__C': [0.1, 1, 10, 100], # Regularization parameter
        'classifier__kernel': ['linear', 'rbf'], # Specifies the kernel type
        'classifier__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1], # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    },
]

print("Starting GridSearchCV for SVM with various feature representations...")
grid_search_svm_old = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=1, n_jobs=-1, scoring='f1_weighted')
grid_search_svm_old.fit(X_old_train, y_old_train)

print("\nBest parameters found for SVM on 'jerbarnes' data:")
print(grid_search_svm_old.best_params_)
print("\nBest cross-validation F1-weighted score for SVM on 'jerbarnes' data:")
print(grid_search_svm_old.best_score_)

# Evaluating the best SVM model (optimized on jerbarnes) on the 'jerbarnes' test set
best_svm_old_model = grid_search_svm_old.best_estimator_
y_pred_old_svm = best_svm_old_model.predict(X_old_test)

print("\nClassification Report for Best SVM Model (Optimized & Tested on 'jerbarnes' only):")
print(classification_report(y_old_test, y_pred_old_svm))
print(f"Accuracy for Best SVM Model (Optimized & Tested on 'jerbarnes' only): {accuracy_score(y_old_test, y_pred_old_svm):.4f}")

print("\n--- End of Evaluation on Original 'jerbarnes' Dataset Only ---")

--- Starting Evaluation on Original 'jerbarnes' Dataset Only ---
Original 'jerbarnes' dataset loaded successfully. Samples: 851
Jerbarnes Training set size: 680 samples
Jerbarnes Test set size: 171 samples
Starting GridSearchCV for SVM with various feature representations...
Fitting 5 folds for each of 720 candidates, totalling 3600 fits

Best parameters found for SVM on 'jerbarnes' data:
{'classifier__C': 10, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf', 'vectorizer': TfidfVectorizer(lowercase=False, min_df=2), 'vectorizer__ngram_range': (1, 1), 'vectorizer__norm': 'l2', 'vectorizer__use_idf': True}

Best cross-validation F1-weighted score for SVM on 'jerbarnes' data:
0.734102027825346

Classification Report for Best SVM Model (Optimized & Tested on 'jerbarnes' only):
              precision    recall  f1-score   support

           0       0.78      0.93      0.85       117
           1       0.74      0.43      0.54        54

    accuracy                           0.77

In [3]:
# Saving the results for the final report
report_model_a_dict = classification_report(y_old_test, y_pred_old_svm, output_dict=True)
accuracy_model_a = accuracy_score(y_old_test, y_pred_old_svm)

### Adding Crowd-sourced data to training and re-evaluating on the old test set

In [4]:
print("\n--- Starting Evaluation with Crowd-sourced Data Added to Training ---")

try:
    # Loading the crowd-sourced dataset
    crowdsourced_dataset_path = DATA_DIR/'crowdsourced_dataset_selective_lowercased_lemmatized.csv'
    df_crowdsourced = pd.read_csv(crowdsourced_dataset_path, header=None, names=['label', 'text', 'processed_text'])
    print(f"Crowd-sourced dataset loaded successfully. Samples: {len(df_crowdsourced)}")

except FileNotFoundError as e:
    print(f"Error loading crowd-sourced data: {e}. Please ensure the file exists at {crowdsourced_dataset_path}")
    exit()

# Combining the jerbarnes training set with the entire crowd-sourced dataset
X_extended_train = pd.concat([X_old_train, df_crowdsourced['processed_text']], ignore_index=True)
y_extended_train = pd.concat([y_old_train, df_crowdsourced['label']], ignore_index=True)

print(f"Extended Training set size (Jerbarnes train + Crowd-sourced): {len(X_extended_train)} samples")

# Retraining the BEST SVM model (found from jerbarnes-only optimization) on the extended training set
print("\nRetraining the best SVM model (from jerbarnes optimization) on the EXTENDED training data...")
retrained_svm_extended = best_svm_old_model.fit(X_extended_train, y_extended_train)
print("Retraining complete.")

# Evaluating this newly retrained model on the SAME 'jerbarnes' test set
y_pred_extended_svm = retrained_svm_extended.predict(X_old_test)

print("\nClassification Report for SVM Model (Retrained on Extended Data, Tested on 'jerbarnes' only):")
print(classification_report(y_old_test, y_pred_extended_svm))
print(f"Accuracy for SVM Model (Retrained on Extended Data, Tested on 'jerbarnes' only): {accuracy_score(y_old_test, y_pred_extended_svm):.4f}")

print("\n--- End of Evaluation with Crowd-sourced Data Added to Training ---")


--- Starting Evaluation with Crowd-sourced Data Added to Training ---
Crowd-sourced dataset loaded successfully. Samples: 1594
Extended Training set size (Jerbarnes train + Crowd-sourced): 2274 samples

Retraining the best SVM model (from jerbarnes optimization) on the EXTENDED training data...
Retraining complete.

Classification Report for SVM Model (Retrained on Extended Data, Tested on 'jerbarnes' only):
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       117
           1       0.66      0.46      0.54        54

    accuracy                           0.75       171
   macro avg       0.72      0.68      0.69       171
weighted avg       0.74      0.75      0.74       171

Accuracy for SVM Model (Retrained on Extended Data, Tested on 'jerbarnes' only): 0.7544

--- End of Evaluation with Crowd-sourced Data Added to Training ---


In [5]:
report_model_b_dict = classification_report(y_old_test, y_pred_extended_svm, output_dict=True)
accuracy_model_b = accuracy_score(y_old_test, y_pred_extended_svm)

### Evaluation on the full combined dataset

In [None]:
print("\n--- Starting Evaluation on the Full Combined Dataset (Optimized & Tested on original jerbarnes test set) ---")

# Using the original jerbarnes test set for evaluation of Model C as well
X_test_for_C = X_old_test
y_test_for_C = y_old_test

print(f"Combined Training set size for Model C: {len(X_combined_train_for_C)} samples")
print(f"Test set size (original jerbarnes test set) for Model C: {len(X_test_for_C)} samples")


# Defining the SVM pipeline for hyperparameter tuning
svm_pipeline_combined_C = Pipeline([
    ('vectorizer', CountVectorizer()), # Placeholder for vectorizer
    ('classifier', SVC(probability=True, random_state=42)),
])

# Defining the hyperparameter grid for SVM with vectorizer optimization 
param_grid_combined_C = [
    {
        'vectorizer': [CountVectorizer(min_df=2, lowercase=False, strip_accents=None, stop_words=None)],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    },
    {
        'vectorizer': [TfidfVectorizer(min_df=2, lowercase=False, strip_accents=None, stop_words=None)],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vectorizer__use_idf': [True, False],
        'vectorizer__norm': ['l1', 'l2'],
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    },
]

print("\nStarting GridSearchCV for SVM on the FULL Combined Dataset (for Model C optimization)...")
grid_search_svm_combined_C = GridSearchCV(svm_pipeline_combined_C, param_grid_combined_C, cv=5, verbose=1, n_jobs=-1, scoring='f1_weighted')
grid_search_svm_combined_C.fit(X_extended_train, y_extended_train) # Fit on the full combined training data

print("\nBest parameters found for SVM on the FULL Combined Dataset (for Model C):")
print(grid_search_svm_combined_C.best_params_)
print("\nBest cross-validation F1-weighted score for SVM on the FULL Combined Dataset (for Model C):")
print(grid_search_svm_combined_C.best_score_)

# Evaluating the best SVM model (optimized on combined data) on the ORIGINAL JERBARNES test set
best_svm_combined_model_C = grid_search_svm_combined_C.best_estimator_
y_pred_combined_svm_C = best_svm_combined_model_C.predict(X_test_for_C) # Predict on X_old_test

print("\nClassification Report for Best SVM Model (Optimized on Combined Data, Tested on original 'jerbarnes' only):")
print(classification_report(y_test_for_C, y_pred_combined_svm_C))
print(f"Accuracy for Best SVM Model (Optimized on Combined Data, Tested on original 'jerbarnes' only): {accuracy_score(y_test_for_C, y_pred_combined_svm_C):.4f}")

print("\n--- End of Evaluation on the Full Combined Dataset (Optimized & Tested on original jerbarnes test set) ---")


--- Starting Evaluation on the Full Combined Dataset (Optimized & Tested on original jerbarnes test set) ---
Combined Training set size for Model C: 2445 samples
Test set size (original jerbarnes test set) for Model C: 171 samples

Starting GridSearchCV for SVM on the FULL Combined Dataset (for Model C optimization)...
Fitting 5 folds for each of 720 candidates, totalling 3600 fits

Best parameters found for SVM on the FULL Combined Dataset (for Model C):
{'classifier__C': 10, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf', 'vectorizer': TfidfVectorizer(lowercase=False, min_df=2), 'vectorizer__ngram_range': (1, 1), 'vectorizer__norm': 'l2', 'vectorizer__use_idf': True}

Best cross-validation F1-weighted score for SVM on the FULL Combined Dataset (for Model C):
0.7681495895075671

Classification Report for Best SVM Model (Optimized on Combined Data, Tested on original 'jerbarnes' only):
              precision    recall  f1-score   support

           0       0.78      0.89 

In [10]:
report_model_c_dict = classification_report(y_test_for_C, y_pred_combined_svm_C, output_dict=True)
accuracy_model_c = accuracy_score(y_test_for_C, y_pred_combined_svm_C)

### Printing the Metrics

In [11]:
import pandas as pd

def extract_metrics(report_dict, accuracy):
    metrics = {
        '0': {
            'precision': report_dict['0']['precision'],
            'recall': report_dict['0']['recall'],
            'f1-score': report_dict['0']['f1-score'],
        },
        '1': {
            'precision': report_dict['1']['precision'],
            'recall': report_dict['1']['recall'],
            'f1-score': report_dict['1']['f1-score'],
        },
        'macro avg': {
            'precision': report_dict['macro avg']['precision'],
            'recall': report_dict['macro avg']['recall'],
            'f1-score': report_dict['macro avg']['f1-score'],
        },
        'weighted avg': {
            'precision': report_dict['weighted avg']['precision'],
            'recall': report_dict['weighted avg']['recall'],
            'f1-score': report_dict['weighted avg']['f1-score'],
        },
        'Overall': {
            'Accuracy': accuracy
        }
    }
    return metrics

# Extract metrics for each model
metrics_model_a = extract_metrics(report_model_a_dict, accuracy_model_a)
metrics_model_b = extract_metrics(report_model_b_dict, accuracy_model_b)
metrics_model_c = extract_metrics(report_model_c_dict, accuracy_model_c)

# Preparing data for DataFrame
data = []
for class_label in ['0', '1', 'Overall', 'macro avg', 'weighted avg']:
    if class_label == 'Overall':
        row_accuracy = {
            'Class/Avg': class_label,
            'Metric': 'Accuracy',
            'Model A': metrics_model_a[class_label].get('Accuracy'),
            'Model B': metrics_model_b[class_label].get('Accuracy'),
            'Model C': metrics_model_c[class_label].get('Accuracy')
        }
        data.append(row_accuracy)
    else:
        for metric_name in ['precision', 'recall', 'f1-score']:
            row = {
                'Class/Avg': class_label,
                'Metric': metric_name,
                'Model A': metrics_model_a[class_label].get(metric_name),
                'Model B': metrics_model_b[class_label].get(metric_name),
                'Model C': metrics_model_c[class_label].get(metric_name)
            }
            data.append(row)

# Create the DataFrame
comparison_df = pd.DataFrame(data)

# Print the table (adjust formatting if needed)
print("\n--- SVM Model Performance Comparison ---")
print(comparison_df.to_string(index=False, float_format="%.2f"))


comparison_df.to_csv('svm_performance_comparison.csv', index=False)


--- SVM Model Performance Comparison ---
   Class/Avg    Metric  Model A  Model B  Model C
           0 precision     0.78     0.78     0.78
           0    recall     0.93     0.89     0.89
           0  f1-score     0.85     0.83     0.83
           1 precision     0.74     0.66     0.66
           1    recall     0.43     0.46     0.46
           1  f1-score     0.54     0.54     0.54
     Overall  Accuracy     0.77     0.75     0.75
   macro avg precision     0.76     0.72     0.72
   macro avg    recall     0.68     0.68     0.68
   macro avg  f1-score     0.69     0.69     0.69
weighted avg precision     0.77     0.74     0.74
weighted avg    recall     0.77     0.75     0.75
weighted avg  f1-score     0.75     0.74     0.74
