In [19]:
import warnings
warnings.filterwarnings("ignore")

# Step 1: Data Preparation and TF-IDF Vectorization


In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [21]:
# Load and preprocess the original dataset
df = pd.read_csv('data/100k_en_toots_labeled.csv')
df['content'] = df['content'].astype(str)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['y'], test_size=0.2, random_state=42)

# Prepare augmented training set
hate_df = pd.read_csv('data/hate_dataset.csv')
hate_df['content'] = hate_df['content'].astype(str)
X_train_augmented = pd.concat([X_train, hate_df['content']], ignore_index=True)
y_train_augmented = pd.concat([y_train, hate_df['y']], ignore_index=True)

# Apply TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(1,2), max_features=20000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_train_augmented_tfidf = tfidf_vectorizer.transform(X_train_augmented)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Apply RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train_tfidf, y_train)

# Apply RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_tfidf, y_train)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)


# Step 3: Model Training and Evaluation

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score
from joblib import dump, load
import os
import json

# Define scoring metrics
scoring = {
    'roc_auc': 'roc_auc',
    'balanced_accuracy': make_scorer(balanced_accuracy_score),
    'f1_score': make_scorer(f1_score, average='macro')
}

# Update model hyperparameters to include class weight adjustment
logistic_params = {'C': [0.1, 1, 10], 'class_weight': [None, 'balanced']}
random_forest_params = {'n_estimators': [10, 50, 100, 200], 'max_depth': [10, 20], 'class_weight': [None, 'balanced']}
xgboost_params = {'max_depth': [3, 6], 'n_estimators': [100, 200], 'scale_pos_weight': [1, y_train.value_counts()[0] / y_train.value_counts()[1]]}
svm_params = {'C': [0.1, 1], 'kernel': ['linear', 'rbf'], 'class_weight': [None, 'balanced']}
naive_bayes_params = {'alpha': [0.1, 1, 10]}  # No class_weight adjustment

models = {
    #'LogisticRegression': (LogisticRegression(), logistic_params),
    #'RandomForest': (RandomForestClassifier(), random_forest_params),
    #'XGBoost': (xgb.XGBClassifier(), xgboost_params),
    'SVM': (SVC(), svm_params),
    #'NaiveBayes': (MultinomialNB(), naive_bayes_params)
}

# Function to perform GridSearchCV with multiple metrics and checkpointing
def perform_grid_search(model_name, model, params, X_train, y_train, scoring, checkpoint_dir, variant_name):
    checkpoint_path = os.path.join(checkpoint_dir, f'{model_name}_{variant_name}_gridsearch.joblib')
    if os.path.exists(checkpoint_path):
        grid_search = load(checkpoint_path)
    else:
        grid_search = GridSearchCV(model, params, scoring=scoring, refit='f1_score', cv=5, verbose=1)
        grid_search.fit(X_train, y_train)
        dump(grid_search, checkpoint_path)
    return grid_search

# Prepare dataset variants
dataset_variants = {
    'ROS': (X_train_ros, y_train_ros),
    'RUS': (X_train_rus, y_train_rus),
    'SMOTE': (X_train_smote, y_train_smote),
    'ORIGINAL': (X_train_tfidf, y_train),
    'AUGMENTED': (X_train_augmented_tfidf, y_train_augmented)
}

# WE Include ORIGINAL and class weight penalization as variants
# For class weight penalization, use the original data but models will have class_weight parameter

# Directory for checkpoints
checkpoint_dir = 'checkpoints/'
os.makedirs(checkpoint_dir, exist_ok=True)

# Dictionary to store results
results = {}

In [37]:
from sklearn.base import BaseEstimator, ClassifierMixin

# Define the Baseline Classifier
class AlwaysZeroClassifier(BaseEstimator, ClassifierMixin):
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        return [0] * X.shape[0]

# Initialize and Evaluate the Baseline Classifier
baseline_classifier = AlwaysZeroClassifier()
baseline_classifier.fit(X_train_tfidf, y_train)  # Fitting to train data
y_pred_baseline = baseline_classifier.predict(X_test_tfidf)  # Predicting on test data

# Calculate metrics for the baseline classifier
baseline_scores = {
    'roc_auc': roc_auc_score(y_test, y_pred_baseline),
    'balanced_accuracy': balanced_accuracy_score(y_test, y_pred_baseline),
    'f1_score': f1_score(y_test, y_pred_baseline, average='macro')
}

# Add Baseline Results to the Main Results Dictionary
results['Baseline'] = baseline_scores
results

{'Baseline': {'roc_auc': 0.5,
  'balanced_accuracy': 0.5,
  'f1_score': 0.49630066612590024}}

In [38]:
# Perform grid search for each dataset variant and classifier
for variant_name, (X_train_var, y_train_var) in dataset_variants.items():
    variant_results = {}
    for model_name, (model, params) in models.items():
        print(f"Processing {model_name} on {variant_name} dataset...")
        grid_search = perform_grid_search(model_name, model, params, X_train_var, y_train_var, scoring, checkpoint_dir, variant_name)
        
        # Extract and log results
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_cv_scores = {metric: grid_search.cv_results_[f'mean_test_{metric}'][grid_search.best_index_] for metric in scoring}
        y_pred_test = best_model.predict(X_test_tfidf)
        test_scores = {
            'roc_auc': roc_auc_score(y_test, y_pred_test),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_pred_test),
            'f1_score': f1_score(y_test, y_pred_test, average='macro')
        }
        variant_results[model_name] = {
            'best_params': best_params,
            'best_cv_scores': best_cv_scores,
            'test_scores': test_scores
        }

    # Add variant results to main results
    results[variant_name] = variant_results

# Save results to a JSON file
with open('tfidf_results_svm.json', 'w') as file:
    json.dump(results, file)

Processing SVM on ROS dataset...
Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [35]:
import json

def pretty_print_json(file_path):
    try:
        # Load JSON data from the file
        with open(file_path, 'r') as file:
            data = json.load(file)

        # Pretty print the JSON data
        print(json.dumps(data, indent=4, sort_keys=True))
    except Exception as e:
        print(f"Error occurred: {e}")

pretty_print_json('tfidf_results.json')


{
    "AUGMENTED": {
        "LogisticRegression": {
            "best_cv_scores": {
                "balanced_accuracy": 0.8983493638146314,
                "f1_score": 0.8984381628643506,
                "roc_auc": 0.966046320027084
            },
            "best_params": {
                "C": 10,
                "class_weight": "balanced"
            },
            "test_scores": {
                "balanced_accuracy": 0.6807936038078872,
                "f1_score": 0.5752808220763801,
                "roc_auc": 0.6807936038078871
            }
        },
        "NaiveBayes": {
            "best_cv_scores": {
                "balanced_accuracy": 0.897920527358598,
                "f1_score": 0.8997244103386673,
                "roc_auc": 0.9692134668679241
            },
            "best_params": {
                "alpha": 0.1
            },
            "test_scores": {
                "balanced_accuracy": 0.665470010568506,
                "f1_score": 0.5724408134507842,
      