In [1]:
import multiprocessing

import pandas as pd
import numpy as np
import itertools
import importlib
import missingno as msno
import matplotlib.pyplot as plt
import warnings
import imblearn
import spacy
import re

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, roc_auc_score, precision_score
from sklearn.model_selection import learning_curve, validation_curve, train_test_split, KFold, StratifiedKFold, \
    cross_val_score, GridSearchCV, RandomizedSearchCV, cross_validate, RepeatedStratifiedKFold
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score, roc_curve, auc, precision_recall_curve

from sklearn.decomposition import TruncatedSVD

from scipy.stats import loguniform, beta, uniform

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC

from Project.utils.storage import youtube_db as db

importlib.reload(db)

trump_podcasts = [
    "xrFdHO7FH8w",
    "blqIZGXWUpU",
    "s11uIW7wi-E",
    "vC5cHjcgt5g",
    "G80iLTctFuY",
    "qCbfTN-caFI",
    "Ry1IjOft95c",
    "S7BTdUaNQM8",
    "1SsmPh8gCxU",
    "-dmwG54QsKc",
    "nwQil7tcImI",
    "G9lXnwuZ2qs",
    "hBMoPUAeLnY"
]

harris_podcasts = [
    "_KCRsjPCiCI",
    "bzThwqnQJDY",
    "7L4sts7I3xI",
    "pNbwMrBMGgE",
    "Vu5yD3fu6A8",
]

youtube_db = db.SQLiteYoutubeSaver(db_name='../db/youtube.db')



In [2]:
NEUTRAL = 0
REPUBLICAN = 1
DEMOCRATIC = 2

TRUMP = 0
HARRIS = 1

In [3]:
youtube_db.cursor.execute(f"""
SELECT video_id, content, gemini_label
FROM CommentAnalysis JOIN Comments on Comments.id = CommentAnalysis.id
WHERE gemini_label IS NOT NULL
""")

data = youtube_db.cursor.fetchall()

df = pd.DataFrame(data)
df.columns = ['video_id', 'content', 'label']
df['podcast_guest'] = ['Trump' if video_id in trump_podcasts else 'Harris' for video_id in df['video_id']]
df['rep_label'] = [1 if l == 'Republican' else 0 for l in df['label']]
df['dem_label'] = [1 if l == 'Democratic' else 0 for l in df['label']]
df.drop('video_id', axis=1, inplace=True)

df

Unnamed: 0,content,label,podcast_guest,rep_label,dem_label
0,Best Podcast of All-time? 🤔🔥,Republican,Trump,1,0
1,Definitely my favorite!,Republican,Trump,1,0
2,"No, best guest",Republican,Trump,1,0
3,Probably,Neutral,Trump,0,0
4,Under 30 seconds,Neutral,Trump,0,0
...,...,...,...,...,...
18973,@@leroyjetson2329Folks with Caribbean heritage...,Neutral,Harris,0,0
18974,I am only here for the comment section.,Neutral,Harris,0,0
18975,😅😂 this comment section is trouble!,Neutral,Harris,0,0
18976,@@HI-DEF100 I’ve been looking for a positive c...,Republican,Harris,1,0


In [6]:
X = df[['content', 'podcast_guest']]

y_rep = df['rep_label']
y_dem = df['dem_label']

In [7]:
X_train_rep, X_test_rep, y_train_rep, y_test_rep = train_test_split(
    X, y_rep, test_size=0.2, stratify=y_rep, random_state=42, shuffle=True
)

In [8]:
class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model='en_core_web_md'):
        self.model = model
        self.nlp = None

    def fit(self, X, y=None):  # Does nothing (only loads model)
        self.nlp = spacy.load(self.model)
        return self

    def transform(self, X):  # Transforms with nlp(content)
        return np.vstack([doc.vector for doc in self.nlp.pipe(X['content'], batch_size=64, n_process=3)])


preprocessor = ColumnTransformer(
    transformers=[
        ('spacy_vect', SpacyVectorTransformer(), ['content']),
        ('guest_ohe', OneHotEncoder(drop='first'), ['podcast_guest']),
    ]
)

def get_pipeline(): # Necessary to create two separate pipelines
    return IMBPipeline([
        ('preprocessor', preprocessor),
        ('sampler', None),
        ('dim_reduction', None),
        ('classifier', LogisticRegression())
    ])

In [27]:
scoring = {
    'f1': make_scorer(f1_score, zero_division=0),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'roc_auc': make_scorer(roc_auc_score)
}

sampler_configs = [
    {
        'sampler': [None],
    },
    # {
    #     'sampler': [SMOTE(random_state=42)],
    #     'sampler__sampling_strategy': [0.8, 1.0]
    # },
]

dim_reduction_configs = [
    {
        'dim_reduction': [None]
    },
    # {
    #     'dim_reduction': [PCA(random_state=42)],
    #     'dim_reduction__n_components': [0.9]
    # },
]

classifier_configs = [
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50],
        'classifier__max_depth': [12, 14],
        'classifier__min_samples_split': [15, 25],
        'classifier__min_samples_leaf': [30, 50],
        'classifier__max_features': [0.5, 'sqrt'],
        'classifier__class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 4}],
    },
    {
        'classifier': [LogisticRegression(solver='liblinear', max_iter=10000, random_state=42)],
        'classifier__C': [0.01, 0.1, 1.0],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 4}]
    }
]

all_configs = [
    dict(itertools.chain(*(e.items() for e in configuration)))
    for configuration in itertools.product(
        sampler_configs,
        dim_reduction_configs,
        classifier_configs
    )
]

print(f"Total pipeline combinations: {len(all_configs)}")

Total pipeline combinations: 2


In [None]:
baseline_pipeline = get_pipeline()
baseline_pipeline.set_params(
    sampler=None,
    dim_reduction=None,
    classifier=RandomForestClassifier(
        n_estimators=50,
        max_depth=8,
        min_samples_split=20,
        min_samples_leaf=10,
        class_weight='balanced',
        random_state=42
    )
)

baseline_scores = cross_val_score(
    baseline_pipeline, X_train_rep, y_train_rep,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='f1'
)
print(f"Baseline RF F1: {baseline_scores.mean():.3f} (+/- {baseline_scores.std() * 2:.3f})") # Result was Baseline RF F1: 0.548 (+/- 0.011)

In [28]:
rs_rep = RandomizedSearchCV(
    get_pipeline(),
    param_distributions=all_configs,
    n_iter=10,
    n_jobs=1,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring=scoring,
    error_score='raise',
    refit='precision',
    random_state=42,
    verbose=3,
    return_train_score=True
)

rs_rep.fit(X_train_rep, y_train_rep)

# nested_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#
# scores_rep = cross_validate(
#     rs_rep, X_train_rep, y_train_rep,
#     scoring=scoring,
#     cv=nested_cv,
#     return_estimator=True,
#     n_jobs=1,
#     verbose=3
# )

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END classifier=LogisticRegression(max_iter=10000, random_state=42, solver='liblinear'), classifier__C=0.1, classifier__class_weight=balanced, classifier__penalty=l1, dim_reduction=None, sampler=None; f1: (train=0.541, test=0.532) precision: (train=0.445, test=0.439) recall: (train=0.689, test=0.673) roc_auc: (train=0.645, test=0.637) total time= 1.6min
[CV 2/3] END classifier=LogisticRegression(max_iter=10000, random_state=42, solver='liblinear'), classifier__C=0.1, classifier__class_weight=balanced, classifier__penalty=l1, dim_reduction=None, sampler=None; f1: (train=0.544, test=0.543) precision: (train=0.444, test=0.443) recall: (train=0.700, test=0.700) roc_auc: (train=0.646, test=0.645) total time= 1.5min
[CV 3/3] END classifier=LogisticRegression(max_iter=10000, random_state=42, solver='liblinear'), classifier__C=0.1, classifier__class_weight=balanced, classifier__penalty=l1, dim_reduction=None, sampler=None; f1

In [None]:
best_estimators = scores_rep['estimator']

# Evaluate on validation/test set for threshold optimization
all_proba = []
all_y_true = []

print("Performance Summary:")
print("=" * 50)

for i, estimator in enumerate(best_estimators):
    print(f"\nFold {i+1}:")
    print(f"CV Score: {scores_rep['test_score'][i]:.4f}")

    # Get predictions on test set (or use validation approach)
    y_proba = estimator.predict_proba(X_test_rep)[:, 1]  # probabilities for positive class
    y_pred_default = estimator.predict(X_test_rep)

    all_proba.extend(y_proba)
    all_y_true.extend(y_test_rep)

    # Default threshold performance
    default_precision = precision_score(y_test_rep, y_pred_default)
    default_recall = recall_score(y_test_rep, y_pred_default)
    default_f1 = f1_score(y_test_rep, y_pred_default)

    print(f"Default threshold (0.5) - Precision: {default_precision:.4f}, Recall: {default_recall:.4f}, F1: {default_f1:.4f}")

# Convert to arrays for analysis
all_proba = np.array(all_proba)
all_y_true = np.array(all_y_true)

print(f"\nOverall CV Score: {scores_rep['test_score'].mean():.4f} ± {scores_rep['test_score'].std():.4f}")

# Find optimal threshold
print("\n" + "="*50)
print("THRESHOLD OPTIMIZATION")
print("="*50)

# Method 1: Optimize for F1 score
thresholds = np.arange(0.1, 0.9, 0.05)
f1_scores = []
precision_scores = []
recall_scores = []

for threshold in thresholds:
    y_pred_thresh = (all_proba >= threshold).astype(int)
    f1 = f1_score(all_y_true, y_pred_thresh)
    precision = precision_score(all_y_true, y_pred_thresh)
    recall = recall_score(all_y_true, y_pred_thresh)

    f1_scores.append(f1)
    precision_scores.append(precision)
    recall_scores.append(recall)

# Best threshold for F1
best_f1_idx = np.argmax(f1_scores)
best_f1_threshold = thresholds[best_f1_idx]
best_f1_score = f1_scores[best_f1_idx]

print(f"Best F1 Threshold: {best_f1_threshold:.3f}")
print(f"Best F1 Score: {best_f1_score:.4f}")
print(f"Precision at best F1: {precision_scores[best_f1_idx]:.4f}")
print(f"Recall at best F1: {recall_scores[best_f1_idx]:.4f}")

# Method 2: ROC curve analysis
fpr, tpr, roc_thresholds = roc_curve(all_y_true, all_proba)
roc_auc = auc(fpr, tpr)

# Find threshold closest to top-left corner (Youden's J statistic)
j_scores = tpr - fpr
best_j_idx = np.argmax(j_scores)
best_roc_threshold = roc_thresholds[best_j_idx]

print(f"\nBest ROC Threshold (Youden's J): {best_roc_threshold:.3f}")

y_pred_roc = (all_proba >= best_roc_threshold).astype(int)
roc_precision = precision_score(all_y_true, y_pred_roc)
roc_recall = recall_score(all_y_true, y_pred_roc)
roc_f1 = f1_score(all_y_true, y_pred_roc)

print(f"ROC Threshold Performance - Precision: {roc_precision:.4f}, Recall: {roc_recall:.4f}, F1: {roc_f1:.4f}")

# Plotting
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# ROC Curve
ax1.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
ax1.scatter(fpr[best_j_idx], tpr[best_j_idx], color='red', s=100, label=f'Best threshold = {best_roc_threshold:.3f}')
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.05])
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC Curve')
ax1.legend(loc="lower right")
ax1.grid(True)

# Precision-Recall Curve
precision_pr, recall_pr, pr_thresholds = precision_recall_curve(all_y_true, all_proba)
pr_auc = auc(recall_pr, precision_pr)

ax2.plot(recall_pr, precision_pr, color='blue', lw=2, label=f'PR curve (AUC = {pr_auc:.3f})')
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
ax2.set_title('Precision-Recall Curve')
ax2.legend()
ax2.grid(True)

# Threshold vs Metrics
ax3.plot(thresholds, f1_scores, 'g-', label='F1 Score', linewidth=2)
ax3.plot(thresholds, precision_scores, 'b-', label='Precision', linewidth=2)
ax3.plot(thresholds, recall_scores, 'r-', label='Recall', linewidth=2)
ax3.axvline(best_f1_threshold, color='black', linestyle='--', alpha=0.7, label=f'Best F1 threshold = {best_f1_threshold:.3f}')
ax3.set_xlabel('Threshold')
ax3.set_ylabel('Score')
ax3.set_title('Metrics vs Threshold')
ax3.legend()
ax3.grid(True)

# Probability Distribution
ax4.hist(all_proba[all_y_true == 0], bins=30, alpha=0.7, label='Negative Class', color='red')
ax4.hist(all_proba[all_y_true == 1], bins=30, alpha=0.7, label='Positive Class', color='blue')
ax4.axvline(best_f1_threshold, color='black', linestyle='--', label=f'Best F1 threshold = {best_f1_threshold:.3f}')
ax4.axvline(0.5, color='gray', linestyle=':', label='Default threshold = 0.5')
ax4.set_xlabel('Predicted Probability')
ax4.set_ylabel('Frequency')
ax4.set_title('Probability Distribution by Class')
ax4.legend()
ax4.grid(True)

plt.tight_layout()
plt.show()

print(f"\nROC AUC: {roc_auc:.4f}")
print(f"PR AUC: {pr_auc:.4f}")

# Final recommendation
print("\n" + "="*50)
print("FINAL RECOMMENDATION")
print("="*50)
print(f"Recommended threshold: {best_f1_threshold:.3f} (optimized for F1-score)")
print(f"Expected performance: Precision={precision_scores[best_f1_idx]:.4f}, Recall={recall_scores[best_f1_idx]:.4f}, F1={best_f1_score:.4f}")

In [None]:
# Training with Best Estimator

# Plotting the Learning Curve??

# Do the same for other model
