In [16]:
import os
import tempfile
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [17]:
# Parent of the current working directory.
# NOTE(review): despite the name this is not the cwd itself, and no visible cell
# reads it (the CSV is loaded from an absolute path instead) — confirm it is still needed.
cwd_dir = Path.cwd().parent


In [18]:
# Load the preprocessed Reddit comments, discarding rows whose cleaned text is missing.
df = (
    pd.read_csv("/content/reddit_preprocessing.csv")
    .dropna(subset=["clean_comment"])
)

In [19]:
# (rows, columns) of the frame after rows with a missing clean_comment were dropped.
df.shape

(36662, 2)

In [20]:
# Initialise the DagsHub client for this repository with MLflow integration
# enabled (mlflow=True); the run URLs printed by the experiment cell point at
# this repository's `.mlflow` endpoint.
# NOTE(review): this import lives outside the notebook's top import cell.
import dagshub
dagshub.init(repo_owner='AMR-ITH', repo_name='yt-comment-analyzer', mlflow=True)


In [21]:
# Select the MLflow experiment that subsequent runs are logged under
# (set_experiment creates it if it does not exist yet).
# NOTE(review): the name mentions only BoW and TF-IDF, but the experiment cell
# logs the Word2Vec runs under this experiment as well.
mlflow.set_experiment("Exp1 - BoW vs TfIdf")

<Experiment: artifact_location='mlflow-artifacts:/2c14398098984bd99c77f12615b80629', creation_time=1751550497001, experiment_id='2', last_update_time=1751550497001, lifecycle_stage='active', name='Exp1 - BoW vs TfIdf', tags={}>

In [33]:
# 🧠 Helper function to get averaged word embeddings per document
def get_avg_wordvec(doc, model, vector_size):
    """Return the element-wise mean embedding of the known tokens in ``doc``.

    ``doc`` is split on whitespace; every token found in ``model.wv`` contributes
    its vector to the average. When no token is known (including an empty
    document) a zero vector of length ``vector_size`` is returned instead.
    """
    keyed_vectors = model.wv
    # Collect the vectors in one pass: membership test and lookup per token.
    token_vectors = [keyed_vectors[token] for token in doc.split() if token in keyed_vectors]
    if len(token_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(token_vectors, axis=0)

# 🔁 Main experiment runner
def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name,
                   w2v_params=None, n_estimators=200, max_depth=15):
    """Train a RandomForest on one text representation and log the run to MLflow.

    The notebook-level ``df`` is split 80/20 (stratified on ``category``,
    ``random_state=42``). ``clean_comment`` is converted to features with the
    requested vectorizer, a ``RandomForestClassifier`` is fitted on the training
    part, and parameters, accuracy, the per-label classification-report metrics
    and a confusion-matrix image are logged to a fresh MLflow run.

    Args:
        vectorizer_type: ``"BoW"``, ``"TF-IDF"`` or ``"Word2Vec"``.
        ngram_range: n-gram range passed to the BoW / TF-IDF vectorizer
            (not used for Word2Vec).
        vectorizer_max_features: ``max_features`` for the BoW / TF-IDF
            vectorizer (not used for Word2Vec).
        vectorizer_name: label used in the run name and the plot title.
        w2v_params: optional dict overriding the Word2Vec defaults below.
        n_estimators: number of trees in the forest (default 200).
        max_depth: maximum tree depth (default 15).

    Raises:
        ValueError: if ``vectorizer_type`` is not one of the supported values.
        Exception: any error during training/logging is printed and re-raised.
    """
    # Ensure no active runs before starting
    if mlflow.active_run():
        mlflow.end_run()

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
    )

    # Effective Word2Vec settings; stays None for the sparse vectorizers.
    w2v_settings = None

    if vectorizer_type in ("BoW", "TF-IDF"):
        vectorizer_cls = CountVectorizer if vectorizer_type == "BoW" else TfidfVectorizer
        vectorizer = vectorizer_cls(ngram_range=ngram_range, max_features=vectorizer_max_features)
        # Fit the vocabulary on the training split only, then reuse it for the test split.
        X_train = vectorizer.fit_transform(X_train_raw)
        X_test = vectorizer.transform(X_test_raw)

    elif vectorizer_type == "Word2Vec":
        tokenized_train = [text.split() for text in X_train_raw]

        # Defaults, overridden by any user-supplied values.
        w2v_settings = {
            "vector_size": 100,
            "window": 5,
            "min_count": 1,
            "workers": 4,
            "sg": 0,
        }
        if w2v_params:
            w2v_settings.update(w2v_params)

        # NOTE(review): Word2Vec is expected to already be in the kernel namespace
        # (presumably gensim.models.Word2Vec) — it is not imported in the visible cells.
        # NOTE(review): gensim documents that a seed alone does not give fully
        # reproducible training when workers > 1 — confirm if exact repeatability matters.
        model_w2v = Word2Vec(sentences=tokenized_train, seed=42, **w2v_settings)

        embedding_dim = w2v_settings["vector_size"]
        X_train = np.array([get_avg_wordvec(text, model_w2v, embedding_dim) for text in X_train_raw])
        X_test = np.array([get_avg_wordvec(text, model_w2v, embedding_dim) for text in X_test_raw])

    else:
        raise ValueError("Unsupported vectorizer_type")

    # Word2Vec runs have no n-gram range, so it is left out of their run name.
    if vectorizer_type == "Word2Vec":
        run_name = f"{vectorizer_name}_RandomForest"
    else:
        run_name = f"{vectorizer_name}_{ngram_range}_RandomForest"

    # 🎯 Model Training & Logging
    try:
        with mlflow.start_run():
            mlflow.set_tag("mlflow.runName", run_name)
            mlflow.set_tag("experiment_type", "feature_engineering")
            mlflow.set_tag("model_type", "RandomForestClassifier")

            # General parameters
            mlflow.log_param("vectorizer_type", vectorizer_type)
            if vectorizer_type != "Word2Vec":
                mlflow.log_param("ngram_range", ngram_range)
                mlflow.log_param("vectorizer_max_features", vectorizer_max_features)

            # Always log the effective Word2Vec settings, including when the
            # caller relied on the defaults (previously those runs logged none).
            if w2v_settings is not None:
                for key, val in w2v_settings.items():
                    mlflow.log_param(f"w2v_{key}", val)

            # Model hyperparameters
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_param("max_depth", max_depth)

            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            mlflow.log_metric("accuracy", accuracy)

            # Only the dict-valued report entries (per label and averages) are
            # flattened into metrics; the scalar "accuracy" entry is logged above.
            classification_rep = classification_report(y_test, y_pred, output_dict=True)
            for label, metrics in classification_rep.items():
                if isinstance(metrics, dict):
                    for metric, value in metrics.items():
                        mlflow.log_metric(f"{label}_{metric}", value)

            # Pin the label order so the heatmap ticks line up with the matrix rows/columns.
            class_labels = model.classes_
            conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                sns.heatmap(
                    conf_matrix, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_labels, yticklabels=class_labels, ax=ax,
                )
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Actual")
                ax.set_title(f"Confusion Matrix: {vectorizer_name}")
                # Write to a throw-away directory so no stale PNG is left in (or
                # overwritten in) the working directory; the artifact keeps its file name.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    image_path = Path(tmp_dir) / "confusion_matrix.png"
                    fig.savefig(image_path)
                    mlflow.log_artifact(str(image_path))
            finally:
                # Release the figure even if saving or uploading fails.
                plt.close(fig)

            print(f"✅ Completed experiment: {run_name}")

    except Exception as e:
        print(f"❌ Error in experiment {vectorizer_name}: {str(e)}")
        # Defensive: make sure no run is left open after a failure.
        if mlflow.active_run():
            mlflow.end_run()
        raise

In [34]:
# 🧪 BoW and TF-IDF Experiments
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = 5000

print("🚀 Starting BoW and TF-IDF experiments...")
# For each n-gram range, run the count-based variant first and TF-IDF second.
for ngram_range in ngram_ranges:
    for sparse_vectorizer in ("BoW", "TF-IDF"):
        run_experiment(
            sparse_vectorizer,
            ngram_range,
            max_features,
            vectorizer_name=sparse_vectorizer,
        )

print("🚀 Starting Word2Vec experiments...")
# 🧪 Word2Vec configurations to compare (small hand-picked grid)
w2v_param_grid = [
    dict(vector_size=100, window=5, min_count=1, sg=0),  # CBOW
    dict(vector_size=100, window=5, min_count=1, sg=1),  # Skip-gram
    dict(vector_size=50, window=3, min_count=2, sg=0),   # CBOW smaller
]

for w2v_params in w2v_param_grid:
    # The run label encodes vector size, window and training algorithm.
    name = f"W2V_vs{w2v_params['vector_size']}_w{w2v_params['window']}_sg{w2v_params['sg']}"
    run_experiment(
        "Word2Vec",
        None,
        None,
        vectorizer_name=name,
        w2v_params=w2v_params,
    )

print("🎉 All experiments completed!")

# Safety net: close any run that is still open at this point
if mlflow.active_run():
    mlflow.end_run()

🚀 Starting BoW and TF-IDF experiments...
✅ Completed experiment: BoW_(1, 1)_RandomForest
🏃 View run BoW_(1, 1)_RandomForest at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/2/runs/6b637184df5242e9b8c747f4b4445b6f
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/2
✅ Completed experiment: TF-IDF_(1, 1)_RandomForest
🏃 View run TF-IDF_(1, 1)_RandomForest at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/2/runs/d3d0e7f8176145df9aad12f3a3abd5a0
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/2
✅ Completed experiment: BoW_(1, 2)_RandomForest
🏃 View run BoW_(1, 2)_RandomForest at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/2/runs/a172b3cba1c440d3ac1872f04c3ff49f
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/2
✅ Completed experiment: TF-IDF_(1, 2)_RandomForest
🏃 View run TF-IDF_(1, 2)_RandomFore