In [6]:
import dagshub

# Initialise the DagsHub integration for this repository with MLflow enabled.
dagshub.init(
    repo_owner="AMR-ITH",
    repo_name="yt-comment-analyzer",
    mlflow=True,
)

In [10]:
# Set or create an experiment.
# `mlflow` is imported here because this cell precedes the notebook's main
# import cell; without it a fresh-kernel "Run All" fails with NameError.
import mlflow

mlflow.set_experiment("Exp 2 - TfIdf Trigram max_features")

<Experiment: artifact_location='mlflow-artifacts:/39e80ae2c2b6468a949c287467410473', creation_time=1752223097484, experiment_id='8', last_update_time=1752223097484, lifecycle_stage='active', name='Exp 2 - TfIdf Trigram max_features', tags={}>

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
from pathlib import Path

In [None]:
# Raw-data directory: <parent of the current working directory>/data/raw
data_dir_path = Path.cwd().parent.joinpath("data", "raw")

In [11]:
# Location of the preprocessed comments CSV. Defaults to the original path;
# set the REDDIT_DATA_PATH environment variable to point at the file when it
# lives somewhere else.
data_path = Path(os.environ.get("REDDIT_DATA_PATH", "/content/reddit_preprocessing.csv"))

# Rows without a cleaned comment cannot be vectorized, so drop them up front.
df = pd.read_csv(data_path).dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [13]:
# 🧪 Function to run experiments with different vectorizers and max_features
def run_experiment_vectorizer_max_features(vectorizer_type, max_features):
    """Train and evaluate a RandomForest on trigram text features, logging to MLflow.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    performs a stratified 80/20 train/test split, vectorizes the text with
    1- to 3-grams, fits a ``RandomForestClassifier`` and logs tags, parameters,
    metrics and a confusion-matrix image to a new MLflow run.

    Args:
        vectorizer_type: ``"TF-IDF"`` (TfidfVectorizer) or ``"BoW"``
            (CountVectorizer).
        max_features: Vocabulary size cap passed to the vectorizer. ``None``
            (no cap) is accepted.

    Raises:
        ValueError: If ``vectorizer_type`` is not ``"TF-IDF"`` or ``"BoW"``.
        Exception: Any error raised inside the MLflow run is reported on
            stdout and re-raised unchanged.
    """
    ngram_range = (1, 3)  # Trigram setting
    n_estimators = 200
    max_depth = 15

    # Ensure no run leaked from an earlier cell is still active.
    if mlflow.active_run():
        mlflow.end_run()

    # Step 1: Choose vectorizer based on type
    if vectorizer_type == "TF-IDF":
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    elif vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    else:
        raise ValueError("vectorizer_type must be 'TF-IDF' or 'BoW'")

    # Step 2: Train-test split (stratified so class proportions match)
    text_train, text_test, y_train, y_test = train_test_split(
        df['clean_comment'], df['category'],
        test_size=0.2, random_state=42, stratify=df['category']
    )

    # Step 3: Vectorization -- fit on the training text only to avoid leakage
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    # Step 4: Model training with MLflow logging
    try:
        with mlflow.start_run():
            # Tags, parameters and metrics are each sent as a single batch to
            # keep the number of round trips to the tracking server low.
            mlflow.set_tags({
                "mlflow.runName": f"{vectorizer_type}_Trigrams_max_features_{max_features}",
                "experiment_type": "feature_engineering_max_features",
                "model_type": "RandomForestClassifier",
                "vectorizer_type": vectorizer_type,
                "description": f"RandomForest with {vectorizer_type} Trigrams, max_features={max_features}",
            })

            mlflow.log_params({
                "vectorizer_type": vectorizer_type,
                "ngram_range": ngram_range,
                "vectorizer_max_features": max_features,
                "n_estimators": n_estimators,
                "max_depth": max_depth,
            })

            # Initialize and train the model
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
            model.fit(X_train, y_train)

            # Step 5: Make predictions and collect metrics
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            metrics_to_log = {
                "accuracy": accuracy,
                # The vectorizer already enforces the max_features cap, so the
                # matrix width is the real feature count (also valid when
                # max_features is None).
                "feature_count": X_train.shape[1],
                "training_samples": X_train.shape[0],
                "test_samples": X_test.shape[0],
            }

            # Flatten the per-class / averaged entries of the report; scalar
            # entries (e.g. overall accuracy) are skipped, as accuracy is
            # already recorded above.
            classification_rep = classification_report(y_test, y_pred, output_dict=True)
            for label, scores in classification_rep.items():
                if isinstance(scores, dict):
                    for metric, value in scores.items():
                        metrics_to_log[f"{label}_{metric}"] = value

            mlflow.log_metrics(metrics_to_log)

            # Confusion matrix: same label ordering confusion_matrix would
            # pick by default, made explicit so the axes can show class labels.
            class_labels = sorted(set(y_test) | set(y_pred))
            conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                sns.heatmap(
                    conf_matrix, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_labels, yticklabels=class_labels, ax=ax,
                )
                ax.set_xlabel("Predicted")
                ax.set_ylabel("Actual")
                ax.set_title(f"Confusion Matrix: {vectorizer_type} Trigrams, max_features={max_features}")
                # log_figure uploads the figure directly, so no PNG is left
                # behind in the working directory.
                mlflow.log_figure(fig, f"confusion_matrix_{vectorizer_type}_{max_features}.png")
            finally:
                # Always release the figure, even if rendering/upload fails.
                plt.close(fig)

            print(f"✅ Completed: {vectorizer_type} with max_features={max_features}, Accuracy: {accuracy:.4f}")

    except Exception as e:
        # The `with` block has already ended the run (marked as failed), so no
        # explicit end_run() is needed here.
        print(f"❌ Error in {vectorizer_type} experiment with max_features={max_features}: {str(e)}")
        raise

In [14]:
def run_vectorizer_experiments(vectorizer_type, max_features_values):
    """Run one experiment per ``max_features`` value for a single vectorizer type.

    Prints a start banner, calls ``run_experiment_vectorizer_max_features``
    once for each entry of ``max_features_values`` (in order), then prints a
    completion banner. Returns ``None``.
    """
    start_banner = f"\n🚀 Starting {vectorizer_type} experiments with varying max_features..."
    done_banner = f"🎉 Completed all {vectorizer_type} experiments!"

    print(start_banner)
    for feature_cap in max_features_values:
        run_experiment_vectorizer_max_features(vectorizer_type, feature_cap)
    print(done_banner)



# Vocabulary sizes to sweep: 1000, 2000, ..., 10000
max_features_values = list(range(1000, 10001, 1000))

# Run the full sweep for TF-IDF first, then for BoW
for vectorizer_type in ("TF-IDF", "BoW"):
    run_vectorizer_experiments(vectorizer_type, max_features_values)

# Final cleanup: close any MLflow run that is still active
if mlflow.active_run():
    mlflow.end_run()

for closing_message in (
    "\n🎯 All experiments completed! Check MLflow for results.",
    "📈 Compare the performance curves to find optimal max_features value.",
):
    print(closing_message)


🚀 Starting TF-IDF experiments with varying max_features...
✅ Completed: TF-IDF with max_features=1000, Accuracy: 0.6618
🏃 View run TF-IDF_Trigrams_max_features_1000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/8/runs/11ece2949fa04a73a5b0c2db70223e4c
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/8
✅ Completed: TF-IDF with max_features=2000, Accuracy: 0.6592
🏃 View run TF-IDF_Trigrams_max_features_2000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/8/runs/20bfb323cbf449fb9097fac2cbd54f9c
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/8
✅ Completed: TF-IDF with max_features=3000, Accuracy: 0.6539
🏃 View run TF-IDF_Trigrams_max_features_3000 at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/8/runs/b2e962f81dad4e42a09912c052ac2356
🧪 View experiment at: https://dagshub.com/AMR-ITH/yt-comment-analyzer.mlflow/#/experiments/8
✅ Co