# **Pipeline 1 - Data Ingestion**

In [None]:
# Cell 1: Install specific library versions
# Every package is pinned with `==` so a rerun installs the same versions.
# NOTE(review): pandas 1.5.x predates pandas' SQLAlchemy 2.0 support (added in
# pandas 2.0) — confirm the SQL cells work with sqlalchemy==2.0.8, otherwise
# pin sqlalchemy<2.0 or move to pandas>=2.0.
!pip install pandas==1.5.3 \
                numpy==1.24.2 \
                sqlalchemy==2.0.8 \
                scikit-learn==1.2.2 \
                matplotlib==3.7.1 \
                seaborn==0.12.2 \
                gradio==4.44.1


In [None]:
# Cell 2: Imports & styling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, accuracy_score

import gradio as gr
import seaborn as sns

# Use default matplotlib style for simplicity: apply the built-in 'default'
# style sheet once, up front, so later plotting cells share the same look.
plt.style.use('default')


In [None]:
# Cell 3: Data Ingestion & preview


In [None]:
# Cell 4: Missing-value proportions (simple bar chart)


In [None]:
# Cell 5: Save data to SQL


# **Pipeline 2 - Data Preprocessing**

In [None]:
# Cell 6: Data cleaning via SQL


In [None]:
# Cell 7: Views distribution before vs after cleaning (simple histograms)


# **Pipeline 3 - Data Exploration**

# **Pipeline 4 - Feature Engineering**

# **Pipeline 5 - Model Selection**

# **Pipeline 6 - Hyperparameter Tuning**

# **Pipeline 7 - Model Training and Evaluation**

# **Pipeline 8 - Model Deployment**

In [None]:
# Cell 16: Gradio Dashboard — Title Length EDA & Extended Top Channels
def show_dashboard():
    """Build the four outputs rendered by the Gradio dashboard.

    Takes no arguments; reads the notebook-level globals ``df_feat``,
    ``tuned_acc``, ``tuned_models``, ``X_test`` and ``features``.

    Returns:
        A 4-tuple ``(fig1, fig2, top10_channels, fig4)``: the title-length
        histogram figure, the tuned-accuracy bar chart figure, a DataFrame
        with the ten channels having the highest mean predicted
        probability, and the feature-importance bar chart figure.
    """
    # Build figures with the object-oriented API rather than plt.subplots():
    # pyplot keeps a global reference to every figure it creates, so each
    # dashboard refresh would otherwise leave three figures open forever.
    from matplotlib.figure import Figure

    # 1) Title length distribution by engagement class
    fig1 = Figure(figsize=(6, 3))
    ax1 = fig1.subplots()
    # Share one set of bin edges between both classes so the overlaid bars
    # cover identical intervals and can be compared directly.
    bin_edges = np.histogram_bin_edges(df_feat['TitleLength'].dropna(), bins=30)
    ax1.hist(df_feat.loc[df_feat.high_engagement == 0, 'TitleLength'],
             bins=bin_edges, alpha=0.5, label='Low Engagement')
    ax1.hist(df_feat.loc[df_feat.high_engagement == 1, 'TitleLength'],
             bins=bin_edges, alpha=0.5, label='High Engagement')
    ax1.set_xlabel("Title Length (chars)")
    ax1.set_ylabel("Count")
    ax1.set_title("Title Length by Engagement Class")
    ax1.legend()
    fig1.tight_layout()

    # 2) Tuned model accuracies
    fig2 = Figure(figsize=(6, 3))
    ax2 = fig2.subplots()
    names_list = list(tuned_acc)
    accs = [tuned_acc[n] for n in names_list]
    x = np.arange(len(names_list))
    ax2.bar(x, accs, color='lightgreen')
    ax2.set_xticks(x)
    ax2.set_xticklabels(names_list, rotation=45, ha='right')
    ax2.set_ylim(0, 1)
    ax2.set_ylabel("Accuracy")
    ax2.set_title("Tuned Model Accuracies")
    fig2.tight_layout()

    # 3) Top-10 channels by average predicted engagement
    #    using the best RandomForest model
    best_model = tuned_models['RandomForest']
    probs = best_model.predict_proba(X_test)[:, 1]
    # Rows are looked up by X_test's index, so df_feat must still carry the
    # index labels that X_test was split from.
    video_meta = df_feat.loc[X_test.index, ['channel_title']].copy()
    video_meta['pred_high_eng'] = probs
    channel_probs = video_meta.groupby('channel_title')['pred_high_eng'].mean()
    top10_channels = (
        channel_probs
        .sort_values(ascending=False)
        .head(10)
        .reset_index()
        .rename(columns={'pred_high_eng': 'avg_pred_high_eng'})
    )

    # 4) Feature importances of the RandomForest
    # NOTE(review): how tuned_models is populated is not visible in this cell.
    # If the entry is a fitted search object or a Pipeline, the forest sits
    # inside it, so unwrap before reading feature_importances_. A bare
    # estimator passes through unchanged.
    forest = getattr(best_model, 'best_estimator_', best_model)
    if hasattr(forest, 'steps'):
        forest = forest.steps[-1][1]
    importances = forest.feature_importances_
    imp_df = pd.Series(importances, index=features).sort_values()
    fig4 = Figure(figsize=(6, 3))
    ax4 = fig4.subplots()
    imp_df.plot.barh(ax=ax4)
    ax4.set_title("RandomForest Feature Importances")
    fig4.tight_layout()

    # Return: title-length hist, accuracies bar, top10 DataFrame, importances
    return fig1, fig2, top10_channels, fig4

# Output-only interface: there are no input widgets, and the four values
# returned by show_dashboard map onto the four output components in order.
gr.Interface(
    title="US YouTube Video Engagement Dashboard & Model Insights",
    description="Title length EDA, tuned model performance, top channels, and feature importances.",
    fn=show_dashboard,
    inputs=[],
    outputs=[
        gr.Plot(label="Title Length by Engagement Class"),
        gr.Plot(label="Tuned Model Accuracies"),
        gr.Dataframe(label="Top 10 Channels by Predicted Engagement"),
        gr.Plot(label="Feature Importances"),
    ],
).launch()
