
### Phase 1: Data Loading and Initial Exploration



**Step 1.1: Import Libraries**


In [None]:

import pandas as pd   # for data manipulation
import json
import matplotlib.pyplot as plt  # for plotting graphs
import seaborn as sns
import re


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")


**Step 1.2: Load Datasets**


In [None]:

# Load the main dataset containing trending video information
df = pd.read_csv("../data/youtube_dataset/USvideos.csv")

# Load the category ID to name mapping from the JSON file.
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open("../data/youtube_dataset/US_category_id.json", "r", encoding="utf-8") as f:
    categories_data = json.load(f)

# Build a lookup from integer category ID to the category's display title
category_id_to_name = {
    int(item["id"]): item["snippet"]["title"]
    for item in categories_data["items"]
}

# Translate every row's category_id into its name; IDs that are absent from
# the JSON mapping end up as NaN in category_name.
df["category_name"] = df["category_id"].map(category_id_to_name)

print("Datasets loaded and category names applied successfully.")
print("\nSample of mapped category names:")
print(df[["category_id", "category_name"]].drop_duplicates().head())



**Step 1.3: Initial Data Inspection**


In [None]:
# Preview the top five rows to get a feel for the columns and their values
print("\n--- DataFrame Head ---")
print(df.head(5))


In [None]:
# Summarise the DataFrame: column names, dtypes and non-null counts.
# df.info() prints its report directly, so it is not wrapped in print().
print("\n--- DataFrame Info ---")
df.info()


In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns
print("\n--- Descriptive Statistics ---")
summary_stats = df.describe()
print(summary_stats)


In [None]:
# Print the category ID -> category title dictionary built while loading the
# data, to confirm which IDs the JSON file actually covers
print("\n--- Category ID to Name Mapping ---")
print(category_id_to_name)


### Phase 2: Data Cleaning and Preprocessing 


In [None]:
# Report the number of missing values per column before any cleaning
print("\n--- Missing Values Before Handling ---")
missing_before = df.isnull().sum()
print(missing_before)


# Discard every row that contains at least one missing value
df.dropna(inplace=True)

# Parse the date/time columns (trending_date is stored as YY.DD.MM)
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
df['publish_time'] = pd.to_datetime(df['publish_time'])

# A video should appear at most once per trending date
dedup_keys = ["video_id", "trending_date"]
print("\n")
print("Duplicates Before Handling: ", df.duplicated(subset=dedup_keys).sum())
print("\n")
df.drop_duplicates(subset=dedup_keys, inplace=True)
print("Duplicates After Handling: ", df.duplicated(subset=dedup_keys).sum())
print("\n")

# Confirm that no missing values are left
print("\n--- Missing Values After Handling ---")
print(df.isnull().sum())
print("\n")

# NOTE(review): rows with a missing description were already removed by the
# dropna() above, so this fill has nothing left to replace. Decide whether
# those rows should be dropped (current behaviour) or kept with an empty
# description, and remove whichever step is not wanted.
df["description"] = df["description"].fillna("")

# Tidy the tags text: drop the '[none]' placeholder and show the
# pipe-separated tags as a comma-separated list
df['tags'] = [
    raw_tags.replace('[none]', '').replace('|', ', ')
    for raw_tags in df['tags']
]


# Confirm cleaning
df.info()



#### Output:
- **Missing Values:** Only description had 570 missing values; those rows are removed by `dropna()`, so the later fill with empty strings has nothing left to replace.
- **Duplicates:** 49 found and removed.
- **Dates:** trending_date and publish_time converted to datetime.
- **Text:** Cleaned tags and filled description with empty strings.
- **Final Check:** No missing values, no duplicates, 40330 clean rows.



### Phase 3: Feature Engineering 



**Step 3.1: Time-Based Features**


In [None]:
# Make both timestamp columns timezone-naive so they can be subtracted from
# each other without a tz-aware / tz-naive mismatch
for ts_col in ("publish_time", "trending_date"):
    df[ts_col] = pd.to_datetime(df[ts_col]).dt.tz_localize(None)

publish_ts = df["publish_time"]

# Whole days elapsed between publishing and the trending date
df["days_to_trend"] = (df["trending_date"] - publish_ts).dt.days

# Calendar components of the publish timestamp
df["publish_hour"] = publish_ts.dt.hour
df["publish_day_of_week"] = publish_ts.dt.dayofweek  # 0 = Monday ... 6 = Sunday
df["publish_month"] = publish_ts.dt.month
df["publish_year"] = publish_ts.dt.year

time_feature_columns = [
    "trending_date", "publish_time", "days_to_trend", "publish_hour",
    "publish_day_of_week", "publish_month", "publish_year",
]
print("Time-based features created successfully.")
print("\n--- Sample of Time-Based Features ---")
print(df[time_feature_columns].head())

#### Output:
- Timezones removed for accuracy
- days_to_trend: Days from publish to trending
- Extracted: hour, weekday, month, year of publish
- Helps analyze when videos tend to trend


**Step 3.2: Engagement Ratios**


In [None]:
# Per-view engagement ratios. The denominator is views + 1 so that a video
# with zero views does not cause a division by zero.
views_plus_one = df["views"] + 1
df["likes_per_view"] = df["likes"] / views_plus_one
df["dislikes_per_view"] = df["dislikes"] / views_plus_one
df["comments_per_view"] = df["comment_count"] / views_plus_one

# Overall engagement: all interactions (likes + dislikes + comments) per view
total_interactions = df["likes"] + df["dislikes"] + df["comment_count"]
df["engagement_score"] = total_interactions / views_plus_one

# Likes relative to dislikes; the +1 guards against videos with zero dislikes
df["likes_dislikes_ratio"] = df["likes"] / (df["dislikes"] + 1)

engagement_columns = [
    "views", "likes", "dislikes", "comment_count",
    "likes_per_view", "dislikes_per_view", "comments_per_view",
    "engagement_score", "likes_dislikes_ratio",
]
print("Engagement ratios created successfully.")
print("\n--- Sample of Engagement Ratios ---")
print(df[engagement_columns].head())

#### Engagement Metrics
- Normalized by views to compare fairly across videos.

Key ratios:
- likes_per_view, dislikes_per_view, comments_per_view
- engagement_score: Total interaction per view
- likes_dislikes_ratio: Popularity vs. criticism
- Helps identify highly engaging or controversial videos regardless of view count.


**Step 3.3: Text-Based Features (Tags and Title/Description Length)**


In [None]:
import re

# Function to clean tags
def clean_tags(tags):
    """Split a tag string into a list of normalised tag names.

    The tags column reaches this function after the Phase 2 cleaning step has
    replaced the raw '|' separator with ', ' and removed the '[none]'
    placeholder, so both separators are accepted here. Splitting only on '|'
    would return the whole string as a single tag.

    Args:
        tags: Tag string with tags separated by '|' or ','.

    Returns:
        A list of lowercased, whitespace-stripped tags with double quotes
        removed. Empty entries are skipped, so a string without any tags
        yields an empty list.

    Note:
        A tag that itself contains a comma is split into several tags.
    """
    tags = tags.replace('"', '').replace('[none]', '')  # drop quotes and the no-tags placeholder
    pieces = re.split(r'[|,]', tags)
    return [piece.strip().lower() for piece in pieces if piece.strip()]

# Function to clean descriptions
def clean_description(text):
    """Return a normalised, letters-only version of a description.

    URLs (anything starting with 'http' up to the next whitespace) are removed
    first, then every character that is not an ASCII letter or whitespace is
    dropped. The result is lowercased and runs of whitespace are collapsed to
    single spaces.
    """
    without_urls = re.sub(r'http\S+', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)
    words = letters_only.lower().split()
    return ' '.join(words)

# Build the cleaned text columns first; the length features depend on them
df['tags_cleaned'] = df['tags'].map(clean_tags)
df['description_cleaned'] = df['description'].map(clean_description)

# Size features: number of tags, characters in the title, and characters in
# the cleaned description
df['tag_count'] = df['tags_cleaned'].map(len)
df['title_length'] = df['title'].map(len)
df['description_length'] = df['description_cleaned'].map(len)

# Display results
text_feature_columns = [
    'tags_cleaned', 'tag_count', 'title', 'title_length',
    'description_cleaned', 'description_length',
]
print("✅ Text-based features created successfully.\n")
print("--- Sample of Text-Based Features ---")
print(df[text_feature_columns].head())


#### Text Features
- Tags cleaned: Lowercased, split, quotes removed
- Descriptions cleaned: URLs & symbols removed, lowercased

Created:

- tag_count: Number of tags
- title_length: Characters in title
- description_length: Cleaned description size
- Useful for content and SEO analysis


### Phase 4: Exploratory Data Analysis (EDA) - Univariate and Bivariate



**Step 4.1: Distribution of Numerical Features**


In [None]:

# Set the aesthetic style of the plots for better readability
sns.set_style("whitegrid")

# One figure holding a single histogram, so the chart fills the whole canvas
plt.figure(figsize=(15, 10))

# Views are heavily skewed, so the histogram is binned in log space
# (log_scale=True); this keeps the bars and the KDE consistent with the
# logarithmic x-axis. Rows with zero views are left out because log(0) is
# undefined.
positive_views = df.loc[df["views"] > 0, "views"]
sns.histplot(positive_views, bins=50, kde=True, log_scale=True)
plt.title("Distribution of Views")
plt.xlabel("Views (Log Scale)")


plt.tight_layout()
plt.show()

**Insights:**
- This graph shows that most YouTube videos have a relatively low number of views, while a small number of videos gain very high view counts.
- The distribution is heavily right-skewed.
- Most videos struggle to get views, so creators must focus on optimizing content for visibility (thumbnails, titles, SEO) to stand out in the crowded space.

In [None]:
sns.set_style("whitegrid")

# One figure holding a single histogram, so the chart fills the whole canvas
plt.figure(figsize=(15, 10))

# Likes are heavily skewed, so the histogram is binned in log space
# (log_scale=True); this keeps the bars and the KDE consistent with the
# logarithmic x-axis. Videos with zero likes are left out because log(0) is
# undefined.
positive_likes = df.loc[df["likes"] > 0, "likes"]
sns.histplot(positive_likes, bins=50, kde=True, log_scale=True)
plt.title("Distribution of Likes")
plt.xlabel("Likes (Log Scale)")

plt.tight_layout()
plt.show()

**Insight:**
- The graph shows that most videos receive a modest number of likes, with only a few getting large like counts.
- This follows a skewed distribution.
- Engagement through likes is not evenly spread. Encouraging viewers to like videos can improve visibility and algorithm ranking.


In [None]:
sns.set_style("whitegrid")

# One figure holding a single histogram, so the chart fills the whole canvas
plt.figure(figsize=(15, 10))

# Dislikes are heavily skewed, so the histogram is binned in log space
# (log_scale=True); this keeps the bars and the KDE consistent with the
# logarithmic x-axis. Videos with zero dislikes are left out because log(0)
# is undefined.
positive_dislikes = df.loc[df["dislikes"] > 0, "dislikes"]
sns.histplot(positive_dislikes, bins=50, kde=True, log_scale=True)
plt.title("Distribution of Dislikes")
plt.xlabel("Dislikes (Log Scale)")


plt.tight_layout()
plt.show()

**Insights:**
- Dislikes are much fewer overall compared to likes and views. Most videos receive very few dislikes.
- While dislikes are relatively rare, they still indicate audience dissatisfaction.
- Keeping audience satisfaction high reduces the chances of negative feedback.

In [None]:
sns.set_style("whitegrid")

# One figure holding a single histogram, so the chart fills the whole canvas
plt.figure(figsize=(15, 10))

# Comment counts are heavily skewed, so the histogram is binned in log space
# (log_scale=True); this keeps the bars and the KDE consistent with the
# logarithmic x-axis. Videos with zero comments are left out because log(0)
# is undefined.
positive_comments = df.loc[df["comment_count"] > 0, "comment_count"]
sns.histplot(positive_comments, bins=50, kde=True, log_scale=True)
plt.title("Distribution of Comment Count")
plt.xlabel("Comment Count (Log Scale)")


plt.tight_layout()
plt.show()

**Insights:**
- Most videos receive very few comments, while a few get a lot. 
- Comment activity is highly skewed like other engagement metrics.
- Encouraging viewers to comment can boost engagement signals. Interactive videos or questions in content help increase comment counts.

In [None]:
sns.set_style("whitegrid")

# One figure holding a single histogram, so the chart fills the whole canvas
# (no subplot grid is needed for a lone plot)
plt.figure(figsize=(15, 10))

# Distribution of the number of days between publishing and trending
sns.histplot(df["days_to_trend"], bins=30, kde=True)
plt.title("Distribution of Days to Trend")
plt.xlabel("Days to Trend")


plt.tight_layout()
plt.show()

**Insights:**
- Most videos that end up trending do so within just a few days of being uploaded, while very few trend later.
- The first few days after upload are critical for gaining traction.
- Creators should focus their promotion and viewer engagement efforts right after publishing a video.


**Step 4.2: Most Common Video Categories**


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# The 15 most frequent categories, already ordered from most to least common
top_categories = df["category_name"].value_counts().index[:15]

# Horizontal bar chart of how many trending rows fall into each category
plt.figure(figsize=(12, 7))
sns.countplot(data=df, y="category_name", order=top_categories, palette="viridis")
plt.title("Top 15 Most Common Video Categories in Trending Videos")
plt.xlabel("Number of Trending Videos")
plt.ylabel("Category Name")
plt.tight_layout()
plt.show()


**Insights:**
The graph above shows that Entertainment dominates with the highest number of trending videos, followed by Music and Howto & Style. Other popular categories include Comedy, People & Blogs, and News & Politics, while categories like Autos & Vehicles and Shows appear much less frequently in the trending list.

This indicates that entertainment-based and music content are more likely to go viral or trend on YouTube. For creators aiming to reach trending status, producing content in these popular categories might increase their chances, especially if combined with quality, engagement, and timing.



**Step 4.3: Likes vs. Views Scatter Plot**


In [None]:

# Bubble chart of likes against views: colour encodes the category and the
# bubble size encodes the comment count
plt.figure(figsize=(10, 8))
scatter_ax = sns.scatterplot(
    data=df,
    x="views",
    y="likes",
    hue="category_name",
    size="comment_count",
    sizes=(20, 400),
    alpha=0.6,
    legend="brief",
)
scatter_ax.set_title("Likes vs. Views Scatter Plot (with Category and Comment Count)")
scatter_ax.set_xlabel("Views (Log Scale)")
scatter_ax.set_ylabel("Likes (Log Scale)")
# Both axes span several orders of magnitude, hence the logarithmic scales
scatter_ax.set_xscale("log")
scatter_ax.set_yscale("log")
scatter_ax.grid(True, which="both", ls="--", c=".7")
plt.tight_layout()
plt.show()

**Insights:**
- Strong positive correlation between views and likes.
- Larger bubbles (more comments) appear in high-view/like areas.
- Categories like Entertainment & Music lead in engagement.



**Step 4.4: Correlation Heatmap**


In [None]:

# Numeric columns to correlate: raw counts, time features, engagement ratios
# and the text-size features engineered earlier
correlation_features = [
    "views", "likes", "dislikes", "comment_count",
    "days_to_trend", "publish_hour", "publish_day_of_week",
    "likes_per_view", "dislikes_per_view", "comments_per_view",
    "engagement_score", "likes_dislikes_ratio", "tag_count", "title_length", "description_length"
]

# Pairwise correlation matrix of the selected columns
correlation_matrix = df[correlation_features].corr()

plt.figure(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,        # print the coefficient inside every cell
    fmt=".2f",
    cmap="coolwarm",
    linewidths=.5,
)
plt.title("Correlation Heatmap of Numerical Features")
plt.tight_layout()
plt.show()

**Insights:**
- views, likes, dislikes, comment_count are highly correlated with each other (e.g., likes vs. views = 0.85).
- engagement_score strongly correlates with likes_per_view (0.97), indicating a solid metric for evaluating video performance.
- days_to_trend has weak or negative correlation with most metrics, suggesting that virality is unpredictable time-wise.


**Step 4.5: Trends by Publish Time (Hour and Day of Week)**


In [None]:
# One figure holding a single bar chart, so the chart fills the whole canvas
# (no subplot grid is needed for a lone plot)
plt.figure(figsize=(15, 6))

# Number of trending rows for each hour of the day the video was published
sns.countplot(x="publish_hour", data=df, palette="rocket")
plt.title("Number of Trending Videos by Publish Hour")
plt.xlabel("Publish Hour")
plt.ylabel("Number of Videos")

plt.tight_layout()
plt.show()

This graph shows that:
- Most trending videos are published between 2 PM and 5 PM (14:00 to 17:00), with a clear peak at 4 PM.
- Posting in the afternoon greatly boosts the chance of trending, while early morning hours see the fewest trends.
- Timing your upload is a key factor for visibility.
- Posting your videos in the afternoon peak window (especially around 2–5 PM) can significantly increase their chances of trending and reaching a wider audience.


In [None]:

# One figure holding a single bar chart, so the chart fills the whole canvas
# (no subplot grid is needed for a lone plot)
plt.figure(figsize=(15, 6))

# Number of trending rows for each weekday the video was published
sns.countplot(x="publish_day_of_week", data=df, palette="mako")
plt.title("Number of Trending Videos by Publish Day of Week")
plt.xlabel("Publish Day of Week (0=Monday, 6=Sunday)")
plt.ylabel("Number of Videos")

plt.tight_layout()
plt.show()

**Insights:**
This graph shows that
- Most trending videos are published between Monday and Friday, with a peak on Wednesday and Thursday.
- Fewer trending videos are posted on weekends, especially Saturday and Sunday.
- To increase chances of trending, upload during weekdays.


### Phase 5: Predictive Modeling (Optional)



**Step 5.1: Define Target Variable and Select Features**


In [None]:

# Binary target: 1 when a video's view count is at or above the 75th
# percentile of views in this dataset, 0 otherwise.
# NOTE(review): the dataset is described as containing trending videos only,
# so this label separates high-view rows from the rest rather than trending
# from non-trending videos - confirm the name "is_trending" is intended.
view_threshold = df["views"].quantile(0.75)
df["is_trending"] = df["views"].ge(view_threshold).astype(int)


# Model inputs. "views" and the per-view ratios are left out because the
# target is derived from views.
# NOTE(review): likes, dislikes and comment_count are raw engagement counts;
# check whether they should count as leak-free for a label built from views.
safe_features = [
    "likes", "dislikes", "comment_count",
    "publish_hour", "publish_day_of_week",
    "tag_count", "title_length", "description_length",
    "category_name", "comments_disabled", "ratings_disabled",
    "likes_dislikes_ratio"
]

y = df["is_trending"]
X = df[safe_features]





**Step 5.2: Data Splitting and Preprocessing Pipeline**


In [None]:
# ✅ Identify feature types
# Numeric columns are selected by kind ("number") rather than by exact width.
# On newer pandas versions the .dt.hour / .dt.dayofweek accessors return int32
# columns, which an ["int64", "float64"] filter would skip; the
# ColumnTransformer below would then drop those features without any warning.
num_features = X.select_dtypes(include="number").columns.tolist()
# Every remaining column (text and boolean flags) is treated as categorical,
# so each selected feature is routed to exactly one transformer.
cat_features = [col for col in X.columns if col not in num_features]

# ✅ Preprocessing for numeric and categorical features
# Numeric: fill gaps with the column median, then standardise
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fitting are encoded as all zeros instead of raising
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Apply each transformer to its own group of columns
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features)
])



**Step 5.3: Model Training and Evaluation**


In [None]:
# cross_val_score is used at the end of this cell but is not among the
# notebook's top-level imports, so it is imported here to avoid a NameError
from sklearn.model_selection import cross_val_score

# ✅ Step 5: Full pipeline with Logistic Regression
# Preprocessing lives inside the pipeline, so it is fitted only on the data
# passed to fit() (the training split, or each training fold during CV)
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

# ✅ Step 6: Train/test split (stratified to keep the class ratio in both parts)
# NOTE(review): rows are unique per (video_id, trending_date), so one video can
# appear on several dates and land in both train and test. Consider a
# group-aware split on video_id if that matters for the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# ✅ Step 7: Train and evaluate
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

print("\n--- Logistic Regression Evaluation (Leak-Free) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ✅ Step 8: Cross-validation ROC AUC
# cross_val_score clones and refits the pipeline on each of the 5 folds; the
# model fitted above is left untouched
cv_score = cross_val_score(model_pipeline, X, y, cv=5, scoring="roc_auc")
print(f"✅ Average ROC AUC score (cv=5): {cv_score.mean():.4f}")


**Insights:**
The Model performs well:
- Accuracy: 91% — predicts correctly most of the time.
- ROC AUC: 0.95 — excellent at separating trending vs. not trending.
- Recall for trending: 73% — it misses some trending videos.
- Overall: Reliable, realistic, and leakage-free performance.

### Conclusion:
The model achieves 91% accuracy and a ROC AUC of 0.95 when predicting the `is_trending` label, i.e. whether a video's view count falls in the top 25% of this dataset of trending videos. It uses the selected engagement and metadata features, which the notebook treats as leak-free. It slightly under-predicts the positive class (73% recall), but overall performs reliably and realistically for forecasting YouTube popularity.

In [None]:
import pandas as pd

# ✅ Step 1: Prepare example input (values can be modified)
# The keys must match the feature columns the pipeline was trained on
example_features = {
    "likes": 25000,
    "dislikes": 500,
    "comment_count": 3000,
    "publish_hour": 15,
    "publish_day_of_week": 2,  # Wednesday (0 = Monday)
    "tag_count": 20,
    "title_length": 50,
    "description_length": 300,
    "category_name": "Entertainment",
    "comments_disabled": False,
    "ratings_disabled": False,
    "likes_dislikes_ratio": 25000 / (500 + 1),  # ≈ 49.90, same formula as the training feature
}
example_video = pd.DataFrame([example_features])

# ✅ Step 2: Predict using the trained model pipeline
prediction = model_pipeline.predict(example_video)
# Probability of the positive class (column 1) for the single example row
probability = model_pipeline.predict_proba(example_video)[0][1]

# ✅ Step 3: Display the result
if prediction[0] != 1:
    print(f"❌ This video is unlikely to trend (probability: {probability:.2%})")
else:
    print(f"✅ This video is likely to trend (probability: {probability:.2%})")


In [None]:
import pandas as pd

# Example with strong engagement metrics
# The keys must match the feature columns the pipeline was trained on
example_features = {
    "likes": 120000,
    "dislikes": 1500,
    "comment_count": 10000,
    "publish_hour": 18,               # 6 PM
    "publish_day_of_week": 4,         # Friday (0 = Monday)
    "tag_count": 25,
    "title_length": 65,
    "description_length": 900,
    "category_name": "Music",         # One of the most common trending categories
    "comments_disabled": False,
    "ratings_disabled": False,
    "likes_dislikes_ratio": 120000 / (1500 + 1),  # ≈ 79.95, same formula as the training feature
}
example_video = pd.DataFrame([example_features])

# Predict the class and the probability of the positive class (column 1)
prediction = model_pipeline.predict(example_video)
probability = model_pipeline.predict_proba(example_video)[0][1]

# Show result
if prediction[0] != 1:
    print(f"❌ This video is unlikely to trend (probability: {probability:.2%})")
else:
    print(f"✅ This video is likely to trend (probability: {probability:.2%})")
