In [None]:
# # SETUP ENVIRONMENT
import os
import sys

# Java installation used by Spark (EDIT THIS PATH to match your machine!)
os.environ['JAVA_HOME'] = 'C:\\Java\\jdk-1.8'

# IMPORTANT: bypass the Hadoop requirement by pointing HADOOP_HOME at the
# same directory as JAVA_HOME.
os.environ['HADOOP_HOME'] = os.environ['JAVA_HOME']

# Put Java's bin directory at the front of PATH.
# os.path.join / os.pathsep replace the hard-coded '\\bin' and ';', and the
# prefix is only added when PATH does not already start with it, so re-running
# this cell does not keep growing PATH.
java_bin_prefix = os.path.join(os.environ['JAVA_HOME'], 'bin') + os.pathsep
current_path = os.environ.get('PATH', '')
if not current_path.startswith(java_bin_prefix):
    os.environ['PATH'] = java_bin_prefix + current_path

print(f"JAVA_HOME: {os.environ['JAVA_HOME']}")

In [None]:
# INSTALL PACKAGES
!pip install pyspark findspark matplotlib seaborn -q

In [None]:
# IMPORT LIBRARIES
# findspark is initialised before any pyspark import below.
import findspark
findspark.init()

from pyspark.sql import SparkSession, Window
# Wildcard import: later cells call col, when, desc, avg, sum, count,
# countDistinct and row_number unqualified. Note that `sum` and `count` are
# used there as Spark column aggregates, i.e. this import replaces the Python
# builtins of the same name for the rest of the notebook.
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Plot styling: default matplotlib style with the seaborn "husl" colour palette
plt.style.use('default')
sns.set_palette("husl")


In [None]:
# INITIALIZE SPARK SESSION
import tempfile

# Configuration applied to the session builder, one .config() call per entry.
# The warehouse directory is placed under the system temporary directory.
spark_settings = {
    "spark.driver.memory": "2g",
    "spark.sql.warehouse.dir": tempfile.gettempdir(),
    "spark.ui.enabled": "false",
}

# Application name and a local[1] master, then the settings above.
session_builder = SparkSession.builder.appName("YouTubeCategoryAnalysis").master("local[1]")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

# Reuse an already running session if there is one, otherwise create it.
spark = session_builder.getOrCreate()

# Only ERROR-level Spark log messages are shown from here on.
spark.sparkContext.setLogLevel("ERROR")
print(f"Spark {spark.version} started")

In [None]:
# LOAD PREPROCESSED DATA
# The CSV has a header row; column types are inferred by Spark while reading.
preprocessed_csv = "./data/preprocessed_data.csv"
df = spark.read.csv(preprocessed_csv, header=True, inferSchema=True)

# Report the size of the loaded frame, then preview the first five rows.
n_rows = df.count()
n_columns = len(df.columns)
print(f"Loaded {n_rows:,} rows with {n_columns} columns")
print("\nSAMPLE DATA")
df.show(5)

In [None]:
# CATEGORY OVERVIEW & MAPPING

# Lookup table: categoryId (compared as a string literal) -> category name
category_labels = {
    "1": "Film & Animation",
    "2": "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "19": "Travel & Events",
    "20": "Gaming",
    "22": "People & Blogs",
    "23": "Comedy",
    "24": "Entertainment",
    "25": "News & Politics",
    "26": "Howto & Style",
    "27": "Education",
    "28": "Science & Technology",
}

# Fold the table into a single when(...).when(...)...otherwise(...) expression;
# any id that is not in the table is labelled "Unknown".
category_expr = None
for category_id, label in category_labels.items():
    id_matches = col("categoryId") == category_id
    if category_expr is None:
        category_expr = when(id_matches, label)
    else:
        category_expr = category_expr.when(id_matches, label)

df = df.withColumn("category_name", category_expr.otherwise("Unknown"))

print("=== CATEGORIES IN DATASET ===")
id_name_pairs = df.select("categoryId", "category_name").distinct()
id_name_pairs.orderBy("categoryId").show()

# Number of distinct category ids present in the data
distinct_ids = df.select("categoryId").distinct()
total_categories = distinct_ids.count()
print(f"\nTotal categories: {total_categories}")

In [None]:
# ANALYSIS 1: MOST POPULAR CATEGORY

# Number of rows per category, largest first
category_counts = (
    df.groupBy("categoryId", "category_name")
    .count()
    .orderBy(desc("count"))
)

print("TOP CATEGORIES BY VIDEO COUNT")
category_counts.show()

# The aggregated result is brought into pandas for plotting
category_counts_pd = category_counts.toPandas()

# Bar chart of the counts, drawn through the Axes object
fig, ax = plt.subplots(figsize=(15, 8))
bars = ax.bar(category_counts_pd['category_name'], category_counts_pd['count'])
ax.set_title('Number of Trending Videos by Category', fontsize=16, fontweight='bold')
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Number of Videos', fontsize=12)
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

# Write each bar's value just above it (offset of 100 in data units)
for rect in bars:
    bar_top = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2.
    ax.text(label_x, bar_top + 100, f'{int(bar_top):,}',
            ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()

# Share of each category in the total, as a percentage rounded to 2 decimals.
# The 'percentage' column is stored on category_counts_pd itself.
total_videos = category_counts_pd['count'].sum()
share = category_counts_pd['count'] / total_videos * 100
category_counts_pd['percentage'] = share.round(2)

print("\nPERCENTAGE BREAKDOWN")
top_five = category_counts_pd.head(5)
for name, n_videos, pct in zip(top_five['category_name'], top_five['count'], top_five['percentage']):
    print(f"{name}: {n_videos:,} videos ({pct}%)")

In [None]:
# ANALYSIS 2: CATEGORY WITH THE HIGHEST TOTAL VIEWS


def _abbreviate_views(value, _tick_position=None):
    """Format a view count as '<x.x>B' from 1e9 upward, otherwise as '<n>M'.

    The second argument is the tick position passed by matplotlib's
    FuncFormatter; it is not used.
    """
    if value >= 1e9:
        return f'{value/1e9:.1f}B'
    return f'{value/1e6:.0f}M'


def _draw_total_views_chart(frame, chart_title, annotate_bars):
    """Draw and show a bar chart of total_views per category_name.

    frame         -- pandas frame with 'category_name' and 'total_views' columns
    chart_title   -- title placed above the chart
    annotate_bars -- when True, write the abbreviated value on top of each bar

    Returns the Axes and the bar container of the chart.
    """
    plt.figure(figsize=(15, 8))
    drawn_bars = plt.bar(frame['category_name'], frame['total_views'])
    plt.title(chart_title, fontsize=16, fontweight='bold')
    plt.xlabel('Category', fontsize=12)
    plt.ylabel('Total Views', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', alpha=0.3)

    # Y-axis ticks use the same B/M abbreviation as the bar labels
    axis = plt.gca()
    axis.yaxis.set_major_formatter(plt.FuncFormatter(_abbreviate_views))

    if annotate_bars:
        for rect in drawn_bars:
            bar_top = rect.get_height()
            plt.text(rect.get_x() + rect.get_width()/2., bar_top,
                     _abbreviate_views(bar_top), ha='center', va='bottom', fontsize=10)

    plt.tight_layout()
    plt.show()
    return axis, drawn_bars


# Numeric copy of view_count
df = df.withColumn("view_count_num", col("view_count").cast("long"))

# Keep one row per video_id - the one with the highest view count - to avoid duplicates
by_views_desc = Window.partitionBy("video_id").orderBy(col("view_count_num").desc())
ranked_rows = df.withColumn("rank", row_number().over(by_views_desc))
df_unique = ranked_rows.where(col("rank") == 1).drop("rank")

# Total views per category, largest first
summed_views = sum("view_count_num").alias("total_views")
total_views_by_category = (
    df_unique.groupBy("categoryId", "category_name")
    .agg(summed_views)
    .orderBy(desc("total_views"))
)

print("TOTAL VIEWS BY CATEGORY")
total_views_by_category.show()

# The aggregated result is brought into pandas for plotting
views_pd = total_views_by_category.toPandas()

# Plot 1: all categories, with a value label on every bar
ax, bars = _draw_total_views_chart(views_pd, 'Total Views by Category', True)

# Plot 2: same chart without Music, so the other categories are easier to compare
views_no_music = views_pd[views_pd['category_name'] != 'Music']
ax, bars = _draw_total_views_chart(views_no_music, 'Total Views by Category (Excluding Music)', False)

In [None]:
# ANALYSIS 3: CATEGORY WITH THE MOST CHANNELS

# Number of distinct channel titles per category, largest first
distinct_channels = countDistinct("channelTitle").alias("unique_channels")
channels_by_category = (
    df.groupBy("categoryId", "category_name")
    .agg(distinct_channels)
    .orderBy(desc("unique_channels"))
)

print("UNIQUE CHANNELS BY CATEGORY")
channels_by_category.show()

# The aggregated result is brought into pandas for plotting
channels_pd = channels_by_category.toPandas()

# Bar chart of the channel counts, drawn through the Axes object
fig, ax = plt.subplots(figsize=(15, 8))
bars = ax.bar(channels_pd['category_name'], channels_pd['unique_channels'])
ax.set_title('Number of Unique Channels by Category', fontsize=16, fontweight='bold')
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Number of Unique Channels', fontsize=12)
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

# Write each bar's value just above it (offset of 20 in data units)
for rect in bars:
    bar_top = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2.
    ax.text(label_x, bar_top + 20, f'{int(bar_top):,}',
            ha='center', va='bottom', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# ANALYSIS 4: ENGAGEMENT BY CATEGORY

# Numeric copies of the engagement metrics.
# view_count_num is (re)computed here as well, so this cell no longer depends on
# the total-views analysis cell having been run first; the cast is the same one
# used there, so the column values are unchanged when it already exists.
df_metrics = df.withColumn("likes_num", col("likes").cast("long")) \
    .withColumn("dislikes_num", col("dislikes").cast("long")) \
    .withColumn("comment_count_num", col("comment_count").cast("long")) \
    .withColumn("view_count_num", col("view_count").cast("long"))

# Per-category averages, ordered by average likes (largest first)
engagement_by_category = df_metrics.groupBy("categoryId", "category_name") \
    .agg(
        avg("likes_num").alias("avg_likes"),
        avg("dislikes_num").alias("avg_dislikes"),
        avg("comment_count_num").alias("avg_comments"),
        avg("view_count_num").alias("avg_views")
    ) \
    .orderBy(desc("avg_likes"))

print("AVERAGE ENGAGEMENT BY CATEGORY")
engagement_by_category.show()

# The aggregated result is brought into pandas for plotting
engagement_pd = engagement_by_category.toPandas()

# 2x2 grid of bar charts, one panel per metric
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
fig.suptitle('Average Engagement Metrics by Category', fontsize=16, fontweight='bold')

# (column, title) per panel, in row-major order: top-left, top-right,
# bottom-left, bottom-right - the same layout as before.
engagement_panels = [
    ('avg_likes', 'Average Likes'),
    ('avg_comments', 'Average Comments'),
    ('avg_views', 'Average Views'),
    ('avg_dislikes', 'Average Dislikes'),
]

for panel_ax, (metric_column, metric_title) in zip(axes.flat, engagement_panels):
    panel_ax.bar(engagement_pd['category_name'], engagement_pd[metric_column])
    panel_ax.set_title(metric_title)
    panel_ax.set_ylabel(metric_title)
    panel_ax.tick_params(axis='x', rotation=45)
    # Right-align the rotated labels so each label ends under its own bar,
    # consistent with the other charts in this notebook (ha='right').
    plt.setp(panel_ax.get_xticklabels(), ha='right')

plt.tight_layout()
plt.show()

In [None]:
# ADDITIONAL ANALYSIS 1: CHANNEL CONCENTRATION
# Purpose: see which categories are dominated by a few channels (concentrated)
# and which are spread evenly over many channels (dispersed).

print("PHÂN TÍCH BỔ SUNG: MỨC ĐỘ CÔ ĐẶC KÊNH (VIDEOS / KÊNH)")

# Per category: number of video_id values and number of distinct channel titles.
# NOTE(review): count("video_id") counts rows, so a video_id that appears on
# several rows is counted several times - confirm that is the intended metric.
per_category_totals = df.groupBy("category_name").agg(
    count("video_id").alias("total_videos"),
    countDistinct("channelTitle").alias("unique_channels"),
)

# Ratio of the two totals, highest ratio first
videos_per_channel = col("total_videos") / col("unique_channels")
channel_concentration = (
    per_category_totals
    .withColumn("videos_per_channel", videos_per_channel)
    .orderBy(desc("videos_per_channel"))
)

print("Mức độ tập trung của kênh (Videos / Kênh)")
channel_concentration.show()

# Plotting
concentration_pd = channel_concentration.toPandas()
fig, ax = plt.subplots(figsize=(15, 7))
# category_name goes on the x-axis so the bars are easy to read
bars = ax.bar(concentration_pd['category_name'], concentration_pd['videos_per_channel'])
ax.set_title('Mức độ "cô đặc" kênh (Trung bình Videos / Kênh)', fontsize=16)
ax.set_xlabel('Thể loại')
ax.set_ylabel('Số video trung bình / Kênh')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

print("   Insight: Thể loại có chỉ số cao (như Music) có nghĩa là một kênh")
print("   trung bình sản xuất RẤT NHIỀU video trending, cho thấy sự thống trị của các kênh lớn.")

In [None]:
# ADDITIONAL ANALYSIS 2: LIKE/DISLIKE RATIO (SENTIMENT)
# Purpose: see which categories are "well liked" (few dislikes)
# and which are "controversial" (many dislikes).

print("PHÂN TÍCH BỔ SUNG: TỶ LỆ LIKE/DISLIKE (SENTIMENT RATIO)")

# df_metrics was created in Analysis 4
like_dislike_totals = df_metrics.groupBy("category_name").agg(
    sum("likes_num").alias("total_likes"),
    sum("dislikes_num").alias("total_dislikes"),
)

# +1 in the denominator avoids a division by zero
ratio_expr = col("total_likes") / (col("total_dislikes") + 1)
sentiment_ratio = (
    like_dislike_totals
    .withColumn("like_dislike_ratio", ratio_expr)
    .orderBy(desc("like_dislike_ratio"))
)

print("Tỷ lệ Like/Dislike (Thể loại được 'yêu thích' nhất)")
sentiment_ratio.show()

# Plotting
sentiment_pd = sentiment_ratio.toPandas()
fig, ax = plt.subplots(figsize=(15, 7))
bars = ax.bar(sentiment_pd['category_name'], sentiment_pd['like_dislike_ratio'])
ax.set_title('Tỷ lệ Like/Dislike theo Thể loại (Càng cao càng tốt)', fontsize=16)
ax.set_xlabel('Thể loại')
ax.set_ylabel('Tỷ lệ (Likes / Dislikes)')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

print("Insight: Thể loại 'News & Politics' thường có tỷ lệ này thấp")
print("   do tính chất gây tranh cãi, trong khi 'Music' hoặc 'Pets' thường cao hơn.")

In [None]:
# ANALYSIS 5: TOP VIDEOS BY CATEGORY

# Rebuild df_unique with numeric (long) copies of the engagement columns
numeric_copies = {
    "likes_num": "likes",
    "dislikes_num": "dislikes",
    "comment_count_num": "comment_count",
}
df_unique_with_metrics = df_unique
for numeric_name, source_name in numeric_copies.items():
    df_unique_with_metrics = df_unique_with_metrics.withColumn(
        numeric_name, col(source_name).cast("long")
    )

# Rank videos inside each category by view count and keep the first one
window_cat = Window.partitionBy("category_name").orderBy(col("view_count_num").desc())
ranked_in_category = df_unique_with_metrics.withColumn(
    "rank_in_category", row_number().over(window_cat)
)
top_videos_by_category = (
    ranked_in_category
    .where(col("rank_in_category") == 1)
    .select("category_name", "title", "channelTitle", "view_count_num", "likes_num")
    .orderBy(desc("view_count_num"))
)

print("TOP VIDEO IN EACH CATEGORY")
top_videos_by_category.show(truncate=False)

# The ten most viewed videos regardless of category
print("\nOVERALL TOP 10 VIDEOS")
overall_top_ten = (
    df_unique_with_metrics
    .select("title", "category_name", "channelTitle", "view_count_num")
    .orderBy(desc("view_count_num"))
    .limit(10)
)
overall_top_ten.show(truncate=False)

In [None]:
# SUMMARY DASHBOARD
# Prints headline figures taken from the pandas frames built by the earlier
# analysis cells (category_counts_pd, views_pd, channels_pd, engagement_pd).
# Each of those frames is sorted in descending order, so row 0 is the leader.

print("YOUTUBE CATEGORY ANALYSIS SUMMARY")

# Leading row of each analysis
top_category = category_counts_pd.iloc[0]
top_views_category = views_pd.iloc[0]
top_channels_category = channels_pd.iloc[0]
top_engagement = engagement_pd.iloc[0]

print("\n MOST POPULAR CATEGORY (by video count):")
print(f"   {top_category['category_name']}: {top_category['count']:,} videos ({top_category['percentage']}%)")

print("\n HIGHEST TOTAL VIEWS:")
print(f"   {top_views_category['category_name']}: {top_views_category['total_views']:,} views")

print("\n MOST CHANNELS:")
print(f"   {top_channels_category['category_name']}: {top_channels_category['unique_channels']:,} unique channels")

print("\n HIGHEST ENGAGEMENT (Average Likes):")
print(f"   {top_engagement['category_name']}: {top_engagement['avg_likes']:,.0f} average likes")

print("\n DATASET OVERVIEW:")
print(f"   Total videos analyzed: {df.count():,}")
print(f"   Total categories: {total_categories}")
print(f"   Total unique channels: {df.select('channelTitle').distinct().count():,}")

# The first two insights used to be hard-coded ("Entertainment ... 20.07%",
# "Music ... 36.1B"); they are now derived from the computed frames so they
# cannot go stale when the dataset changes. " and engagement" is appended only
# when the category with the highest average likes is also the one with the
# highest total views.
views_in_billions = top_views_category['total_views'] / 1e9
if top_engagement['category_name'] == top_views_category['category_name']:
    engagement_suffix = " and engagement"
else:
    engagement_suffix = ""

print("\n KEY INSIGHTS:")
print(f"   • {top_category['category_name']} dominates with {top_category['percentage']}% of trending videos")
print(f"   • {top_views_category['category_name']} has highest total views ({views_in_billions:.1f}B){engagement_suffix}")
# NOTE(review): the three lines below are hand-written observations, not values
# computed in this notebook - re-verify them whenever the input data changes.
print("   • Gaming shows strong performance across all metrics")
print("   • BLACKPINK's 'Pink Venom' is top music video (277M+ views)")
print("   • Discord's loot box video leads overall (1.4B+ views)")




In [None]:
# Stop the Spark session (the `spark` object created in the initialisation cell).
# Run this last: cells that use `spark` or Spark DataFrames need the session.
spark.stop()