## Set up

In [1]:
# # SETUP 
import os
import sys

# # Set Java (SỬA PATH NÀY!)
os.environ['JAVA_HOME'] = 'C:\\Java\\jdk-1.8'

# # QUAN TRỌNG: Bypass Hadoop requirement
os.environ['HADOOP_HOME'] = os.environ.get('JAVA_HOME')
os.environ['PATH'] = f"{os.environ['JAVA_HOME']}\\bin;{os.environ.get('PATH', '')}"

print(f"JAVA_HOME: {os.environ['JAVA_HOME']}")

JAVA_HOME: C:\Java\jdk-1.8


In [2]:
# INSTALL FINDSPARK
!pip install pyspark findspark -q


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# IMPORT LIBRARIES
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from functools import reduce
import warnings
warnings.filterwarnings('ignore')

print("Imports successful")

Imports successful


## Khởi tạo Spark Session

In [4]:
# INITIALIZE SPARK SESSION
import tempfile

spark = SparkSession.builder \
    .appName("YouTubePreprocessing") \
    .master("local[*]") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.warehouse.dir", tempfile.gettempdir()) \
    .config("spark.ui.enabled", "false") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")
print(f"Spark {spark.version} started")

Spark 3.2.1 started


## Đọc dữ liệu

In [5]:
raw_df = spark.read.csv("./data/raw_data.csv", header=True, inferSchema=True)

# Kiểm tra format trending_date để hiểu dữ liệu
print("=== SAMPLE TRENDING_DATE VALUES ===")
raw_df.select("trending_date").filter(col("trending_date").isNotNull()).distinct().show(10, False)

print("RAW DATA OVERVIEW")
raw_df.show(5)

print("VALID VIDEO ROWS (có video_id)")
valid_videos = raw_df.filter(col("video_id").isNotNull() & (col("video_id") != ""))
print(f"Valid videos: {valid_videos.count()} / {raw_df.count()}")
valid_videos.show(5)

=== SAMPLE TRENDING_DATE VALUES ===
+--------------------+
|trending_date       |
+--------------------+
|2020-08-23T00:00:00Z|
|2020-10-25T00:00:00Z|
|2020-11-07T00:00:00Z|
|2020-10-22T00:00:00Z|
|2020-10-27T00:00:00Z|
|2020-11-19T00:00:00Z|
|2020-11-27T00:00:00Z|
|2020-08-22T00:00:00Z|
|2020-09-26T00:00:00Z|
|2020-12-26T00:00:00Z|
+--------------------+
only showing top 10 rows

RAW DATA OVERVIEW
+--------------------+--------------------+--------------------+--------------------+------------+----------+--------------------+--------------------+----------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+
|            video_id|               title|         publishedAt|           channelId|channelTitle|categoryId|       trending_date|                tags|view_count| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|         description|
+--------------------+--------------------+---------------

## Kiểm tra các giá trị trending_date không hợp lệ

In [6]:
print("INVALID TRENDING_DATE VALUES")
raw_df.select("trending_date").filter(
    col("trending_date").isNotNull() & 
    ~col("trending_date").rlike(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
).distinct().show(20, False)

print("COUNT COMPARISON")
valid_dates = raw_df.filter(col("trending_date").rlike(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")).count()
total_with_dates = raw_df.filter(col("trending_date").isNotNull()).count()
print(f"Valid dates: {valid_dates} / {total_with_dates}")

INVALID TRENDING_DATE VALUES
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|trending_date                                                                                                                                                                   |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| and hit songs features material by Warner Music Group artists such as Van Halen                                                                                                |
| Rod Wave                                                                                                                                                                       |
| etc…)1 onion1 potato1 carrot5tbsp canola oil3-5tbsp flour1 soup stock cube

## Khai báo hàm dataframe_info

In [7]:
# Helper function
def dataframe_info(df):
    print(f"{'-'*40}")
    print(f"Số dòng: {df.count()}, Số cột: {len(df.columns)}")
    print(f"{'-'*40}")
    df.printSchema()
    print(f"{'-'*40}")
    df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

dataframe_info(raw_df)

----------------------------------------
Số dòng: 404841, Số cột: 16
----------------------------------------
root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
+--------+------+-----------+---------+------------+----------+-------------+------+----------+------+--------+-------------+--------------+-----------------+-------

## Tiền xử lý dữ liệu

### 1, Xóa các cột không cần thiết

In [8]:
preprocessed_data = raw_df.drop('thumbnail_link', 'comments_disabled', 'video_error_or_removed', 'ratings_disabled')
dataframe_info(preprocessed_data)

----------------------------------------
Số dòng: 404841, Số cột: 13
----------------------------------------
root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
+--------+------+-----------+---------+------------+----------+-------------+------+----------+------+--------+-------------+-----------+
|video_id| title|publishedAt|channelId|channelTitle|categoryId|trending_date|  tags|view_count| likes|dislikes|comment_count|description|
+--------+------+-----------+---

### 2, Xóa các hàng có tất cả giá trị là Null

In [9]:
preprocessed_data = preprocessed_data.filter(
    reduce(lambda a, b: a | b, (col(c).isNotNull() for c in preprocessed_data.columns))
)
dataframe_info(preprocessed_data)

----------------------------------------
Số dòng: 404841, Số cột: 13
----------------------------------------
root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
+--------+------+-----------+---------+------------+----------+-------------+------+----------+------+--------+-------------+-----------+
|video_id| title|publishedAt|channelId|channelTitle|categoryId|trending_date|  tags|view_count| likes|dislikes|comment_count|description|
+--------+------+-----------+---

In [10]:
dataframe_info(preprocessed_data)

----------------------------------------
Số dòng: 404841, Số cột: 13
----------------------------------------
root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
+--------+------+-----------+---------+------------+----------+-------------+------+----------+------+--------+-------------+-----------+
|video_id| title|publishedAt|channelId|channelTitle|categoryId|trending_date|  tags|view_count| likes|dislikes|comment_count|description|
+--------+------+-----------+---

### 3, Xóa các hàng có trending_date sai định dạng

In [11]:
# Lọc dữ liệu video hợp lệ - chỉ giữ những dòng có trending_date đúng format
print("Before filtering:")
print(f"Total rows: {preprocessed_data.count()}")

# Lọc chỉ những dòng có trending_date đúng format ISO timestamp
preprocessed_data = preprocessed_data.filter(
    col("trending_date").rlike(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
)

print("After filtering:")
print(f"Valid rows: {preprocessed_data.count()}")
dataframe_info(preprocessed_data)

Before filtering:
Total rows: 404841
After filtering:
Valid rows: 268787
----------------------------------------
Số dòng: 268787, Số cột: 13
----------------------------------------
root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = true)

----------------------------------------
+--------+-----+-----------+---------+------------+----------+-------------+----+----------+-----+--------+-------------+-----------+
|video_id|title|publishedAt|channelId|channelTitle|categoryId|trending_date|tags|view_count|likes|dis

### 4, Điền giá trị Null cho description

In [12]:
preprocessed_data = preprocessed_data.fillna({"description": "No description"})
dataframe_info(preprocessed_data)

----------------------------------------
Số dòng: 268787, Số cột: 13
----------------------------------------
root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = false)

----------------------------------------
+--------+-----+-----------+---------+------------+----------+-------------+----+----------+-----+--------+-------------+-----------+
|video_id|title|publishedAt|channelId|channelTitle|categoryId|trending_date|tags|view_count|likes|dislikes|comment_count|description|
+--------+-----+-----------+---------+-

### 5, Chuẩn hóa dữ liệu

In [13]:
# Chuyển đổi timestamp - SỬA FORMAT CHO ĐÚNG
preprocessed_data = preprocessed_data.withColumn('trending_date', to_timestamp('trending_date', "yyyy-MM-dd'T'HH:mm:ss'Z'"))
preprocessed_data = preprocessed_data.withColumn('publishedAt', to_timestamp('publishedAt', "yyyy-MM-dd'T'HH:mm:ss'Z'"))

# Dataset riêng cho machine learning
ML_data = preprocessed_data
ML_data = ML_data.withColumn('tags', when(col('tags') == '[none]', '').otherwise(col('tags')))
ML_data = ML_data.withColumn('tags', split(regexp_replace('tags', '"', ''), '\\|'))

preprocessed_data.show(10)

+-----------+--------------------+-------------------+--------------------+--------------+----------+-------------------+--------------------+----------+------+--------+-------------+--------------------+
|   video_id|               title|        publishedAt|           channelId|  channelTitle|categoryId|      trending_date|                tags|view_count| likes|dislikes|comment_count|         description|
+-----------+--------------------+-------------------+--------------------+--------------+----------+-------------------+--------------------+----------+------+--------+-------------+--------------------+
|3C66w5Z0ixs|I ASKED HER TO BE...|2020-08-11 19:20:14|UCvtRTOMP2TqYqu51...|      Brawadis|        22|2020-08-12 00:00:00|brawadis|prank|ba...|   1514614|156908|    5855|        35313|SUBSCRIBE to BRAW...|
|M9Pmf9AB4Mo|Apex Legends | St...|2020-08-11 17:00:10|UC0ZV6M2THA81QT9h...|  Apex Legends|        20|2020-08-12 00:00:00|Apex Legends|Apex...|   2381688|146739|    2794|        165

In [14]:
dataframe_info(preprocessed_data)

----------------------------------------
Số dòng: 268787, Số cột: 13
----------------------------------------
root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: timestamp (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: timestamp (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- description: string (nullable = false)

----------------------------------------
+--------+-----+-----------+---------+------------+----------+-------------+----+----------+-----+--------+-------------+-----------+
|video_id|title|publishedAt|channelId|channelTitle|categoryId|trending_date|tags|view_count|likes|dislikes|comment_count|description|
+--------+-----+-----------+-----

## Lưu dữ liệu đã xử lý

In [15]:
# Convert timestamps to string để tránh lỗi khi save
preprocessed_save = preprocessed_data.withColumn('trending_date', 
    date_format('trending_date', 'yyyy-MM-dd HH:mm:ss')) \
    .withColumn('publishedAt', 
    date_format('publishedAt', 'yyyy-MM-dd HH:mm:ss'))

ML_save = ML_data.withColumn('trending_date', 
    date_format('trending_date', 'yyyy-MM-dd HH:mm:ss')) \
    .withColumn('publishedAt', 
    date_format('publishedAt', 'yyyy-MM-dd HH:mm:ss'))

# Lưu files
preprocessed_save.toPandas().to_csv('./data/preprocessed_data.csv', index=False)
ML_save.toPandas().to_csv('./data/ml_data.csv', index=False)
print("Dữ liệu đã được lưu")

Dữ liệu đã được lưu


In [16]:
# Stop Spark
spark.stop()