In [1]:
# Imports and paths
import os
import ast
import pandas as pd
from pathlib import Path

# Root folder holding the raw dataset CSVs. Set the WEB_MINING_DATA_DIR
# environment variable to point the notebook at another location; when it is
# not set, the original local folder is used.
BASE = Path(os.environ.get('WEB_MINING_DATA_DIR', r'D:\Project\web_mining\data\data'))

# Logical dataset name -> CSV file inside BASE.
PATHS = {
    'movies': BASE / 'movies_metadata.csv',
    'credits': BASE / 'credits.csv',
    'keywords': BASE / 'keywords.csv',
    'links': BASE / 'links.csv',
    'ratings': BASE / 'ratings.csv',
    'ratings_small': BASE / 'ratings_small.csv',
}

# Relative output folder (resolved against the current working directory);
# created up front, including missing parents, and left alone if it exists.
OUTDIR = Path('data/processed')
OUTDIR.mkdir(parents=True, exist_ok=True)
print('Paths set, output dir:', OUTDIR)

Paths set, output dir: data\processed


In [2]:
# Helper: safe parse JSON-like string columns (e.g., genres containing [{'id':..,'name':'..'}, ...])
def safe_parse(x):
    """Parse a stringified Python literal such as "[{'id': 1, 'name': 'Drama'}]".

    Parameters
    ----------
    x : object
        Usually a string holding a Python literal. Values that are already a
        list or dict are returned unchanged, so re-running a parsing cell on
        an already-parsed column does not replace its contents with [].

    Returns
    -------
    object
        The parsed literal, or an empty list when ``x`` cannot be parsed
        (for example NaN, an empty string, or malformed text).
    """
    if isinstance(x, (list, dict)):
        return x
    try:
        return ast.literal_eval(x)
    # Only the errors literal_eval is documented to raise on bad input; a bare
    # except would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

print('Helpers ready')

Helpers ready


In [3]:
# Load the raw CSV files. low_memory=False makes pandas parse movies_metadata
# in one pass instead of in chunks, which avoids mixed-type column inference.
movies_raw = pd.read_csv(PATHS['movies'], low_memory=False)
credits_raw = pd.read_csv(PATHS['credits'])
keywords_raw = pd.read_csv(PATHS['keywords'])
links_raw = pd.read_csv(PATHS['links'])
# Both rating files are loaded: the small one for quick development,
# the full one for the complete run.
ratings_small_raw = pd.read_csv(PATHS['ratings_small'])
ratings_raw = pd.read_csv(PATHS['ratings'])
# The status line now lists every table loaded above, including the full ratings file.
print('Loaded: movies, credits, keywords, links, ratings_small, ratings')
print('movies rows=', len(movies_raw), 'credits=', len(credits_raw), 'ratings=', len(ratings_raw))

Loaded: movies, credits, keywords, links, ratings_small
movies rows= 45466 credits= 45476 ratings= 26024289


In [None]:
# Work on a copy so that movies_raw keeps the data exactly as loaded.
movies = movies_raw.copy()

In [71]:
## Convert the JSON-like string columns into Python objects

json_cols = [
    "genres", "production_companies", "production_countries",
    "spoken_languages", "belongs_to_collection",
]
# Parse every listed column with safe_parse and swap the parsed versions in.
movies = movies.assign(
    **{name: movies[name].apply(safe_parse) for name in json_cols}
)

In [72]:
def clean_belongs_to_collection(x):
    """Reduce a parsed collection entry to its "id" and "name" fields.

    Returns a new dict with exactly the keys "id" and "name" (a missing key
    maps to None) when ``x`` is a dict, and None for any other input.
    """
    if not isinstance(x, dict):
        return None
    return {field: x.get(field) for field in ("id", "name")}
# Keep only id/name for each collection entry; non-dict values become None.
movies = movies.assign(
    belongs_to_collection=movies["belongs_to_collection"].apply(clean_belongs_to_collection)
)

In [73]:
# release_date: parse to datetimes (values that cannot be parsed are coerced
# to NaT), then derive the release year from the parsed dates.
parsed_release = pd.to_datetime(movies["release_date"], errors="coerce")
movies = movies.assign(
    release_date=parsed_release,
    release_year=parsed_release.dt.year,
)

In [74]:
# Normalize the numeric columns: values that cannot be converted become NaN.
num_cols = ["budget", "revenue", "runtime", "popularity", "vote_average", "vote_count"]

numeric_part = movies[num_cols].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)
movies[num_cols] = numeric_part

In [75]:
# Replacement for missing values, per column: empty text for the text fields,
# the column median for runtime, and zero for the money columns.
fill_values = {
    "overview": "",
    "tagline": "",
    "runtime": movies["runtime"].median(),
    "budget": 0,
    "revenue": 0,
}
for column, filler in fill_values.items():
    movies[column] = movies[column].fillna(filler)

In [None]:
## Drop unnecessary columns
drop_cols = [
    "poster_path",
    "homepage",
    "spoken_languages",
    "original_title",
    "original_language",
    "imdb_id",
]

# Raises KeyError if any listed column is missing (e.g. when this cell is re-run).
movies = movies.drop(labels=drop_cols, axis="columns")

In [77]:
# Write the cleaned movies table to CSV without the index column.
# NOTE(review): this relative path resolves against the current working
# directory, not OUTDIR (which the setup cell creates) — confirm the intended
# output location.
movies.to_csv("movies_cleaned.csv", index=False)

In [4]:
# Clean credits: 
# Build the working credits table from the raw one, with the stringified
# cast and crew columns parsed by safe_parse. assign() returns a new frame,
# so credits_raw is left unchanged.
credits = credits_raw.assign(
    cast=credits_raw["cast"].apply(safe_parse),
    crew=credits_raw["crew"].apply(safe_parse),
)

In [5]:
# Extract cast member names
def get_all_cast(cast_list, top_k=None):
    """Return the names found in a parsed cast list.

    Parameters
    ----------
    cast_list : object
        Expected to be a list of dicts; any other type yields an empty list.
    top_k : int or None, optional
        When given (non-negative), keep only the first ``top_k`` names in list
        order. The default, None, keeps every name.
        NOTE(review): "first" means position in ``cast_list``; whether that
        matches billing order depends on the data — confirm before relying on it.

    Returns
    -------
    list
        The value stored under "name" for each dict entry that has that key.
        Entries that are not dicts or have no "name" key are skipped instead
        of raising.
    """
    if not isinstance(cast_list, list):
        return []
    names = [
        member["name"]
        for member in cast_list
        if isinstance(member, dict) and "name" in member
    ]
    return names if top_k is None else names[:top_k]

# One list of cast names per row, stored in the "top_cast" column.
credits = credits.assign(top_cast=credits["cast"].apply(get_all_cast))

In [6]:
# Extract the director
def get_director(crew_list):
    """Return the name of the first crew entry whose job is "Director".

    Parameters
    ----------
    crew_list : object
        Expected to be a list of dicts; any other type yields None.

    Returns
    -------
    object
        The "name" value of the first dict entry with ``job == "Director"``
        (None if that entry has no "name" key), or None when there is no such
        entry. Entries that are not dicts are skipped instead of raising.
    """
    if not isinstance(crew_list, list):
        return None
    director_names = (
        person.get("name")
        for person in crew_list
        if isinstance(person, dict) and person.get("job") == "Director"
    )
    # Only the first match is used; later directors are ignored.
    return next(director_names, None)

# First director's name per row (None when the crew list has no director).
credits = credits.assign(director=credits["crew"].apply(get_director))

In [7]:
# Extract writers / producers
def get_job(crew_list, job_name):
    """Return the names of all crew entries whose job equals ``job_name``.

    Parameters
    ----------
    crew_list : object
        Expected to be a list of dicts; any other type yields an empty list.
    job_name : str
        Exact value to match against each entry's "job" field.

    Returns
    -------
    list
        The "name" values of the matching entries, in list order. Entries that
        are not dicts, or that match but have no "name" key, are skipped
        instead of raising.
    """
    if not isinstance(crew_list, list):
        return []
    return [
        person["name"]
        for person in crew_list
        if isinstance(person, dict)
        and person.get("job") == job_name
        and "name" in person
    ]

# Names of the crew members whose job is exactly "Writer" / "Producer",
# one list per row. Series.apply forwards job_name to get_job as a keyword.
credits["writers"] = credits["crew"].apply(get_job, job_name="Writer")
credits["producers"] = credits["crew"].apply(get_job, job_name="Producer")

In [8]:
# Drop the parsed cast/crew columns, keeping the columns derived from them.
credits_clean = credits.drop(labels=["cast", "crew"], axis="columns")


In [9]:
# Write the cleaned credits table to CSV without the index column.
# NOTE(review): the relative path resolves against the current working
# directory, not OUTDIR — confirm the intended output location.
credits_clean.to_csv("credits_clean.csv", index=False)

In [23]:
# Clean keywords: parse the stringified keyword lists and keep only the names.

keywords = keywords_raw.copy()
keywords["keywords"] = keywords["keywords"].apply(safe_parse)
# Entries that are not dicts or have no "name" key are skipped, so one
# malformed entry cannot abort the whole apply with a KeyError/TypeError.
keywords["keyword_names"] = keywords["keywords"].apply(
    lambda entries: [
        k["name"] for k in entries if isinstance(k, dict) and "name" in k
    ] if isinstance(entries, list) else []
)
# The parsed dict column is dropped; only the list of names is kept.
keywords_clean = keywords.drop(columns=["keywords"])

In [25]:
# Write the cleaned keywords table to CSV without the index column.
# NOTE(review): the relative path resolves against the current working
# directory, not OUTDIR — confirm the intended output location.
keywords_clean.to_csv("keywords_clean.csv", index=False)

In [10]:
### train test split
import pandas as pd

# Read the full ratings table from the path registered in PATHS
# (the same file as before, without repeating the absolute path here).
ratings = pd.read_csv(PATHS['ratings'])

# Make sure timestamps are integers. int64 is requested explicitly so the
# width does not depend on what plain `int` maps to on the platform.
ratings["timestamp"] = ratings["timestamp"].astype("int64")

# Share of each user's earliest ratings that goes to the training set.
TRAIN_FRACTION = 0.85

# Order rows by user, then chronologically within each user. Two stable
# single-key sorts give that ordering and keep the file order for ratings of
# the same user with equal timestamps, so the split boundary is reproducible.
ratings_sorted = (
    ratings
    .sort_values("timestamp", kind="stable")
    .sort_values("userId", kind="stable")
    .reset_index(drop=True)
)

# Per-user split rule, computed for all users at once: with n ratings for a
# user, the first int(n * TRAIN_FRACTION) go to train and the rest to test.
# NOTE(review): a user with a single rating gets no training rows
# (int(1 * 0.85) == 0) — confirm this is acceptable for evaluation.
by_user = ratings_sorted.groupby("userId", sort=False)
position_in_user = by_user.cumcount()
train_rows_for_user = (
    by_user["timestamp"].transform("size") * TRAIN_FRACTION
).astype("int64")
is_train = position_in_user < train_rows_for_user

train_df = ratings_sorted[is_train].reset_index(drop=True)
test_df = ratings_sorted[~is_train].reset_index(drop=True)

# Every rating must land in exactly one of the two sets.
assert len(train_df) + len(test_df) == len(ratings)

train_df.to_csv(r"D:\Project\web_mining\notebooks\ratings_train.csv", index=False)
test_df.to_csv(r"D:\Project\web_mining\notebooks\ratings_test.csv", index=False)

**Next steps / notes**:
- Run cells in order.
- For large-scale work, use `pd.read_csv(..., chunksize=...)` for `ratings.csv`.
- Adjust which features to extract (top-k cast, top keywords).
- Consider storing intermediate results as parquet or Feather for speed.
