In [None]:
import os
import ast
import pandas as pd
from pathlib import Path

BASE = Path(r'D:\Project\web_mining\data\data')
PATHS = {
    'movies': BASE / 'movies_metadata.csv',
    'credits': BASE / 'credits.csv',
    'keywords': BASE / 'keywords.csv',
    'links': BASE / 'links.csv',
    'ratings': BASE / 'ratings.csv',
    'ratings_small': BASE / 'ratings_small.csv',
}
OUTDIR = Path('data/processed')
OUTDIR.mkdir(parents=True, exist_ok=True)
print('Paths set, output dir:', OUTDIR)

Paths set, output dir: data\processed


In [None]:
def safe_parse(x):
    try:
        return ast.literal_eval(x)
    except:
        return []

print('Helpers ready')

Helpers ready


In [None]:
movies_raw = pd.read_csv(PATHS['movies'], low_memory=False)
credits_raw = pd.read_csv(PATHS['credits'])
keywords_raw = pd.read_csv(PATHS['keywords'])
links_raw = pd.read_csv(PATHS['links'])
ratings_small_raw = pd.read_csv(PATHS['ratings_small'])
ratings_raw = pd.read_csv(PATHS['ratings'])
print('Loaded: movies, credits, keywords, links, ratings_small')
print('movies rows=', len(movies_raw), 'credits=', len(credits_raw), 'ratings=', len(ratings_raw))

Loaded: movies, credits, keywords, links, ratings_small
movies rows= 45466 credits= 45476 ratings= 26024289


In [None]:
movies = movies_raw.copy()

In [None]:
json_cols = [
    "genres",
    "production_companies",
    "production_countries",
    "spoken_languages",
    "belongs_to_collection"
]
for col in json_cols:
    movies[col] = movies[col].apply(safe_parse)

In [72]:
def clean_belongs_to_collection(x):
    if isinstance(x, dict):
        return {
            "id": x.get("id"),
            "name": x.get("name")
        }
    return None
movies["belongs_to_collection"] = movies["belongs_to_collection"].apply(
    clean_belongs_to_collection)

In [None]:
movies["release_date"] = pd.to_datetime(
    movies["release_date"], errors="coerce"
)
movies["release_year"] = movies["release_date"].dt.year

In [None]:
num_cols = [
    "budget", "revenue", "runtime",
    "popularity", "vote_average", "vote_count"
]

movies[num_cols] = movies[num_cols].apply(pd.to_numeric, errors="coerce")

In [75]:
movies["overview"] = movies["overview"].fillna("")
movies["tagline"] = movies["tagline"].fillna("")
movies["runtime"] = movies["runtime"].fillna(movies["runtime"].median())
movies["budget"] = movies["budget"].fillna(0)
movies["revenue"] = movies["revenue"].fillna(0)

In [None]:
drop_cols = [
    "poster_path", "homepage",
    "spoken_languages", "original_title", "original_language",
    "imdb_id"
]

movies = movies.drop(columns=drop_cols)

In [77]:
movies.to_csv("movies_cleaned.csv", index=False)

In [None]:
credits = credits_raw.copy()
credits["cast"] = credits["cast"].apply(safe_parse)
credits["crew"] = credits["crew"].apply(safe_parse)

In [None]:
def get_all_cast(cast_list):
    if not isinstance(cast_list, list):
        return []
    return [c.get("name") for c in cast_list if "name" in c]

credits["top_cast"] = credits["cast"].apply(get_all_cast)

In [None]:
def get_director(crew_list):
    if not isinstance(crew_list, list):
        return None
    for person in crew_list:
        if person.get("job") == "Director":
            return person.get("name")
    return None

credits["director"] = credits["crew"].apply(get_director)

In [None]:
def get_job(crew_list, job_name):
    if not isinstance(crew_list, list):
        return []
    return [
        p["name"] for p in crew_list
        if p.get("job") == job_name
    ]

credits["writers"] = credits["crew"].apply(
    lambda x: get_job(x, "Writer")
)

credits["producers"] = credits["crew"].apply(
    lambda x: get_job(x, "Producer")
)

In [None]:
credits_clean = credits.drop(columns=["cast", "crew"])

In [9]:
credits_clean.to_csv("credits_clean.csv", index=False)

In [None]:
keywords = keywords_raw.copy()
keywords["keywords"] = keywords["keywords"].apply(safe_parse)
keywords["keyword_names"] = keywords["keywords"].apply(
    lambda x: [k["name"] for k in x] if isinstance(x, list) else []
)
keywords_clean = keywords.drop(columns=["keywords"])

In [25]:
keywords_clean.to_csv("keywords_clean.csv", index=False)

In [None]:
import pandas as pd

ratings = pd.read_csv(r"D:\Project\web_mining\data\data\ratings_small.csv")

ratings["timestamp"] = ratings["timestamp"].astype(int)

train_list = []
test_list = []

for user_id, user_df in ratings.groupby("userId"):
    user_df = user_df.sort_values("timestamp")
    split_idx = int(len(user_df) * 0.80)
    train_list.append(user_df.iloc[:split_idx])
    test_list.append(user_df.iloc[split_idx:])

train_df = pd.concat(train_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

train_df.to_csv(r"D:\Project\web_mining\notebooks\ratings_train.csv", index=False)
test_df.to_csv(r"D:\Project\web_mining\notebooks\ratings_test.csv", index=False)

In [None]:
import pandas as pd

def clean_movie_id(df, col="movieId"):
    df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.dropna(subset=[col])
    df[col] = df[col].astype(int)
    return df


movies = pd.read_csv(r"D:\Project\web_mining\notebooks\movies_cleaned.csv", low_memory=False)
credits = pd.read_csv(r"D:\Project\web_mining\notebooks\credits_clean.csv")
keywords = pd.read_csv(r"D:\Project\web_mining\notebooks\keywords_clean.csv")

movies = movies.rename(columns={"id": "movieId"})
credits = credits.rename(columns={"id": "movieId"})
keywords = keywords.rename(columns={"id": "movieId"})

ratings_train = pd.read_csv(r"D:\Project\web_mining\notebooks\ratings_train.csv")
ratings_test  = pd.read_csv(r"D:\Project\web_mining\notebooks\ratings_test.csv")

movies = clean_movie_id(movies)
credits = clean_movie_id(credits)
keywords = clean_movie_id(keywords)
ratings_train = clean_movie_id(ratings_train)
ratings_test  = clean_movie_id(ratings_test)


movie_meta = movies.merge(
    credits,
    on="movieId",
    how="inner"
).merge(
    keywords,
    on="movieId",
    how="inner"
)

print("Movie meta shape:", movie_meta.shape)

train_merged = ratings_train.merge(
    movie_meta,
    on="movieId",
    how="inner"
)

test_merged = ratings_test.merge(
    movie_meta,
    on="movieId",
    how="inner"
)

movie_feature_cols = train_merged.columns[4:]

train_merged = train_merged.dropna(
    subset=movie_feature_cols,
    how="all"
)

test_merged = test_merged.dropna(
    subset=movie_feature_cols,
    how="all"
)


print("Train final shape:", train_merged.shape)
print("Test final shape :", test_merged.shape)

print("Empty movie rows (train):",
      train_merged.iloc[:, 4:].isna().all(axis=1).sum())

print("Empty movie rows (test):",
      test_merged.iloc[:, 4:].isna().all(axis=1).sum())


train_merged.to_csv(
    r"D:\Project\web_mining\notebooks\ratings_train_clean.csv",
    index=False
)

test_merged.to_csv(
    r"D:\Project\web_mining\notebooks\ratings_test_clean.csv",
    index=False
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(int)


Movie meta shape: (46628, 25)
Train final shape: (36757, 28)
Test final shape : (8285, 28)
Empty movie rows (train): 0
Empty movie rows (test): 0


In [None]:
import pandas as pd
import ast

file_path = r"D:\Project\web_mining\notebooks\ratings_test_clean.csv"

df = pd.read_csv(file_path)

def clean_list_column(value):
    if pd.isna(value):
        return value

    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except:
            return value

    if isinstance(value, list):
        cleaned = []
        for item in value:
            if isinstance(item, dict):
                vals = [v for k, v in item.items() if k != "id"]
                if len(vals) == 1:
                    cleaned.append(vals[0])
                else:
                    cleaned.append(vals)
            else:
                cleaned.append(item)
        return cleaned

    return value
for col in df.columns:
    df[col] = df[col].apply(clean_list_column)

df.to_csv(file_path, index=False, encoding="utf-8")

✅ Đã làm sạch toàn bộ cột list (bỏ id) và ghi đè file thành công!


In [None]:
import pandas as pd
import ast

file_path = r"D:\Project\web_mining\notebooks\ratings_train_clean.csv"

df = pd.read_csv(file_path)

def clean_list_column(value):
    if pd.isna(value):
        return value

    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except:
            return value

    if isinstance(value, list):
        cleaned = []
        for item in value:
            if isinstance(item, dict):
                vals = [v for k, v in item.items() if k != "id"]
                if len(vals) == 1:
                    cleaned.append(vals[0])
                else:
                    cleaned.append(vals)
            else:
                cleaned.append(item)
        return cleaned

    return value

for col in df.columns:
    df[col] = df[col].apply(clean_list_column)

df.to_csv(file_path, index=False, encoding="utf-8")

In [None]:
import pandas as pd
import ast
from sentence_transformers import SentenceTransformer


def embed_overview_and_keywords(file_path, batch_size=32):
    """
    Đọc file CSV, tạo embedding BERT cho:
    - overview       
    - keyword_names  
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    df = pd.read_csv(file_path)
    def parse_list(x):
        if pd.isna(x):
            return []
        if isinstance(x, list):
            return x
        try:
            return ast.literal_eval(x)
        except:
            return []

    df["overview"] = df["overview"].fillna("")

    overview_embeddings = model.encode(
        df["overview"].tolist(),
        batch_size=batch_size,
        show_progress_bar=True
    )

    df["overview_bert"] = list(overview_embeddings)
    df["keyword_names"] = df["keyword_names"].apply(parse_list)
    df["keyword_text"] = df["keyword_names"].apply(
        lambda x: " and ".join(x) if len(x) > 0 else ""
    )

    keyword_embeddings = model.encode(
        df["keyword_text"].tolist(),
        batch_size=batch_size,
        show_progress_bar=True
    )

    df["keyword_names_bert"] = list(keyword_embeddings)
    df.drop(columns=["keyword_text"], inplace=True)

    return df

In [None]:
file_path = r"D:\Project\web_mining\notebooks\ratings_test_clean.csv"
df_embedded = embed_overview_and_keywords(file_path)
df_embedded.to_csv(r"D:\Project\web_mining\notebooks\ratings_test_clean_bert.csv", index=False, encoding="utf-8")

Batches: 100%|██████████| 259/259 [01:19<00:00,  3.26it/s]
Batches: 100%|██████████| 259/259 [00:34<00:00,  7.56it/s]


In [None]:
file_path = r"D:\Project\web_mining\notebooks\ratings_train_clean.csv"
df_embedded = embed_overview_and_keywords(file_path)
df_embedded.to_csv(r"D:\Project\web_mining\notebooks\ratings_train_clean_bert.csv", index=False, encoding="utf-8")

Batches: 100%|██████████| 1149/1149 [05:53<00:00,  3.25it/s]
Batches: 100%|██████████| 1149/1149 [02:21<00:00,  8.09it/s]


In [None]:
import pandas as pd
df_train = pd.read_csv(r"D:\Project\web_mining\notebooks\ratings_train_clean_bert.csv")

RATING_THRESHOLD = 3.5

df_train = df_train[df_train["rating"] >= RATING_THRESHOLD]
train_users = set(df_train["user_id"].unique())

df_train = df_train[df_train["user_id"].isin(train_users)]

def keep_latest_fraction(df, frac=0.5):
    df = df.sort_values(["user_id", "timestamp"])

    def select_latest(group):
        n_keep = max(1, int(len(group) * frac))
        return group.tail(n_keep)

    return df.groupby("user_id", group_keys=False).apply(select_latest)

In [None]:
df_latest = keep_latest_fraction(df_train, frac=0.5)
df_latest.to_csv(r"D:\Project\web_mining\notebooks\ratings_50_latest.csv", index=False)