In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import os

for dirname, _, filenames in os.walk('data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data\item_embeddings.parquet
data\item_tower.pt
data\movies.dat
data\movies.parquet
data\movies_feature.parquet
data\ratings.dat
data\ratings.parquet
data\README
data\train_user_tower.parquet
data\users.dat
data\users.parquet
data\user_tower.pt


In [2]:
columns = ['movieid', 'title', 'genre']
movies = pd.read_table('data/movies.dat', names=columns, sep='::', encoding='latin-1', engine='python')

movies

Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [None]:
movies.to_parquet('data/movies.parquet', engine='pyarrow')

In [4]:
columns = ['userid', 'movieid', 'rating', 'timestamp']
ratings = pd.read_table('data/ratings.dat', names=columns, sep='::', encoding='latin-1', engine='python')

ratings

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [None]:
ratings.to_parquet('data/ratings.parquet', engine='pyarrow')

In [6]:
columns = ['userId', 'gender', 'age', 'occupation', 'zip-code']
users = pd.read_table('data/users.dat', names=columns, sep='::', encoding='latin-1', engine='python')

users

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [None]:
users.to_parquet('data/users.parquet', engine='pyarrow')

## Preprocessing

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
import os

ratings = pd.read_parquet("data/ratings.parquet")
users = pd.read_parquet("data/users.parquet")

positive = ratings[ratings['rating'] >= 4].copy() # keep ratings >= 4 as positive
positive = positive.merge(users, left_on='userid', right_on='userId')

positive = positive.dropna(subset=["gender", "age", "occupation"])

le_gender = LabelEncoder()
le_age = LabelEncoder()
le_occ = LabelEncoder()

positive['gender_id'] = le_gender.fit_transform(positive["gender"])
positive['age_id'] = le_age.fit_transform(positive['age'])
positive['occupation_id'] = le_occ.fit_transform(positive['occupation'])

positive_samples = positive[["userid", "movieid", "gender_id", "age_id", "occupation_id"]].copy()
positive_samples["label"] = 1

def generate_negative_samples(positive_df, all_movie_ids, neg_per_user=3):
    data = []
    grouped = positive_df.groupby("userid")

    for user_id, group in grouped:
        pos_movies = set(group["movieid"])
        candidate_neg = list(all_movie_ids - pos_movies)
        if not candidate_neg:
            continue

        neg_movies = random.sample(candidate_neg, min(len(candidate_neg), neg_per_user))
        user_info = group.iloc[0][['gender_id', 'age_id', 'occupation_id']]

        for movie_id in neg_movies:
            data.append({
                "userid": user_id,
                "movieid": movie_id,
                "gender_id": user_info["gender_id"],
                "age_id": user_info["age_id"],
                "occupation_id": user_info["occupation_id"],
                "label": 0
            })

    return pd.DataFrame(data)
    
all_movie_ids = set(ratings["movieid"].unique())
negative_samples = generate_negative_samples(positive_samples, all_movie_ids)

train_df = pd.concat([positive_samples, negative_samples], ignore_index=True)

train_df = train_df.dropna(subset=["gender_id", "age_id", "occupation_id", "movieid", "label"])

# Ensure the file is not locked or in use by another process
output_file = 'data/train_user_tower.parquet'
if os.path.exists(output_file):
    os.remove(output_file)  # Remove the file if it already exists

train_df.to_parquet(output_file, index=False)

print(f"训练样本数量：{len(train_df)}, 正样本数量：{len(positive_samples)}, 负样本数量：{len(negative_samples)}")


训练样本数量：593395, 正样本数量：575281, 负样本数量：18114


In [9]:
import re

movies = pd.read_parquet("data/movies.parquet")

def split_genres(df):
    all_genres = set()
    for g in df["genre"]:
        all_genres.update(g.split("|"))
    
    all_genres = sorted(list(all_genres))
    genre2id = {g: i for i, g in enumerate(all_genres)}

    genre_matrix = np.zeros((len(df), len(all_genres)))
    for idx, g_list in enumerate(df["genre"]):
        for g in g_list.split("|"):
            genre_matrix[idx][genre2id[g]] = 1
        
    genre_df = pd.DataFrame(genre_matrix, columns=[f"genre_{g}" for g in all_genres])
    return pd.concat([df.reset_index(drop=True), genre_df], axis=1), all_genres

movies, genre_list = split_genres(movies)

def extract_year(title):
    match = re.search(r"\((\d{4})\)", title)
    if match:
        return int(match.group(1))
    return 0

movies["year"] = movies["title"].apply(extract_year)
le_year = LabelEncoder()
movies["year_id"] = le_year.fit_transform(movies["year"])

output_path = "data/movies_feature.parquet"
if os.path.exists(output_path):
    os.remove(output_path)

movies.to_parquet(output_path, index=False)

print(f"Number of dimensions of genre: {len(genre_list)}")
print(f"Year range from: {min(movies['year'])} - {max(movies['year'])}")

Number of dimensions of genre: 18
Year range from: 1919 - 2000


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class UserTower(nn.Module):
    def __init__(self, num_genders, num_ages, num_occupations, emb_dims=32):
        super(UserTower, self).__init__()

        self.gender_emb = nn.Embedding(num_genders, emb_dims)
        self.age_emb = nn.Embedding(num_ages, emb_dims)
        self.occupation_emb = nn.Embedding(num_occupations, emb_dims)

        self.mlp = nn.Sequential(
            nn.Linear(emb_dims * 3, 128),
            nn.ReLU(),
            nn.Linear(128, emb_dims)
        )
    
    def forward(self, gender, age, occupation):
        gender_vec = self.gender_emb(gender)
        age_vec = self.age_emb(age)
        occup_vec = self.occupation_emb(occupation)

        x = torch.concat([gender_vec, age_vec, occup_vec], dim=-1)
        out = self.mlp(x)
        out = F.normalize(out, dim=-1)
        return out

class ItemTower(nn.Module):
    def __init__(self, num_movies, num_years, num_genres, emb_dim=32):
        super().__init__()
        self.movie_emb = nn.Embedding(num_movies, emb_dim)
        self.year_emb = nn.Embedding(num_years, emb_dim)
        self.genre_emb = nn.Linear(num_genres, emb_dim)

        self.mlp = nn.Sequential(
            nn.Linear(emb_dim * 3, 128),
            nn.ReLU(),
            nn.Linear(128, emb_dim)
        )

    def forward(self, movie_id, genre_vec, year_id):
        movie_vec = self.movie_emb(movie_id)
        year_vec = self.year_emb(year_id)
        genre_vec = self.genre_emb(genre_vec.float())

        x = torch.concat([movie_vec, year_vec, genre_vec], dim=-1)
        out = self.mlp(x)

        return F.normalize(out, dim=-1)

In [11]:
from torch.utils.data import Dataset

class UserItemDataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        try:
            sample = {
                "gender_id": torch.tensor(row["gender_id"], dtype=torch.long),
                "age_id": torch.tensor(row["age_id"], dtype=torch.long),
                "occupation_id": torch.tensor(row["occupation_id"], dtype=torch.long),
                "movie_id": torch.tensor(row["movieid"], dtype=torch.long),
                "label": torch.tensor(row["label"], dtype=torch.float)
            }
            return sample
        except Exception as e:
            print(f"❌ Error at index {index}")
            print(row)
            raise e

class MovieFeatureDataset(Dataset):
    def __init__(self, movie_df):
        self.movie_df = movie_df.reset_index(drop=True)
        self.genre_cols = [col for col in movie_df.columns if col.startswith("genre_")]

    def __len__(self):
        return len(self.movie_df)

    def __getitem__(self, idx):
        row = self.movie_df.iloc[idx]

        sample = {
            "movie_id": torch.tensor(int(row["movieid"]), dtype=torch.long), 
            "genre_vec": torch.tensor(row[self.genre_cols].values, dtype=torch.float),
            "year_id": torch.tensor(int(row["year_id"]), dtype=torch.long)
        }
        return sample

In [16]:
from torch.utils.data import DataLoader
from tqdm import tqdm

EMD_DIM = 32
BATCH_SIZE = 512
EPOCHS = 2
LR = 1e-3

df = pd.read_parquet("data/train_user_tower.parquet")
movie_df = pd.read_parquet("data/movies_feature.parquet")

genre_cols = [col for col in movie_df.columns if col.startswith("genre_")]
movie_df = movie_df.set_index("movieid")

dataset = UserItemDataset(df)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

user_tower = UserTower(
    num_genders=df["gender_id"].max() + 1,
    num_ages=df["age_id"].max() + 1,
    num_occupations=df["occupation_id"].max() + 1,
    emb_dims=EMD_DIM
).to(device)

item_tower = ItemTower(
    num_movies=movie_df.index.max() + 1,
    num_years=movie_df["year_id"].nunique(),
    num_genres=len(genre_cols),
    emb_dim=EMD_DIM,
).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(
    list(user_tower.parameters()) + list(item_tower.parameters()), lr=LR
)

In [None]:
for epoch in range(EPOCHS):
    user_tower.train()
    item_tower.train()
    total_loss = 0.0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        gender = batch["gender_id"].to(device)
        age = batch["age_id"].to(device)
        occupation = batch["occupation_id"].to(device)
        movie_ids = batch["movie_id"].to(device)
        label = batch["label"].to(device)

        genre_vecs = []
        year_ids = []

        for mid in movie_ids.cpu().numpy():
            row = movie_df.loc[mid]
            genre_vecs.append(row[genre_cols].values)
            year_ids.append(row["year_id"])

        genre_vecs = torch.tensor(genre_vecs, dtype=torch.float32).to(device)
        year_ids = torch.tensor(year_ids, dtype=torch.long).to(device)

        user_vec = user_tower(gender, age, occupation)
        item_vec = item_tower(movie_ids, genre_vecs, year_ids)

        logits = torch.sum(user_vec * item_vec, dim=1) # cosine similarity, since we already normalised both vectors, the denominator is 1, so its basically the dot product of both vectors
        loss = criterion(logits, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

torch.save(user_tower.state_dict(), "user_tower.pt")
torch.save(item_tower.state_dict(), "item_tower.pt")
print("Model Saved!")

Epoch 1/2:   2%|▏         | 22/1159 [00:04<04:06,  4.61it/s]


KeyboardInterrupt: 

In [None]:
movies = pd.read_parquet("data/movies_feature.parquet")
genre_cols = [col for col in movies.columns if col.startswith("genre_")]
movie_ids = movies["movieid"].tolist()

item_tower.load_state_dict(torch.load("item_tower.pt", map_location=device))
item_tower.eval()

movie_vectors = []
movie_id_list = []

with torch.no_grad():
    for i in tqdm(range(len(movies))):
        row = movies.iloc[i]

        movie_id = torch.tensor([row["movieid"]], dtype=torch.long).to(device)
        genre_vec = torch.tensor([row[genre_cols].values], dtype=torch.float32).to(device)
        year_id = torch.tensor([row["year_id"]], dtype=torch.long).to(device)

        item_vec = item_tower(movie_id, genre_vec, year_id)
        item_vec = item_vec.squeeze(0).cpu().numpy()

        movie_vectors.append(item_vec)
        movie_id_list.append(row["movieid"])

df = pd.DataFrame(movie_vectors)
df["movieid"] = movie_id_list

df.to_parquet("data/item_embeddings.parquet", index=False)
print("Saved item embedding to data/item_embedding.parquet")

100%|██████████| 3883/3883 [00:04<00:00, 821.56it/s]

Saved item embedding to data/item_embedding.parquet



