In [1]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt


In [2]:
import yaml 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import torch 

In [3]:
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from gensim.models import Word2Vec



In [5]:
from data_preprocess import create_data

In [6]:
# Path to the YAML configuration file handed to create_data() in the next cell.
config_file_path = 'config.yaml'

In [7]:
# Load the user and movie tables, the train/validation/test rating splits and the
# config object from the project-local data_preprocess helper.
# NOTE(review): the table previews shown under this cell are presumably printed
# inside create_data() — confirm against data_preprocess.py.
users, movies, train_ratings, val_ratings, test_ratings, config = create_data(config_file_path)

   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [8]:
# Encode Gender as a binary column (M=0, F=1).
# The mapping also accepts already-encoded 0/1 values so that re-running this
# cell is a no-op instead of turning the whole column into NaN.
gender_codes = users['Gender'].map({'M': 0, 'F': 1, 0: 0, 1: 1})
if gender_codes.isna().any():
    # Fail loudly rather than letting NaN leak into the feature matrix.
    raise ValueError("users['Gender'] contains values other than 'M'/'F'")
users['Gender'] = gender_codes.astype(int)

# One-hot encode Age and Occupation. Each column gets its own encoder so the
# fitted category -> column mapping of both stays available afterwards.
age_encoder = OneHotEncoder()
age_encoded = age_encoder.fit_transform(users[['Age']]).toarray()

occupation_encoder = OneHotEncoder()
occupation_encoded = occupation_encoder.fit_transform(users[['Occupation']]).toarray()

# Keep the original name bound to the last-fitted (Occupation) encoder for any
# later cell that still refers to `encoder`.
encoder = occupation_encoder

# Combine all features: [gender | age one-hot | occupation one-hot], one row per
# row of `users` (same order).
user_features = np.hstack([users[['Gender']].values, age_encoded, occupation_encoded])

print("User Features Shape:", user_features.shape)

User Features Shape: (6040, 29)


In [9]:
# Extract the four-digit year in parentheses from the title, e.g. "Toy Story (1995)".
movies['Year'] = movies['Title'].str.extract(r'\((\d{4})\)', expand=False)
movies['Year'] = movies['Year'].fillna(0).astype(int)  # 0 marks titles without a parsable year


# Preprocess Genres: split the pipe-delimited string into a list of genres.
# The isinstance guard makes the cell safe to re-run: entries that are already
# lists are left untouched (calling .str.split('|') a second time does not
# produce genre lists any more and would break the Word2Vec input below).
movies['Genres'] = movies['Genres'].apply(
    lambda genres: genres.split('|') if isinstance(genres, str) else genres
)

# Preprocess Titles: drop the "(YYYY)" suffix, lowercase, split on whitespace.
movies['TitleTokens'] = movies['Title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip().str.lower().str.split()

# Training corpus for Word2Vec: every genre list and every title token list is
# treated as one "sentence".
all_text = movies['Genres'].tolist() + movies['TitleTokens'].tolist()

# Train Word2Vec model.
# A fixed seed together with a single worker thread makes the embeddings
# repeatable between runs; multi-threaded training is order-dependent.
# NOTE(review): per the gensim docs, full determinism additionally requires a
# fixed PYTHONHASHSEED for the kernel process.
w2v_model = Word2Vec(sentences=all_text, vector_size=100, window=5, min_count=1, workers=1, seed=42)

# Function to compute average Word2Vec embeddings
def compute_avg_w2v(tokens, model):
    """
    Computes the average Word2Vec embedding for a list of tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to look up. Tokens missing from ``model.wv`` are skipped.
    model : object
        Trained model exposing ``wv`` (supports ``token in wv`` and
        ``wv[token]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.vector_size``: the element-wise mean of
        the vectors of all known tokens, or a float32 zero vector when no
        token is known (including an empty ``tokens``).
    """
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not known_vectors:
        # Use float32 so the fallback matches the dtype of the learned vectors;
        # a float64 fallback would upcast the whole matrix once rows are stacked.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

# Mean Word2Vec vector of each movie's genre list.
movies['GenreEmbedding'] = pd.Series(
    [compute_avg_w2v(genre_tokens, w2v_model) for genre_tokens in movies['Genres']],
    index=movies.index,
)

# Mean Word2Vec vector of each movie's title tokens.
movies['TitleEmbedding'] = pd.Series(
    [compute_avg_w2v(title_tokens, w2v_model) for title_tokens in movies['TitleTokens']],
    index=movies.index,
)

# Per movie, concatenate the genre vector followed by the title vector.
movies['CombinedEmbedding'] = pd.Series(
    [
        np.hstack([genre_vec, title_vec])
        for genre_vec, title_vec in zip(movies['GenreEmbedding'], movies['TitleEmbedding'])
    ],
    index=movies.index,
)

# Stack the per-movie vectors into one 2-D feature matrix (one row per movie,
# in the row order of `movies`).
movie_features = np.vstack(movies['CombinedEmbedding'].tolist())

print("Movie Features Shape:", movie_features.shape)

Movie Features Shape: (3883, 200)


In [10]:
# Display the stacked movie feature matrix built in the previous cell.
movie_features

array([[ 3.6688002e-03,  9.2054838e-03,  3.6766834e-03, ...,
        -8.4105134e-03,  1.8819816e-03,  3.8683200e-03],
       [-2.2051409e-03,  1.1449152e-02,  7.9483362e-03, ...,
         4.3088472e-03,  6.9524052e-05, -4.2605507e-03],
       [-4.7413115e-03,  4.3921703e-03, -3.4589744e-03, ...,
        -7.8097121e-03, -7.9876771e-03,  8.1994338e-05],
       ...,
       [-1.2162661e-03,  3.9624390e-03,  5.8436645e-03, ...,
         3.5839546e-03, -8.4574390e-03,  3.1285023e-03],
       [-1.2162661e-03,  3.9624390e-03,  5.8436645e-03, ...,
        -4.9603912e-03, -1.0659077e-02,  3.0752320e-03],
       [-4.7636400e-03,  4.4883694e-03, -6.8198680e-04, ...,
        -4.7323912e-02, -5.0380740e-02,  5.9759859e-02]], dtype=float32)

In [11]:
from torch_geometric.data import HeteroData

In [12]:
# Start from an empty heterogeneous graph; node features are assigned to it in the next cell.
data = HeteroData()

In [None]:
# Node features, stored as float tensors (row i of each matrix describes row i
# of `users` / `movies`).
data['users'].x = torch.as_tensor(user_features, dtype=torch.float)
data['movies'].x = torch.as_tensor(movie_features, dtype=torch.float)

# Edge endpoints must be node row positions, not raw IDs: the IDs are 1-based
# (see the table previews above) while feature rows are 0-based, and IDs need
# not be contiguous. Translate each rating's IDs to the row position of the
# matching user / movie.
user_rows = pd.Index(users['UserID']).get_indexer(test_ratings['UserID'])
movie_rows = pd.Index(movies['MovieID']).get_indexer(test_ratings['MovieID'])
if (user_rows < 0).any() or (movie_rows < 0).any():
    # get_indexer marks IDs that are absent from the lookup table with -1.
    raise ValueError("test_ratings references a UserID or MovieID that has no feature row")

# NOTE(review): edges are built from test_ratings; confirm this is intended
# rather than train_ratings.
# Shape (2, num_ratings): row 0 = user node index, row 1 = movie node index.
edge_index_user_to_movie = torch.tensor(np.vstack([user_rows, movie_rows]), dtype=torch.long)