In [1]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
import re


In [2]:
import yaml 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import torch

In [3]:
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopword_tokens



In [5]:
from data_preprocess import create_data

In [6]:
import torch 
from torch_geometric.data import HeteroData
from torch_geometric.loader import LinkLoader, LinkNeighborLoader, NeighborLoader

In [7]:
config_file_path = 'config.yaml'

In [8]:
users, movies, train_ratings, val_ratings, test_ratings, config = create_data(config_file_path)

   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [9]:
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [10]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [11]:
train_ratings

Unnamed: 0,index,UserID,MovieID,Rating,Timestamp
0,1000138,6040,858,4,2000-04-25 23:05:32
1,1000153,6040,2384,4,2000-04-25 23:05:54
2,999873,6040,593,5,2000-04-25 23:05:54
3,1000007,6040,1961,4,2000-04-25 23:06:17
4,1000192,6040,2019,5,2000-04-25 23:06:17
...,...,...,...,...,...
800163,314151,1875,892,4,2000-12-02 14:51:59
800164,314073,1875,440,4,2000-12-02 14:52:18
800165,314225,1875,509,4,2000-12-02 14:52:18
800166,313950,1875,2065,4,2000-12-02 14:52:18


In [12]:
w2v_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300_2.bin', binary=True)

In [13]:
def compute_average_embedding(genres, w2v_model):
    embeddings = [w2v_model.get_vector(genre) for genre in genres if genre in w2v_model.index_to_key]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        # Return a zero vector if no genres are found in the model
        return np.zeros(w2v_model.vector_size)
    
def preprocess_genre(genre):
    genre = genre.lower()
    genre = genre.split('|')
    return genre
    
def preprocess_title(title):
    title = title.lower()
    title = title.split(' ')
    title = remove_stopword_tokens(title)
    title  = [re.sub(r"[^ a-zA-Z0-9]+",'',word) for word in title]
    title = [word.strip() for word in title]
    title = [word for word in title if len(word)]
    return title
    

In [14]:
preprocess_title('sd dsdbfDFjnv $%#$^#$ dskgjs,fs is the')

['sd', 'dsdbfdfjnv', 'dskgjsfs']

In [15]:
def transform(users, movies):
    users['Gender'] = users['Gender'].map({'M' : 1, 'F' : 0})
    # Regex to extract title and year
    movies[['Title', 'Year']] = movies['Title'].str.extract(r'^(.*?)(?: \((\d{4})\))?$')
    movies['Year'].fillna(0, inplace = True)
    movies['Genre_List'] = movies['Genres'].apply(preprocess_genre)
    movies['Genre_Embedding'] = movies['Genre_List'].apply(lambda x: compute_average_embedding(x, w2v_model))
    movies['Title_List'] = movies['Title'].apply(preprocess_title)
    movies['Title_Embedding'] = movies['Title_List'].apply(lambda x: compute_average_embedding(x, w2v_model))
    return users, movies

In [16]:
users, movies = transform(users, movies)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['Year'].fillna(0, inplace = True)


In [17]:
movies

Unnamed: 0,MovieID,Title,Genres,Year,Genre_List,Genre_Embedding,Title_List,Title_Embedding
0,1,Toy Story,Animation|Children's|Comedy,1995,"[animation, children's, comedy]","[0.041422527, -0.01586914, -0.013570149, 0.221...","[toy, story]","[0.13549805, 0.097717285, -0.06188965, 0.11779..."
1,2,Jumanji,Adventure|Children's|Fantasy,1995,"[adventure, children's, fantasy]","[0.18473308, -0.046101887, -0.07389323, 0.1487...",[jumanji],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men,Comedy|Romance,1995,"[comedy, romance]","[0.103881836, -0.122924805, -0.12606812, 0.246...","[grumpier, old, men]","[0.102864586, 0.12434896, 0.06526693, 0.038136..."
3,4,Waiting to Exhale,Comedy|Drama,1995,"[comedy, drama]","[0.08483887, -0.009887695, 0.0055236816, 0.202...","[waiting, exhale]","[0.12060547, 0.0087890625, 0.29052734, 0.05981..."
4,5,Father of the Bride Part II,Comedy,1995,[comedy],"[-0.029541016, -0.05834961, -0.0021362305, 0.3...","[father, bride, ii]","[-0.067708336, -0.09440104, 0.0230306, -0.0043..."
...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,Comedy,2000,[comedy],"[-0.029541016, -0.05834961, -0.0021362305, 0.3...","[meet, parents]","[-0.1940918, -0.033691406, 0.05908203, 0.08258..."
3879,3949,Requiem for a Dream,Drama,2000,[drama],"[0.19921875, 0.03857422, 0.013183594, 0.057861...","[requiem, dream]","[0.048095703, -0.06427002, 0.123046875, 0.1294..."
3880,3950,Tigerland,Drama,2000,[drama],"[0.19921875, 0.03857422, 0.013183594, 0.057861...",[tigerland],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3881,3951,Two Family House,Drama,2000,[drama],"[0.19921875, 0.03857422, 0.013183594, 0.057861...","[family, house]","[0.06921387, -0.09448242, -0.044799805, 0.0603..."


In [18]:
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,0,1,10,48067
1,2,1,56,16,70072
2,3,1,25,15,55117
3,4,1,45,7,02460
4,5,1,25,20,55455
...,...,...,...,...,...
6035,6036,0,25,15,32603
6036,6037,0,45,1,76006
6037,6038,0,56,1,14706
6038,6039,0,45,0,01060


In [19]:
train_ratings['MovieID'] = train_ratings['MovieID'] - 1
train_ratings['UserID'] = train_ratings['UserID'] - 1

In [20]:
train_pos_ratings = train_ratings[train_ratings['Rating'] >= 4]

In [21]:
train_pos_ratings

Unnamed: 0,index,UserID,MovieID,Rating,Timestamp
0,1000138,6039,857,4,2000-04-25 23:05:32
1,1000153,6039,2383,4,2000-04-25 23:05:54
2,999873,6039,592,5,2000-04-25 23:05:54
3,1000007,6039,1960,4,2000-04-25 23:06:17
4,1000192,6039,2018,5,2000-04-25 23:06:17
...,...,...,...,...,...
800163,314151,1874,891,4,2000-12-02 14:51:59
800164,314073,1874,439,4,2000-12-02 14:52:18
800165,314225,1874,508,4,2000-12-02 14:52:18
800166,313950,1874,2064,4,2000-12-02 14:52:18


In [22]:
torch.tensor(users[['Gender', 'Age', 'Occupation']].values)

tensor([[ 0,  1, 10],
        [ 1, 56, 16],
        [ 1, 25, 15],
        ...,
        [ 0, 56,  1],
        [ 0, 45,  0],
        [ 1, 25,  6]])

In [23]:
# from torch_geometric.data import HeteroData

# data = HeteroData()

# # Create two node types "paper" and "author" holding a feature matrix:
# data['paper'].x = torch.randn(num_papers, num_paper_features)
# data['author'].x = torch.randn(num_authors, num_authors_features)

# # Create an edge type "(author, writes, paper)" and building the
# # graph connectivity:
# data['author', 'writes', 'paper'].edge_index = ...  # [2, num_edges]

# data['paper'].num_nodes
# >>> 23

# data['author', 'writes', 'paper'].num_edges
# >>> 52

# # PyTorch tensor functionality:
# data = data.pin_memory()
# data = data.to('cuda:0', non_blocking=True)

In [24]:
data = HeteroData()

In [25]:
data['movie'].x = torch.tensor(movies['Genre_Embedding'].apply(lambda x : x.tolist()) + movies['Title_Embedding'].apply(lambda x : x.tolist())+ movies['Year'].apply(lambda x: [int(x)]))

In [26]:
data['users'].x = torch.tensor(users[['Gender', 'Age', 'Occupation']].values)

In [27]:
data['movie']

{'x': tensor([[ 4.1423e-02, -1.5869e-02, -1.3570e-02,  ..., -2.1179e-02,
         -4.3457e-02,  1.9950e+03],
        [ 1.8473e-01, -4.6102e-02, -7.3893e-02,  ...,  0.0000e+00,
          0.0000e+00,  1.9950e+03],
        [ 1.0388e-01, -1.2292e-01, -1.2607e-01,  ...,  4.6224e-02,
          2.0345e-01,  1.9950e+03],
        ...,
        [ 1.9922e-01,  3.8574e-02,  1.3184e-02,  ...,  0.0000e+00,
          0.0000e+00,  2.0000e+03],
        [ 1.9922e-01,  3.8574e-02,  1.3184e-02,  ...,  5.8807e-02,
         -6.7749e-02,  2.0000e+03],
        [ 1.9238e-01,  9.1553e-03,  1.2024e-02,  ...,  6.5918e-02,
         -3.7689e-03,  2.0000e+03]])}

In [28]:
data['users']

{'x': tensor([[ 0,  1, 10],
        [ 1, 56, 16],
        [ 1, 25, 15],
        ...,
        [ 0, 56,  1],
        [ 0, 45,  0],
        [ 1, 25,  6]])}

In [29]:
edge_index = train_pos_ratings[['UserID', 'MovieID']].values.T

In [30]:
data['user', 'likes', 'movie'].edge_index = torch.tensor(edge_index)

In [31]:
data

HeteroData(
  movie={ x=[3883, 601] },
  users={ x=[6040, 3] },
  (user, likes, movie)={ edge_index=[2, 463021] }
)

In [32]:
data

HeteroData(
  movie={ x=[3883, 601] },
  users={ x=[6040, 3] },
  (user, likes, movie)={ edge_index=[2, 463021] }
)

In [36]:
data['movie']

{'x': tensor([[ 4.1423e-02, -1.5869e-02, -1.3570e-02,  ..., -2.1179e-02,
         -4.3457e-02,  1.9950e+03],
        [ 1.8473e-01, -4.6102e-02, -7.3893e-02,  ...,  0.0000e+00,
          0.0000e+00,  1.9950e+03],
        [ 1.0388e-01, -1.2292e-01, -1.2607e-01,  ...,  4.6224e-02,
          2.0345e-01,  1.9950e+03],
        ...,
        [ 1.9922e-01,  3.8574e-02,  1.3184e-02,  ...,  0.0000e+00,
          0.0000e+00,  2.0000e+03],
        [ 1.9922e-01,  3.8574e-02,  1.3184e-02,  ...,  5.8807e-02,
         -6.7749e-02,  2.0000e+03],
        [ 1.9238e-01,  9.1553e-03,  1.2024e-02,  ...,  6.5918e-02,
         -3.7689e-03,  2.0000e+03]])}

In [37]:
data.edge_types

[('user', 'likes', 'movie')]

In [38]:
loader = NeighborLoader(
    data,
    # Sample 30 neighbors for each node for 2 iterations
    num_neighbors={key: [3] * 2 for key in data.edge_types},
    # Use a batch size of 128 for sampling training nodes
    batch_size=128,
)


: 