In [1]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import Data

In [2]:
# Load the blog metadata, discard the columns listed in `irrelevant_cols`,
# and replace the `topic` values with integer class labels.
irrelevant_cols = ['blog_title', 'blog_content', 'blog_link', 'blog_img', 'scrape_time']
blog_data_df = (
    pd.read_csv('data/blog_data.csv')
    .drop(columns=irrelevant_cols)
    .assign(topic=lambda frame: LabelEncoder().fit_transform(frame['topic']))
)

blog_data_df.tail()

Unnamed: 0,blog_id,author_id,topic
10462,10489,6867,21
10463,10490,2490,21
10464,10491,6810,21
10465,10492,6868,21
10466,10493,6750,21


In [3]:
# Load the user/blog interactions, normalise the column names, and recode the
# raw rating values onto the 2..5 scale used as the edge weight.
rating_to_interaction = {0.5: 2, 2: 3, 3.5: 4, 5: 5}
interactions_df = (
    pd.read_csv('data/blog_interactions.csv')
    .rename(columns={'userId': 'user_id', 'ratings': 'interaction'})
    .replace({'interaction': rating_to_interaction})
    # Missing ratings get the lowest interaction level (2).
    .assign(interaction=lambda frame: frame['interaction'].fillna(2).astype(float))
)

interactions_df['interaction'].describe()

count    200140.000000
mean          3.744979
std           1.178742
min           2.000000
25%           3.000000
50%           4.000000
75%           5.000000
max           5.000000
Name: interaction, dtype: float64

In [4]:
# Map every original blog id to its row position in blog_data_df so that blog
# ids become contiguous 0-based indices, then apply the same mapping to the
# interactions. Ids absent from the mapping become NaN after `.map`.
blog_ids_map = dict(zip(blog_data_df['blog_id'], range(len(blog_data_df))))
blog_data_df['blog_id'] = blog_data_df['blog_id'].map(blog_ids_map)
interactions_df['blog_id'] = interactions_df['blog_id'].map(blog_ids_map)
interactions_df.head()

Unnamed: 0,blog_id,user_id,interaction
0,8998,11,4.0
1,9293,11,5.0
2,9219,11,4.0
3,9404,11,5.0
4,848,11,3.0


In [5]:
# Keep only blogs whose (re-indexed) id does not exceed the highest blog id that
# appears in the interactions. Blogs below that id without any interaction are
# kept as well.
last_interacted_blog_id = interactions_df['blog_id'].max()
# .copy() makes the filtered frame independent of the original, so columns can
# be added to it later without pandas raising SettingWithCopyWarning.
blog_data_df = blog_data_df.query('blog_id <= @last_interacted_blog_id').copy()
blog_data_df.tail()

Unnamed: 0,blog_id,author_id,topic
9724,9724,6406,1
9725,9725,5658,1
9726,9726,6407,1
9727,9727,6343,1
9728,9728,6408,1


In [6]:
# Mean interaction per blog. The 'interaction' column is selected by name so the
# result cannot silently pick up a different column if the frame's layout changes.
blog_rating_groups = interactions_df.drop('user_id', axis=1).groupby('blog_id')
blog_mean_ratings = blog_rating_groups['interaction'].mean().to_dict()
keys = list(blog_mean_ratings)

# One target value per unique blog id, in blog_data_df order; blogs that were
# never interacted with get 0.0.
ratings = [
    blog_mean_ratings.get(blog_id, 0.0)
    for blog_id in blog_data_df['blog_id'].unique()
]

len(ratings)

9729

In [7]:
def create_edge_index(data):
    """Turn an interactions frame into edge tensors.

    Args:
        data: DataFrame with 'user_id', 'blog_id' and 'interaction' columns.

    Returns:
        A tuple ``(edge_index, edge_weight)`` where ``edge_index`` is a long
        tensor with one column per row of ``data`` (first row: user ids,
        second row: blog ids) and ``edge_weight`` is a float tensor holding
        the matching 'interaction' values.

    NOTE(review): the ids are used exactly as stored in the frame; callers must
    make sure user ids and blog ids index into the node spaces they intend.
    """
    endpoints = data[['user_id', 'blog_id']].to_numpy().T
    weights = data['interaction'].to_numpy()
    return (
        torch.tensor(endpoints, dtype=torch.long),
        torch.tensor(weights, dtype=torch.float),
    )

In [8]:
# Base node feature: the integer-encoded topic of each blog, as a float column.
node_features = torch.tensor(blog_data_df[['topic']].values, dtype=torch.float)

# Create author embeddings
author_ids = blog_data_df['author_id'].unique()
author_id_map = {author_id: idx for idx, author_id in enumerate(author_ids)}
# .assign returns a new frame, which keeps this cell re-runnable and avoids
# writing into a frame that may be a filtered view of an earlier one.
blog_data_df = blog_data_df.assign(author_idx=blog_data_df['author_id'].map(author_id_map))

# The embedding table is randomly initialised; seed it (same value as the
# train/validation/test split) so the saved node features are reproducible.
torch.manual_seed(42)
author_embeddings = torch.nn.Embedding(len(author_ids), embedding_dim=16)
# The lookup result is stored as fixed input features, so it must not carry an
# autograd graph or requires_grad=True into the saved Data object.
with torch.no_grad():
    author_embedded = author_embeddings(
        torch.tensor(blog_data_df['author_idx'].values, dtype=torch.long)
    )

node_features = torch.cat([node_features, author_embedded], dim=1)

edge_index, edge_weight = create_edge_index(interactions_df)

mean_ratings = torch.tensor(ratings, dtype=torch.float)

# Features and targets are aligned by row position, so they must cover the same
# number of blogs.
assert node_features.shape[0] == mean_ratings.shape[0], (
    "node feature rows and mean ratings are misaligned"
)

In [9]:
# 70% of the interactions go to training; the remaining 30% is halved into
# validation and test (15% each). Both splits use a fixed random_state.
train_data, temp_data = train_test_split(interactions_df, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Build the edge tensors for every split with the shared helper.
(
    (train_edge_index, train_edge_weight),
    (val_edge_index, val_edge_weight),
    (test_edge_index, test_edge_weight),
) = (create_edge_index(split) for split in (train_data, val_data, test_data))

In [10]:
# Assemble the graph object: node features, per-node targets, the full edge set,
# and the per-split edge tensors stored as additional attributes.
data = Data(
    x=node_features,
    y=mean_ratings,
    edge_index=edge_index,
    edge_attr=edge_weight,
    train_edge_index=train_edge_index,
    train_edge_attr=train_edge_weight,
    val_edge_index=val_edge_index,
    val_edge_attr=val_edge_weight,
    test_edge_index=test_edge_index,
    test_edge_attr=test_edge_weight,
)

In [11]:
# Persist the assembled graph. Create the target directory first so the cell
# also works on a fresh checkout where 'output/' does not exist yet.
os.makedirs('output', exist_ok=True)
torch.save(data, 'output/graph_data.pt')

In [12]:
# Show the Data object's summary (attribute names and tensor shapes).
data

Data(x=[9729, 17], edge_index=[2, 200140], edge_attr=[200140], y=[9729], train_edge_index=[2, 140098], train_edge_attr=[140098], val_edge_index=[2, 30021], val_edge_attr=[30021], test_edge_index=[2, 30021], test_edge_attr=[30021])