In [2]:
import pandas as pd 
import numpy as np
import ast

import torch
from torch_geometric.data import Data
import pandas as pd
import numpy as np

In [3]:
meta_df = pd.read_json("../data/raw/samples/sample_metadata.json", lines=True)
reviews_df = pd.read_csv("../data/raw/samples/cleaned_reviews.csv")

print(f"Product Metadata Columns: {meta_df.columns} | Shape: {meta_df.shape}")
print()
print(f"Reviews Data Columns: {reviews_df.columns} | Shape: {reviews_df.shape}")

Product Metadata Columns: Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
      dtype='object') | Shape: (55254, 16)

Reviews Data Columns: Index(['user_id', 'parent_asin', 'rating', 'timestamp'], dtype='object') | Shape: (100000, 4)


In [4]:
meta_df.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Cell Phones & Accessories,"QGHXO Band for Garmin Vivofit 4, Soft Silicone...",4.4,707,[Personalized Your Garmin Vivofit 4 Activity T...,"[Compatibility, Custom designed for your preci...",14.89,[{'thumb': 'https://m.media-amazon.com/images/...,[],QGHXO,"[Electronics, Wearable Technology, Arm & Wrist...",{'Package Dimensions': '6.85 x 4.37 x 1.1 inch...,B07BJ7ZZL7,,,


In [5]:
reviews_df.head(1)

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AGF6GWDAWG7UW3R5O2G5O5JZC4BQ,B07CLBNM1C,5.0,1627068549820


In [6]:
def safe_literal_eval(x):
    # If it's already a list, dict, or None, return as is
    if isinstance(x, (list, dict)):
        return x
    if x is None:
        return []
    if isinstance(x, float) and pd.isna(x):
        return []
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return []  # or x, depending on use case
    return x


cols_to_parse = ['features', 'description', 'images', 'videos', 'categories', 'details']

for col in cols_to_parse:
    meta_df[col] = meta_df[col].apply(safe_literal_eval)

In [7]:
import re

def parse_price(x):
    if pd.isna(x):
        return np.nan
    if isinstance(x, (int, float)):
        return x
    if isinstance(x, str):
        # Remove currency symbols and commas
        cleaned = re.sub(r'[^0-9.]', '', x)
        try:
            return float(cleaned)
        except ValueError:
            return np.nan
    return np.nan

meta_df['price'] = meta_df['price'].apply(parse_price)
meta_df['price'] = meta_df['price'].fillna(meta_df['price'].mean())
meta_df['price'] = pd.to_numeric(meta_df['price'], errors='coerce')

In [8]:
meta_df.head(5)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Cell Phones & Accessories,"QGHXO Band for Garmin Vivofit 4, Soft Silicone...",4.4,707,[Personalized Your Garmin Vivofit 4 Activity T...,"[Compatibility, Custom designed for your preci...",14.89,[{'thumb': 'https://m.media-amazon.com/images/...,[],QGHXO,"[Electronics, Wearable Technology, Arm & Wrist...",{'Package Dimensions': '6.85 x 4.37 x 1.1 inch...,B07BJ7ZZL7,,,
1,Cell Phones & Accessories,Wenlaty Case Compatible with iPad 9th /8th /7t...,4.7,1915,[😊Designed for iPad 9th/8th/7th Generation: Pr...,[],19.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Wenlaty Case Compatible with iPad ...,Wenlaty,"[Electronics, Computers & Accessories, Tablet ...",{'Product Dimensions': '9.91 x 7.3 x 0.47 inch...,B09M7TH3YC,,,
2,Computers,iCasso Compatible with MacBook Pro 13 Inch Cas...,4.4,3503,[[IMPORTANT!!!]: ONLY Compatible with MacBook ...,[],27.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'MacBook Pro case', 'url': 'https:/...",iCasso,"[Electronics, Computers & Accessories, Laptop ...","{'Standing screen display size': '13 Inches', ...",B08GFTPQ5B,,,
3,Cell Phones & Accessories,SUPCASE UB Pro Series Case for iPad Pro 12.9 i...,4.5,4050,[Designed for iPad Pro 12.9 Inch 2020(Model: A...,[],27.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Protective & Has An Awesome Built-...,SUPCASE,"[Electronics, Computers & Accessories, Tablet ...",{'Package Dimensions': '11.57 x 9.53 x 0.75 in...,B086W46TYP,,,
4,Computers,Aruba Instant On AP11 2x2 WiFi Access Point | ...,4.4,119,[Aruba Instant On AP11 Indoor Access Points de...,[],109.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'What is a Wireless Access Point?',...",Aruba a Hewlett Packard Enterprise company,"[Electronics, Computers & Accessories, Network...",{'Product Dimensions': '5.98 x 5.98 x 1.34 inc...,B098ZFMY25,,,


In [9]:
def create_feature_matrix(meta_df):
    import pandas as pd

    def safe_len(x):
        return len(x) if isinstance(x, list) else 0

    def extract_features(row):
        return {
            'average_rating': float(row.get('average_rating', 0.0)),
            'rating_number': int(row.get('rating_number', 0)),
            'num_features': safe_len(row.get('features', [])),
            'description_length': len(row['description'][0]) if isinstance(row.get('description'), list) and row['description'] else 0,
            'price': float(row.get('price', 0.0)) if row.get('price') not in [None, ''] else 0.0,
            'num_images': safe_len(row.get('images', [])),
            'num_videos': safe_len(row.get('videos', [])),
            'num_categories': safe_len(row.get('categories', [])),
            # 'num_details': len(row.get('details', {})) if isinstance(row.get('details'), dict) else 0
        }

    clean_rows = []
    for _, row in meta_df.iterrows():
        try:
            clean_rows.append(extract_features(row))
        except Exception as e:
            print(f"Row failed: {e}")
            clean_rows.append({k: 0.0 for k in ['average_rating', 'rating_number', 'num_features',
                                               'description_length', 'price', 'num_images',
                                               'num_videos', 'num_categories']})
    
    clean_df = pd.DataFrame(clean_rows)
    return clean_df

features_matrix = create_feature_matrix(meta_df)
features_matrix.head(30) 

Unnamed: 0,average_rating,rating_number,num_features,description_length,price,num_images,num_videos,num_categories
0,4.4,707,5,13,14.89,7,0,3
1,4.7,1915,7,0,19.99,8,10,5
2,4.4,3503,5,0,27.99,5,8,5
3,4.5,4050,5,0,27.99,7,10,5
4,4.4,119,5,0,109.99,9,10,4
5,4.2,1325,5,0,74.516365,6,10,5
6,4.5,2745,5,825,11.95,9,10,5
7,4.3,1417,5,214,33.99,9,9,5
8,3.0,32,2,19,74.516365,1,0,0
9,3.7,28,0,0,74.516365,7,2,4


In [10]:
import pandas as pd
import numpy as np

def handle_missing_values(df):
    df['average_rating'] = df['average_rating'].fillna(df['average_rating'].median())
    df['rating_number'] = df['rating_number'].fillna(df['rating_number'].median())
    df['num_features'] = df['num_features'].fillna(df['num_features'].median())
    df['description_length'] = df['description_length'].fillna(df['description_length'].median())
    df['price'] = df['price'].fillna(df['price'].median())
    df['num_images'] = df['num_images'].fillna(df['num_images'].median())
    df['num_videos'] = df['num_videos'].fillna(df['num_videos'].median())
    df['num_categories'] = df['num_categories'].fillna(df['num_categories'].median())
    df['num_details'] = df['num_details'].fillna(df['num_details'].median())
    return df

features_matrix = handle_missing_values(features_matrix)
features_matrix.head(30)

KeyError: 'num_details'

In [None]:
def feature_engineering(df):
    # Create new features like price per rating
    df['price_per_rating'] = df['price'] / (df['rating_number'] + 1)  # To avoid division by zero

    # Create a feature that compares the number of features to ratings
    df['feature_to_rating_ratio'] = df['num_features'] / (df['rating_number'] + 1)

    # You can add more features based on domain knowledge or feature importance analysis
    return df

feature_engineering(features_matrix)

Unnamed: 0,average_rating,rating_number,num_features,description_length,price,num_images,num_videos,num_categories,num_details,price_per_rating,feature_to_rating_ratio
0,0.850,0.000682,0.079365,0.001157,0.001390,0.184211,0.0,0.428571,0.109890,0.001389,0.079311
1,0.925,0.001849,0.111111,0.000000,0.001901,0.210526,1.0,0.714286,0.131868,0.001897,0.110906
2,0.850,0.003384,0.079365,0.000000,0.002701,0.131579,0.8,0.714286,0.131868,0.002692,0.079097
3,0.875,0.003912,0.079365,0.000000,0.002701,0.184211,1.0,0.714286,0.164835,0.002690,0.079056
4,0.850,0.000114,0.079365,0.000000,0.010903,0.236842,1.0,0.571429,0.076923,0.010902,0.079356
...,...,...,...,...,...,...,...,...,...,...,...
55249,0.850,0.000006,0.095238,0.015749,0.007355,0.184211,0.0,0.571429,0.109890,0.007355,0.095238
55250,0.850,0.000024,0.063492,0.000890,0.007355,0.052632,0.0,0.714286,0.208791,0.007355,0.063491
55251,0.750,0.000288,0.095238,0.000000,0.007355,0.868421,0.0,0.714286,0.153846,0.007353,0.095211
55252,0.850,0.000150,0.079365,0.000000,0.001298,0.184211,0.0,0.714286,0.032967,0.001298,0.079353


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def normalize_features(df):
    # Select numerical columns for normalization
    numeric_cols = ['average_rating', 'rating_number', 'num_features', 'description_length', 
                    'price', 'num_images', 'num_videos', 'num_categories', 'num_details']
    
    # Min-Max Scaling: Rescale features to [0, 1]
    min_max_scaler = MinMaxScaler()
    df[numeric_cols] = min_max_scaler.fit_transform(df[numeric_cols])

    # Alternatively, Standardize features (mean=0, std=1)
    # scaler = StandardScaler()
    # df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

normalize_features(features_matrix)

Unnamed: 0,average_rating,rating_number,num_features,description_length,price,num_images,num_videos,num_categories,num_details
0,0.850,0.000682,0.079365,0.001157,0.001390,0.184211,0.0,0.428571,0.109890
1,0.925,0.001849,0.111111,0.000000,0.001901,0.210526,1.0,0.714286,0.131868
2,0.850,0.003384,0.079365,0.000000,0.002701,0.131579,0.8,0.714286,0.131868
3,0.875,0.003912,0.079365,0.000000,0.002701,0.184211,1.0,0.714286,0.164835
4,0.850,0.000114,0.079365,0.000000,0.010903,0.236842,1.0,0.571429,0.076923
...,...,...,...,...,...,...,...,...,...
55249,0.850,0.000006,0.095238,0.015749,0.007355,0.184211,0.0,0.571429,0.109890
55250,0.850,0.000024,0.063492,0.000890,0.007355,0.052632,0.0,0.714286,0.208791
55251,0.750,0.000288,0.095238,0.000000,0.007355,0.868421,0.0,0.714286,0.153846
55252,0.850,0.000150,0.079365,0.000000,0.001298,0.184211,0.0,0.714286,0.032967


In [None]:
user_ids = reviews_df['user_id'].unique()
product_ids = reviews_df['parent_asin'].unique()

user_id_map = {uid: idx for idx, uid in enumerate(user_ids)}
product_id_map = {pid: idx + len(user_ids) for idx, pid in enumerate(product_ids)}

# Step 3: Create the edges (user <-> product) and edge weights (ratings)
edge_list = []
edge_ratings = []

for _, row in reviews_df.iterrows():
    if row['user_id'] in user_id_map and row['parent_asin'] in product_id_map:
        u = user_id_map[row['user_id']]
        p = product_id_map[row['parent_asin']]
        edge_list += [[u, p], [p, u]]  # undirected graph
        edge_ratings += [row['rating'], row['rating']]  # rating as the edge feature

# Step 4: Create user and product features (based on review data)
user_features = []
product_features = []

# User features: Number of reviews and average rating
user_review_counts = reviews_df.groupby('user_id').size().reindex(user_ids).fillna(0).values
user_avg_ratings = reviews_df.groupby('user_id')['rating'].mean().reindex(user_ids).fillna(0).values
user_features = list(zip(user_review_counts, user_avg_ratings))

# Product features: Number of ratings and average rating
product_rating_counts = reviews_df.groupby('parent_asin').size().reindex(product_ids).fillna(0).values
product_avg_ratings = reviews_df.groupby('parent_asin')['rating'].mean().reindex(product_ids).fillna(0).values
product_features = list(zip(product_rating_counts, product_avg_ratings))

# Step 5: Convert everything to tensors
edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()  # shape [2, num_edges]
edge_attr = torch.tensor(edge_ratings, dtype=torch.float).view(-1, 1)  # shape [num_edges, 1]

# Combine user and product features into one feature tensor
user_features = torch.tensor(user_features, dtype=torch.float)
product_features = torch.tensor(product_features, dtype=torch.float)

# Combine both features (user and product) into a single tensor (node features)
x = torch.cat([user_features, product_features], dim=0)  # [num_users + num_products, 2] features

# Step 6: Create the PyG Data object
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Step 7: Verify the graph creation
print(f"Graph has {x.shape[0]} nodes, {edge_index.shape[1]} edges, and {x.shape[1]} features per node.")
print(f"Node features (example): {x[:5]}")  # Show the first few node features
print(f"Edge index (example): {edge_index[:, :5]}")  # Show the first few edges
print(f"Edge attributes (example): {edge_attr[:5]}")  # Show the first few edge feature

Graph has 61844 nodes, 200000 edges, and 2 features per node.
Node features (example): tensor([[20.0000,  4.5000],
        [65.0000,  4.6923],
        [12.0000,  3.9167],
        [ 6.0000,  4.0000],
        [ 6.0000,  4.1667]])
Edge index (example): tensor([[   0, 6590,    1, 6591,    2],
        [6590,    0, 6591,    1, 6592]])
Edge attributes (example): tensor([[5.],
        [5.],
        [5.],
        [5.],
        [2.]])


In [None]:
from torch_geometric.utils import train_test_split_edges

data = train_test_split_edges(data)  # will add data.train_pos_edge_index, etc.

NameError: name 'data' is not defined

In [None]:
data.head()

NameError: name 'data' is not defined