In [1]:
import duckdb
import glob
import os
import pandas as pd
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

CLEANED_DIR = "/root/Merged"
all_files = glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet"))

df_list = []

# Load relevant data in batches
for file in all_files:
    print(f"Loading from {os.path.basename(file)}")
    try:
        df = duckdb.sql(f"""
            SELECT user_id, asin, rating
            FROM '{file}'
            WHERE rating BETWEEN 1 AND 5
            LIMIT 50000
        """).df()
        df_list.append(df)
    except Exception as e:
        print(f"Skipped {file}: {e}")

# Combine all batches
df = pd.concat(df_list, ignore_index=True).dropna()

# Encode IDs
user_map = {u: i for i, u in enumerate(df['user_id'].unique())}
item_map = {a: j for j, a in enumerate(df['asin'].unique())}
df['user_idx'] = df['user_id'].map(user_map)
df['item_idx'] = df['asin'].map(item_map)

# Create sparse matrix (item-user for training)
ratings_matrix = coo_matrix((df['rating'], (df['user_idx'], df['item_idx'])))
ratings_matrix_csr = ratings_matrix.tocsr()

# Train ALS
model = AlternatingLeastSquares(factors=50, iterations=15, regularization=0.1)
model.fit(ratings_matrix.T)  # transpose = item-user

# Reverse the item_map to map indices back to item IDs
reverse_item_map = {v: k for k, v in item_map.items()}

# Recommend for 3 users
print("Sample Recommendations:")
for user_id in list(user_map.keys())[:3]:
    user_idx = user_map[user_id]
    user_ratings = ratings_matrix_csr[user_idx]  # Extract only that user's row
    recs = model.recommend(user_idx, user_ratings, N=5)  # Returns two arrays: (indices, scores)
    
    # Extract item IDs from recommendations
    item_indices, scores = recs  # Unpack the two arrays
    
    # Filter out indices not in reverse_item_map
    valid_item_ids = [reverse_item_map[i] for i in item_indices if i in reverse_item_map]
    
    print(f"User {user_id}: {valid_item_ids}")


  from pandas.core import (


Loading from Arts_Crafts_and_Sewing_merged.parquet
Loading from Baby_Products_merged.parquet
Loading from CDs_and_Vinyl_merged.parquet
Loading from Digital_Music_merged.parquet
Loading from Beauty_and_Personal_Care_merged.parquet
Loading from Clothing_Shoes_and_Jewelry_merged.parquet
Loading from Automotive_merged.parquet
Loading from Gift_Cards_merged.parquet
Loading from All_Beauty_merged.parquet
Loading from Cell_Phones_and_Accessories_merged.parquet
Loading from Books_merged.parquet
Loading from Appliances_merged.parquet


  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

Sample Recommendations:
User AFYIOFE7ZRJFQLVCXLJS5YX5PBUA: ['B000L6H3NA', 'B00LTATPLU', 'B08YYNVZN9', 'B001PB7OS4', 'B09WCQS8MW']
User AH7CB6NHGGJ5MIGMDB4NWZO57FHA: ['B000L6H3NA', 'B00LTATPLU', 'B093FVGKT7', 'B005BTKFYG']
User AE7WSZXPO336LQEJDYP2ZHUKT5HA: ['B0036I6DOK']
