In [2]:
import duckdb
import glob
import os
import pandas as pd
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

CLEANED_DIR = "/root/cleaned_parquets"
all_files = glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet"))

df_list = []

for file in all_files:
    print(f"Loading from {os.path.basename(file)}")
    try:
        df = duckdb.sql(f"""
            SELECT user_id, asin, CAST(rating AS DOUBLE) AS rating
            FROM '{file}'
            WHERE rating BETWEEN 1 AND 5
            LIMIT 50000
        """).df()

        if all(col in df.columns for col in ["user_id", "asin", "rating"]):
            df = df.dropna(subset=["user_id", "asin", "rating"])
            df_list.append(df)
        else:
            print(f"Skipped {file}: missing required columns")
    except Exception as e:
        print(f"Skipped {file}: {e}")

if not df_list:
    raise ValueError("No valid data was loaded from the parquet files.")

df = pd.concat(df_list, ignore_index=True)

user_map = {u: i for i, u in enumerate(df['user_id'].unique())}
item_map = {a: j for j, a in enumerate(df['asin'].unique())}
reverse_item_map = {j: a for a, j in item_map.items()}

df['user_idx'] = df['user_id'].map(user_map)
df['item_idx'] = df['asin'].map(item_map)

ratings_matrix = coo_matrix((df['rating'], (df['user_idx'], df['item_idx'])))
ratings_matrix_csr = ratings_matrix.tocsr()

print("Training ALS model...")
model = AlternatingLeastSquares(
    factors=50,
    iterations=15,
    regularization=0.1,
    use_gpu=False
)
model.fit(ratings_matrix.T)

print("Sample Recommendations:")
for user_id in list(user_map.keys())[:3]:
    user_idx = user_map.get(user_id)
    if user_idx is None:
        continue

    user_ratings = ratings_matrix_csr[user_idx]

    try:
        recs = model.recommend(user_idx, user_ratings, N=5, filter_already_liked_items=True)
        print(f"Recommendations for User {user_id}:")
        for row in recs:
            item_idx = int(row[0])
            score = float(row[1])
            asin = reverse_item_map.get(item_idx, "Unknown")
            print(f"  {asin} — score: {score:.4f}")
    except Exception as e:
        print(f"Could not generate recommendations for {user_id}: {e}")


Loading from Video_Games_merged.parquet
Loading from Amazon_Fashion_merged.parquet


Loading from Software_merged.parquet
Loading from Health_and_Personal_Care_merged.parquet
Loading from Musical_Instruments_merged.parquet
Loading from Arts_Crafts_and_Sewing_merged.parquet
Loading from Home_and_Kitchen_merged.parquet
Skipped /root/cleaned_parquets/Home_and_Kitchen_merged.parquet: Invalid Input Error: No magic bytes found at end of file '/root/cleaned_parquets/Home_and_Kitchen_merged.parquet'
Loading from Handmade_Products_merged.parquet
Loading from Baby_Products_merged.parquet
Loading from Electronics_merged.parquet
Loading from CDs_and_Vinyl_merged.parquet
Loading from Digital_Music_merged.parquet
Loading from Patio_Lawn_and_Garden_merged.parquet
Loading from Office_Products_merged.parquet
Loading from Beauty_and_Personal_Care_merged.parquet
Loading from Kindle_Store_merged.parquet
Skipped /root/cleaned_parquets/Kindle_Store_merged.parquet: Invalid Input Error: No magic bytes found at end of file '/root/cleaned_parquets/Kindle_Store_merged.parquet'
Loading from Cloth

100%|██████████| 15/15 [01:37<00:00,  6.53s/it]

Sample Recommendations:
Recommendations for User AGCI7FAH4GL5FI65HYLKWTMFZ2CQ:
  B001S1TRWM — score: 75.0000
  B07DJWBYKP — score: 0.0131
Recommendations for User AGXVBIUFLFGMVLATYXHJYL4A5Q7Q:
  B001S1TRWM — score: 59.0000
  B07DJWBYKP — score: 0.0042
Recommendations for User AFTC6ZR5IKNRDG5JCPVNVMU3XV2Q:
  B08LYKJ1PS — score: 3117.0000
  B07DJWBYKP — score: 0.0001



