In [None]:
import glob
import os

import numpy as np
import polars as pl
from implicit.als import AlternatingLeastSquares
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.model_selection import train_test_split

# Pipeline: load a sample of the per-category rating files, encode users and
# items as integer indices, split into train/test, build a sparse user x item
# matrix from the training rows and fit an implicit ALS model on it.

# Directory holding the per-category "*_merged.parquet" files.
CLEANED_DIR = "/root/Merged"
# glob order is filesystem-dependent; sorting keeps the row order of the
# combined frame (and therefore the seeded sampling/split) reproducible.
all_files = sorted(glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet")))
if not all_files:
    raise FileNotFoundError(
        f"No '*_merged.parquet' files found in {CLEANED_DIR!r}"
    )

sample_fraction = 0.2  # 20%
RANDOM_SEED = 42  # shared by the per-file sampling and the train/test split

lazy_frames = []

# Step 1: Build the lazy load + filter query for every file
for file in all_files:
    print(f"Scanning {file}")
    lf = pl.scan_parquet(file)
    lf = lf.select(["user_id", "asin", "rating"])
    # A null id would turn into a null index further down, so drop those rows.
    lf = lf.drop_nulls()
    lf = lf.filter((pl.col("rating") >= 1) & (pl.col("rating") <= 5))
    lazy_frames.append(lf)

# Step 2: Materialise and sample one file at a time.
# LazyFrame has no sample() method, so sampling must run on the collected
# DataFrame. Collecting per file means only one full file plus the already
# sampled pieces are held in memory at any moment.
sampled_frames = [
    lf.collect().sample(fraction=sample_fraction, seed=RANDOM_SEED)
    for lf in lazy_frames
]

# Step 3: Combine the sampled frames
df = pl.concat(sampled_frames)
del sampled_frames  # the pieces are no longer needed once concatenated

if df.is_empty():
    raise ValueError("No ratings left after filtering and sampling")

# Step 4: Categorical encoding
df = df.with_columns([
    pl.col("user_id").cast(pl.Categorical).alias("user_cat"),
    pl.col("asin").cast(pl.Categorical).alias("item_cat")
]).drop(["user_id", "asin"])

# NOTE(review): this relies on the physical codes being usable as compact
# row/column indices; whether they are dense 0..n-1 per column depends on the
# installed polars version / string-cache settings -- confirm.
df = df.with_columns([
    pl.col("user_cat").to_physical().alias("user_idx"),
    pl.col("item_cat").to_physical().alias("item_idx")
])

# Step 5: Train/test split
user_indices = df["user_idx"].to_numpy()
item_indices = df["item_idx"].to_numpy()
ratings = df["rating"].cast(pl.Float32).to_numpy()

# Matrix dimensions are taken from the FULL data set so that users/items that
# end up only in the test split still have a row/column in the trained model.
n_users = int(user_indices.max()) + 1
n_items = int(item_indices.max()) + 1

train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=RANDOM_SEED
)

train_user_indices = user_indices[train_idx]
train_item_indices = item_indices[train_idx]
train_ratings = ratings[train_idx]

test_user_indices = user_indices[test_idx]
test_item_indices = item_indices[test_idx]
test_ratings = ratings[test_idx]

# Step 6: Build sparse matrix (rows = users, columns = items).
# Duplicate (user, item) pairs are summed when the COO matrix is converted.
train_sparse_csr = coo_matrix(
    (train_ratings, (train_user_indices, train_item_indices)),
    shape=(n_users, n_items),
).tocsr()

# Step 7: Train ALS
model = AlternatingLeastSquares(
    factors=50,
    iterations=15,
    regularization=0.1
)
# NOTE(review): the transpose feeds an item-user matrix, which is what older
# implicit releases expected. Newer releases (0.5+) document fit() as taking
# the user-item matrix directly -- confirm against the installed version,
# otherwise user and item factors end up swapped.
model.fit(train_sparse_csr.T)  # item-user

print("Model trained successfully!")


Loading from Video_Games_merged.parquet
Loading from Amazon_Fashion_merged.parquet


Loading from Software_merged.parquet
Loading from Health_and_Personal_Care_merged.parquet
Loading from Musical_Instruments_merged.parquet
Loading from Arts_Crafts_and_Sewing_merged.parquet
Loading from Home_and_Kitchen_merged.parquet
Skipped /root/cleaned_parquets/Home_and_Kitchen_merged.parquet: Invalid Input Error: No magic bytes found at end of file '/root/cleaned_parquets/Home_and_Kitchen_merged.parquet'
Loading from Handmade_Products_merged.parquet
Loading from Baby_Products_merged.parquet
Loading from Electronics_merged.parquet
Loading from CDs_and_Vinyl_merged.parquet
Loading from Digital_Music_merged.parquet
Loading from Patio_Lawn_and_Garden_merged.parquet
Loading from Office_Products_merged.parquet
Loading from Beauty_and_Personal_Care_merged.parquet
Loading from Kindle_Store_merged.parquet
Skipped /root/cleaned_parquets/Kindle_Store_merged.parquet: Invalid Input Error: No magic bytes found at end of file '/root/cleaned_parquets/Kindle_Store_merged.parquet'
Loading from Cloth

100%|██████████| 15/15 [01:37<00:00,  6.53s/it]

Sample Recommendations:
Recommendations for User AGCI7FAH4GL5FI65HYLKWTMFZ2CQ:
  B001S1TRWM — score: 75.0000
  B07DJWBYKP — score: 0.0131
Recommendations for User AGXVBIUFLFGMVLATYXHJYL4A5Q7Q:
  B001S1TRWM — score: 59.0000
  B07DJWBYKP — score: 0.0042
Recommendations for User AFTC6ZR5IKNRDG5JCPVNVMU3XV2Q:
  B08LYKJ1PS — score: 3117.0000
  B07DJWBYKP — score: 0.0001



