In [1]:
import pyarrow.parquet as pq
import pandas as pd

parquet_file = pq.ParquetFile('amazon_df.parquet')
chunk_size = 100000

chunks = []

for batch in parquet_file.iter_batches(batch_size=chunk_size):
    chunk_df = batch.to_pandas()
    print(f"Chunk shape: {chunk_df.shape}")
    chunks.append(chunk_df)

df = pd.concat(chunks, ignore_index=True)

print(f"Total interactions: {len(df)}")
print(f"Unique users: {len(set(df['user_id']))}")
print(f"Unique items: {len(set(df['parent_asin']))}")
print(f"Rating Distribution:")
print(df['rating'].value_counts().sort_index())

Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (100000, 28)
Chunk shape: (83400, 28)
Total interactions: 2483400
Unique users: 316045
Unique items: 1300847
Rating Distribution:
rating
1.0     174371
2.0      98312
3.0     180027
4.0     336999
5.0    1693691
Name: count, dtype: int64


In [6]:
from scipy.sparse import csr_matrix
def create_interaction_matrix(df, user_col='user_id', item_col='parent_asin', rating_col='rating'):
    users = df[user_col].unique()
    items = df[item_col].unique()
    
    user_to_idx = {user: idx for idx, user in enumerate(users)}
    item_to_idx = {item: idx for idx, item in enumerate(items)}
    
    idx_to_user = {idx: user for user, idx in user_to_idx.items()}
    idx_to_item = {idx: item for item, idx in item_to_idx.items()}
    
    user_indices = df[user_col].map(user_to_idx)
    item_indices = df[item_col].map(item_to_idx)
    ratings = df[rating_col].values
    
    interaction_matrix = csr_matrix((ratings, (user_indices, item_indices)), shape=(len(users), len(items)))
    
    return interaction_matrix, user_to_idx, item_to_idx, idx_to_user, idx_to_item

interaction_matrix, user_to_idx, item_to_idx, idx_to_user, idx_to_item = create_interaction_matrix(df)
print(f"Matrix shape: {interaction_matrix.shape}")
print(f"Matrix density: {interaction_matrix.nnz / (interaction_matrix.shape[0] * interaction_matrix.shape[1]) * 100:.4f}%")

Matrix shape: (316045, 1300847)
Matrix density: 0.0006%


In [7]:
from sklearn.decomposition import TruncatedSVD

class MatrixFactorizationRecommender: 
    def __init__(self, n_factors=50):
        self.n_factors = n_factors
        self.svd =None
        self.user_factors = None
        self.item_factors = None
        
    def fit(self, interaction_matrix): 
        self.svd = TruncatedSVD(n_components=self.n_factors, random_state=42)
        
        self.user_factors = self.svd.fit_transform(interaction_matrix)
        self.item_factors = self.svd.components_.T
        
        print(f"Trained SVD with {self.n_factors} factors")
        print(f"User factors shape: {self.user_factors.shape}")
        print(f"Item factors shape: {self.item_factors.shape}")
        
        return self

mf_model = MatrixFactorizationRecommender(n_factors=50)
mf_model.fit(interaction_matrix)


Trained SVD with 50 factors
User factors shape: (316045, 50)
Item factors shape: (1300847, 50)


<__main__.MatrixFactorizationRecommender at 0x22b7d549390>

In [8]:
def get_user_recommendations(user_id, mf_model, user_to_idx, idx_to_item, interaction_matrix, n_recommendations=10):
    if user_id not in user_to_idx:
        return []
    
    user_idx = user_to_idx[user_id]
    
    predicted_ratings = mf_model.user_factors[user_idx] @ mf_model.item_factors.T
    
    rated_items = set(interaction_matrix[user_idx].nonzero()[1])
    
    item_scores = []
    for item_idx in range(len(predicted_ratings)):
        if item_idx not in rated_items:
            item_id = idx_to_item[item_idx]
            score = predicted_ratings[item_idx]
            item_scores.append((item_id, score))  
            
    item_scores.sort(key=lambda x: x[1], reverse=True)
    return item_scores[:n_recommendations]

In [9]:
def get_smart_recommendations(user_id, df, mf_model, user_to_idx, idx_to_item, interaction_matrix, n_recommendations=10):
    user_history = df[df['user_id'] == user_id]
    
    print(f"User history categories: {user_history['main_category'].value_counts()}")
    primary_category = user_history['main_category'].mode().iloc[0] if len(user_history) > 0 else None
    print(f"Detected primary category: {primary_category}")
    
    collab_recs = get_user_recommendations(user_id, mf_model, user_to_idx, idx_to_item, interaction_matrix, n_recommendations * 3)
    
    if collab_recs and collab_recs[0][1] > 0.5:  
        return collab_recs[:n_recommendations]
    
    else:
        print(f"Low collaborative confidence for {user_id}, using category fallback")
        
        if len(user_history) > 0:
            top_categories = user_history['main_category'].value_counts().head(3).index.tolist()
            
            recommendations = []
            for category in top_categories:
                category_items = df[df['main_category'] == category]
                unbought_items = category_items[~category_items['parent_asin'].isin(user_history['parent_asin'])]
                
                if len(unbought_items) > 0:  
                    item_popularity = unbought_items.groupby('parent_asin').agg({
                        'rating': 'mean',
                        'parent_asin': 'count'
                    }).rename(columns={'parent_asin': 'review_count'})
                    
                    item_popularity['popularity_score'] = item_popularity['rating'] * np.log1p(item_popularity['review_count'])
                    
                    top_items = item_popularity.nlargest(n_recommendations//3, 'popularity_score')
                    recommendations.extend([(item_id, score, category) for item_id, score in zip(top_items.index, top_items['popularity_score'])])
            
            recommendations.sort(key=lambda x: x[1], reverse=True)
            return recommendations[:n_recommendations]
        
        else:
            return get_overall_popular_items(df, user_history, n_recommendations)

def get_overall_popular_items(df, user_history, n_recommendations):
    unbought_items = df[~df['parent_asin'].isin(user_history['parent_asin'])]
    popular_items = unbought_items['parent_asin'].value_counts().head(n_recommendations)
    return [(item_id, count) for item_id, count in popular_items.items()]

In [11]:
import numpy as np
sample_users = df['user_id'].value_counts().head(5).index.tolist()
test_user = sample_users[0]

print(f"Testing recommendations for user: {test_user}")

user_history = df[df['user_id'] == test_user][['parent_asin', 'rating', 'main_category', 'title_meta']].head(10)
print(f"\nUser's actual purchase history:")
print(user_history)

recommendations = get_smart_recommendations(test_user, df, mf_model, user_to_idx, idx_to_item, interaction_matrix, n_recommendations=10)

print(f"\nTop 10 recommendations:")
for i, recommendation in enumerate(recommendations, 1):
    if len(recommendation) == 3:  
        item_id, score, category = recommendation
    else:  
        item_id, score = recommendation
        category = "Unknown"
    
    item_info = df[df['parent_asin'] == item_id].iloc[0]
    print(f"{i}. {item_info['title_meta'][:50]}... (Category: {item_info['main_category']}, Score: {score:.3f})")

Testing recommendations for user: AGZZXSMMS4WRHHJRBUJZI4FZDHKQ

User's actual purchase history:
    parent_asin  rating main_category  \
502  B081M5R5L5     5.0    All Beauty   
633  B095H936ZB     5.0    All Beauty   
634  B08CVTNQP1     5.0    All Beauty   
635  B08JZD8HN4     5.0    All Beauty   
636  B07Z548TKH     5.0    All Beauty   
637  B07VQR3W3Z     5.0    All Beauty   
638  B082MDFNZM     5.0    All Beauty   
639  B082TSD9HN     5.0    All Beauty   
640  B07XVNJFNF     5.0    All Beauty   
641  B0815TNMTL     5.0    All Beauty   

                                            title_meta  
502  QIC 4D Silk Fiber Lash Mascara, Fiber Mascara,...  
633  Sallcks Long Black Wavy Wigs for Women Long Cu...  
634  Disney Frozen - Townley Girl Mega Nail Set wit...  
635  12 Sheets 3D Iridescent Butterfly Nail Art Sti...  
636  Face and Body Paint Kit By Color Technik 24 Pr...  
637  Hair Scrunchies for women,Including 3 Pcs Long...  
638  Baasha 4 Pcs Paddle Brush Set, Detangling Brus..

# **Content Based Model**

In [12]:

class FastContentBasedRecommender: 
    def __init__(self):
        self.tfidf_vectorizer = None
        self.content_features = None
        self.item_to_idx = {}
        self.idx_to_item = {}
        self.scaler = StandardScaler()
        self.product_df = None
        
        self.item_similarities = {}  
        self.similarity_computed = False
        
    def clean_text(self, text): 
        if pd.isna(text) or text == [] or text == '': 
            return ''
        
        if isinstance(text, list): 
            text = ' '.join(str(item) for item in text)
            
        text = str(text).lower()
        text = ''.join(c if c.isalnum() or c.isspace() else ' ' for c in text)
        text = ' '.join(text.split())
        
        return text
    
    def extract_product_features(self, df, max_products=None): 
        print("Extracting features for products...")
        print(f"Total unique products in dataset: {df['parent_asin'].nunique()}")
        
        if max_products:
            print(f"Limiting to {max_products} products for testing")
            unique_products = df['parent_asin'].unique()[:max_products]
            df = df[df['parent_asin'].isin(unique_products)]
        
        product_features = []
        
        unique_asins = df['parent_asin'].unique()
        total_products = len(unique_asins)
        
        for i, asin in enumerate(unique_asins):
            if i % 50000 == 0:
                print(f"Processing product {i+1}/{total_products} ({100*i/total_products:.1f}%)")
            
            group = df[df['parent_asin'] == asin]
            
            description = group['description'].iloc[0]
            store = group['store'].iloc[0] if 'store' in group.columns else ''
            price = group['price'].iloc[0] if 'price' in group.columns else None
            main_category = group['main_category'].iloc[0] if 'main_category' in group.columns else ''
            details = group['details'].iloc[0] if 'details' in group.columns else ''
            
            max_reviews = min(10, len(group))
            review_subset = group.head(max_reviews)
            
            review_titles = ' '.join(review_subset['title_review'].fillna('').astype(str))
            review_texts = ' '.join(review_subset['review_text'].fillna('').astype(str))
            
            review_texts = review_texts[:500] if len(review_texts) > 500 else review_texts
            
            avg_rating = group['rating'].mean()
            num_reviews = len(group)
            avg_helpful_votes = group['helpful_vote'].mean() if 'helpful_vote' in group.columns else 0
            
            product_features.append({
                'parent_asin': asin, 
                'description': description, 
                'store': store, 
                'main_category': main_category, 
                'details': details, 
                'review_titles': review_titles, 
                'review_texts': review_texts, 
                'price': price, 
                'avg_rating': avg_rating, 
                'num_reviews': num_reviews, 
                'avg_helpful_votes': avg_helpful_votes
            })
            
            if i % 100000 == 0:
                gc.collect()
                
        return pd.DataFrame(product_features)
    
    def prepare_content_features(self, product_df): 
        print("Preparing content features...")
        
        product_df['description_clean'] = product_df['description'].apply(self.clean_text)
        product_df['store_clean'] = product_df['store'].apply(self.clean_text)
        product_df['main_category_clean'] = product_df['main_category'].apply(self.clean_text)
        product_df['details_clean'] = product_df['details'].apply(self.clean_text)
        product_df['review_titles_clean'] = product_df['review_titles'].apply(self.clean_text)
        product_df['review_texts_clean'] = product_df['review_texts'].apply(self.clean_text)
        
        product_df['combined_text'] = (
            product_df['description_clean'] + ' ' +
            product_df['store_clean'] + ' ' +
            (product_df['main_category_clean'] + ' ') * 3 +  
            product_df['details_clean'] + ' ' +
            product_df['review_titles_clean'] + ' ' +
            product_df['review_texts_clean']
        )
        
        self.item_to_idx = {item: idx for idx, item in enumerate(product_df['parent_asin'])}
        self.idx_to_item = {idx: item for item, idx in self.item_to_idx.items()}
        
        return product_df
    
    def fit(self, df, max_products=None): 
        print("Training Fast Content-Based Recommender...")
        
        self.product_df = self.extract_product_features(df, max_products=max_products)
        print(f"Extracted features for {len(self.product_df)} unique products")
        
        self.product_df = self.prepare_content_features(self.product_df)
        
        print("Creating TF-IDF features...")
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=1000,  
            stop_words='english',
            ngram_range=(1, 1),  
            min_df=5,
            max_df=0.7
        )
        
        tfidf_features = self.tfidf_vectorizer.fit_transform(self.product_df['combined_text'])
        
        print("Preparing numerical features...")
        numerical_features = self.product_df[['price', 'avg_rating', 'num_reviews', 'avg_helpful_votes']].fillna(0)
        numerical_features_scaled = self.scaler.fit_transform(numerical_features)
        numerical_sparse = csr_matrix(numerical_features_scaled)
        
        
        self.content_features = hstack([
            tfidf_features * 0.8, 
            numerical_sparse * 0.2
        ])
        
        print(f"Content features shape: {self.content_features.shape}")
        sparsity = 1 - (self.content_features.nnz / (self.content_features.shape[0] * self.content_features.shape[1]))
        print(f"Content features sparsity: {sparsity:.4f}")
        
        del tfidf_features, numerical_features_scaled, numerical_sparse
        gc.collect()
        
        print("Training completed!")
        return self
    
    def precompute_similarities(self, top_k=50, batch_size=500, checkpoint_every=10000, 
                              resume_from=None, checkpoint_file="similarities_checkpoint.pkl"):
        print(f"\nPre-computing top-{top_k} similarities for {len(self.item_to_idx)} products...")
        print("CRASH RECOVERY enabled - progress will be saved!")
        
        start_time = time.time()
        n_items = self.content_features.shape[0]
        
        if resume_from and os.path.exists(checkpoint_file):
            print(f"Resuming from checkpoint: {checkpoint_file}")
            with open(checkpoint_file, 'rb') as f:
                checkpoint_data = pickle.load(f)
                self.item_similarities = checkpoint_data['item_similarities']
                start_idx = checkpoint_data['last_processed'] + batch_size
            print(f"Resuming from product {start_idx}/{n_items}")
        else:
            self.item_similarities = {}
            start_idx = 0
            print("Starting New computation...")
        
        estimated_gb = (n_items * top_k * 32) / (1024**3)  
        print(f"Estimated memory needed: ~{estimated_gb:.1f} GB")
        if estimated_gb > 8:
            print("WARNING: This may exceed your computer's memory!")
            print("Consider using max_products parameter to limit dataset size")
            
        for current_idx in range(start_idx, n_items, batch_size):
            end_idx = min(current_idx + batch_size, n_items)
            
            if current_idx % (checkpoint_every) == 0:
                elapsed = time.time() - start_time
                progress = current_idx / n_items * 100
                eta = (elapsed / max(progress, 0.001)) * (100 - progress) if progress > 0 else 0
                print(f"Progress: {progress:.1f}% ({current_idx}/{n_items}) - ETA: {eta/60:.1f} min")
                
                try:
                    import psutil
                    memory_percent = psutil.virtual_memory().percent
                    if memory_percent > 85:
                        print(f"Memory usage high: {memory_percent:.1f}%")
                except:
                    pass
            
            try:
                batch_features = self.content_features[current_idx:end_idx]
                
                similarities = cosine_similarity(batch_features, self.content_features)
                
                for i, row_similarities in enumerate(similarities):
                    actual_idx = current_idx + i
                    item_id = self.idx_to_item[actual_idx]
                    
                    top_indices = np.argsort(row_similarities)[::-1]
                    top_items = []
                    
                    for idx in top_indices:
                        if idx != actual_idx and len(top_items) < top_k:
                            similar_item_id = self.idx_to_item[idx]
                            similarity_score = row_similarities[idx]
                            if similarity_score > 0.01:  # Only store meaningful similarities
                                top_items.append((similar_item_id, similarity_score))
                    
                    self.item_similarities[item_id] = top_items
                
                del similarities, batch_features
                gc.collect()
                
            except MemoryError:
                print(" MEMORY ERROR! Try reducing batch_size or max_products")
                break
            except Exception as e:
                print(f" ERROR at batch {current_idx}: {e}")
                break
            
            if current_idx % checkpoint_every == 0 and current_idx > 0:
                checkpoint_data = {
                    'item_similarities': self.item_similarities,
                    'last_processed': current_idx + batch_size - 1,
                    'total_items': n_items,
                    'top_k': top_k
                }
                with open(checkpoint_file, 'wb') as f:
                    pickle.dump(checkpoint_data, f)
                print(f"Checkpoint saved! (Processed {current_idx + batch_size}/{n_items})")
        
        self.similarity_computed = True
        total_time = time.time() - start_time
        print(f" Similarity computation completed in {total_time/60:.1f} minutes!")
        print(f" Average similarities per item: {np.mean([len(sims) for sims in self.item_similarities.values()]):.1f}")
        
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
            print("🗑  Checkpoint file cleaned up")
    
    def get_user_recommendations(self, user_id, df, n_recommendations=10):
        if not self.similarity_computed:
            print("  Similarities not pre-computed! This will be slow...")
            print("Run model.precompute_similarities() first for fast recommendations")
            return self._slow_user_recommendations(user_id, df, n_recommendations)
        
        user_history = df[df['user_id'] == user_id]['parent_asin'].unique()
        
        if len(user_history) == 0:
            print(f"No purchase history found for user: {user_id}")
            return []
        
        print(f"Found {len(user_history)} items in user history")
        
        recommendation_scores = defaultdict(float)
        
        for purchased_item in user_history:
            if purchased_item in self.item_similarities:
                similar_items = self.item_similarities[purchased_item]
                
                for similar_item_id, similarity_score in similar_items:
                    if similar_item_id not in user_history:
                        recommendation_scores[similar_item_id] += similarity_score
        
        sorted_recommendations = sorted(
            recommendation_scores.items(), 
            key=lambda x: x[1], 
            reverse=True
        )
        
        return sorted_recommendations[:n_recommendations]
    
    def _slow_user_recommendations(self, user_id, df, n_recommendations=10):
        user_history = df[df['user_id'] == user_id]['parent_asin'].unique()
        
        if len(user_history) == 0:
            return []
        
        recommendations = {}
        batch_size = 1000
        
        for purchased_item in user_history[:5]: 
            if purchased_item in self.item_to_idx:
                item_idx = self.item_to_idx[purchased_item]
                item_features = self.content_features[item_idx:item_idx+1]
                
                n_items = self.content_features.shape[0]
                
                for start_idx in range(0, n_items, batch_size):
                    end_idx = min(start_idx + batch_size, n_items)
                    batch_features = self.content_features[start_idx:end_idx]
                    
                    similarities = cosine_similarity(item_features, batch_features).flatten()
                    
                    for i, similarity_score in enumerate(similarities):
                        actual_idx = start_idx + i
                        similar_item = self.idx_to_item[actual_idx]
                        
                        if similar_item not in user_history:
                            if similar_item in recommendations:
                                recommendations[similar_item] = max(recommendations[similar_item], similarity_score)
                            else:
                                recommendations[similar_item] = similarity_score
        
        sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
        return sorted_recommendations[:n_recommendations]
    
    def get_item_similarities(self, item_id, n_recommendations=10):
        if not self.similarity_computed:
            print("  Similarities not pre-computed!")
            return []
            
        if item_id not in self.item_similarities:
            print(f"Item {item_id} not found in similarity index")
            return []
            
        return self.item_similarities[item_id][:n_recommendations]
    
    def save_model(self, filepath):
        model_data = {
            'tfidf_vectorizer': self.tfidf_vectorizer,
            'content_features': self.content_features,
            'item_to_idx': self.item_to_idx,
            'idx_to_item': self.idx_to_item,
            'scaler': self.scaler,
            'product_df': self.product_df,
            'item_similarities': self.item_similarities,
            'similarity_computed': self.similarity_computed
        }
        
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {filepath}")
        
        import os
        size_mb = os.path.getsize(filepath) / (1024 * 1024)
        print(f"File size: {size_mb:.1f} MB")
    
    def load_model(self, filepath):
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        
        self.tfidf_vectorizer = model_data['tfidf_vectorizer']
        self.content_features = model_data['content_features']
        self.item_to_idx = model_data['item_to_idx']
        self.idx_to_item = model_data['idx_to_item']
        self.scaler = model_data['scaler']
        self.product_df = model_data['product_df']
        self.item_similarities = model_data.get('item_similarities', {})
        self.similarity_computed = model_data.get('similarity_computed', False)
        
        print(f"Model loaded from {filepath}")
        if self.similarity_computed:
            print("Pre-computed similarities loaded - recommendations will be FAST!")
        else:
            print(" No pre-computed similarities - run precompute_similarities() for speed")



In [17]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
import json
import gc
import os
import pickle
from collections import defaultdict
class ChunkedContentRecommender:
    def __init__(self, chunk_size=100000):
        self.chunk_size = chunk_size
        self.chunk_models = []
        self.chunk_files = []
        self.total_products = 0
        
    def create_chunks_and_train(self, df, save_directory="chunk_models"):
        
        if not os.path.exists(save_directory):
            os.makedirs(save_directory)
            
        unique_products = df['parent_asin'].unique()
        self.total_products = len(unique_products)
        n_chunks = (self.total_products + self.chunk_size - 1) // self.chunk_size  
        
        print(f" Creating {n_chunks} chunks for {self.total_products} products")
        print(f" Chunk size: {self.chunk_size} products each")
        
        chunk_info = []
        
        for chunk_num in range(n_chunks):
            print(f"\n" + "="*60)
            print(f" PROCESSING CHUNK {chunk_num + 1}/{n_chunks}")
            print("="*60)
            
            start_idx = chunk_num * self.chunk_size
            end_idx = min(start_idx + self.chunk_size, self.total_products)
            
            chunk_products = unique_products[start_idx:end_idx]
            chunk_df = df[df['parent_asin'].isin(chunk_products)]
            
            print(f" Chunk {chunk_num + 1}: Products {start_idx} to {end_idx-1}")
            print(f" Chunk contains {len(chunk_products)} unique products")
            print(f" Chunk contains {len(chunk_df)} total reviews/records")
            
            
            from __main__ import FastContentBasedRecommender
            
            chunk_model = FastContentBasedRecommender()
            
            print(f"  Training chunk {chunk_num + 1}...")
            chunk_model.fit(chunk_df)
            
            print(f"⚡ Pre-computing similarities for chunk {chunk_num + 1}...")
            chunk_model.precompute_similarities(
                top_k=50, 
                batch_size=500, 
                checkpoint_every=5000
            )
            
            chunk_filename = f"{save_directory}/chunk_{chunk_num:03d}.pkl"
            chunk_model.save_model(chunk_filename)
            
            chunk_info.append({
                'chunk_id': chunk_num,
                'filename': chunk_filename,
                'product_range': (start_idx, end_idx),
                'n_products': len(chunk_products),
                'products': chunk_products.tolist()  
            })
            
            print(f" Chunk {chunk_num + 1} completed and saved!")
            
            del chunk_model, chunk_df
            gc.collect()
        
        metadata_file = f"{save_directory}/chunk_metadata.json"
        with open(metadata_file, 'w') as f:
            json.dump({
                'total_products': self.total_products,
                'chunk_size': self.chunk_size,
                'n_chunks': n_chunks,
                'chunks': chunk_info
            }, f, indent=2)
        
        print(f"\n ALL CHUNKS COMPLETED!")
        print(f" Models saved in: {save_directory}/")
        print(f"Metadata saved: {metadata_file}")
        
        return chunk_info
    
    def load_chunks(self, save_directory="chunk_models"):
        metadata_file = f"{save_directory}/chunk_metadata.json"
        
        if not os.path.exists(metadata_file):
            raise FileNotFoundError(f"Metadata file not found: {metadata_file}")
        
        with open(metadata_file, 'r') as f:
            self.metadata = json.load(f)
        
        print(f" Loading {self.metadata['n_chunks']} chunk models...")
        
        self.chunk_models = []
        self.product_to_chunk = {}  
        
        from __main__ import FastContentBasedRecommender
        
        for chunk_info in self.metadata['chunks']:
            chunk_id = chunk_info['chunk_id']
            chunk_filename = chunk_info['filename']
            
            print(f" Loading chunk {chunk_id + 1}...")
            
            chunk_model = FastContentBasedRecommender()
            chunk_model.load_model(chunk_filename)
            self.chunk_models.append(chunk_model)
            
            for product_id in chunk_info['products']:
                self.product_to_chunk[product_id] = chunk_id
        
        print(f"All chunks loaded! Ready for recommendations.")
        return self.chunk_models
    
    def get_user_recommendations(self, user_id, df, n_recommendations=10, max_user_history=20):
        
        if not self.chunk_models:
            raise ValueError("No chunks loaded! Call load_chunks() first.")
        
        user_history = df[df['user_id'] == user_id]['parent_asin'].unique()
        
        if len(user_history) == 0:
            print(f"No purchase history found for user: {user_id}")
            return []
        
        if len(user_history) > max_user_history:
            user_history = user_history[-max_user_history:]  
        
        print(f" Searching recommendations across {len(self.chunk_models)} chunks...")
        print(f" User has {len(user_history)} items in recent history")
        
        all_recommendations = defaultdict(float)
        chunks_searched = 0
        
        for purchased_item in user_history:
            if purchased_item in self.product_to_chunk:
                chunk_idx = self.product_to_chunk[purchased_item]
                chunk_model = self.chunk_models[chunk_idx]
                chunks_searched += 1
                
                similar_items = chunk_model.get_item_similarities(purchased_item, n_recommendations=50)
                
                for similar_item, similarity_score in similar_items:
                    if similar_item not in user_history:
                        all_recommendations[similar_item] += similarity_score
        
        print(f"Searched {chunks_searched} relevant chunks")
        
        sorted_recommendations = sorted(
            all_recommendations.items(),
            key=lambda x: x[1],
            reverse=True
        )
        
        return sorted_recommendations[:n_recommendations]
    
    def get_item_similarities(self, item_id, n_recommendations=10):
        
        if not self.chunk_models:
            raise ValueError("No chunks loaded! Call load_chunks() first.")
        
        if item_id not in self.product_to_chunk:
            print(f"Item {item_id} not found in any chunk")
            return []
        
        chunk_idx = self.product_to_chunk[item_id]
        chunk_model = self.chunk_models[chunk_idx]
        
        print(f"Finding similarities in chunk {chunk_idx + 1}")
        
        return chunk_model.get_item_similarities(item_id, n_recommendations)
    
    def get_system_stats(self):
        if not hasattr(self, 'metadata'):
            return "No chunks loaded"
        
        stats = {
            'total_products': self.metadata['total_products'],
            'n_chunks': self.metadata['n_chunks'],
            'chunk_size': self.metadata['chunk_size'],
            'chunks_loaded': len(self.chunk_models),
            'memory_per_chunk_mb': 'Unknown'  
        }
        
        return stats



def estimate_chunked_system_requirements(total_products, chunk_size=100000):
    n_chunks = (total_products + chunk_size - 1) // chunk_size
    
    print(f"CHUNKED SYSTEM ESTIMATES for {total_products:,} products:")
    print(f"Number of chunks: {n_chunks}")
    print(f"Products per chunk: {chunk_size:,}")
    print(f"Training time per chunk: ~30-45 minutes")
    print(f"Total training time: ~{n_chunks * 35 / 60:.1f} hours")
    print(f"Memory per chunk: ~500MB - 1GB")
    print(f"Total storage: ~{n_chunks * 0.8:.1f}GB")
    print(f"Recommendation speed: ~0.01-0.1 seconds")
    
    return n_chunks



In [9]:
chunked_system = ChunkedContentRecommender(chunk_size=100000)


chunk_info = chunked_system.create_chunks_and_train(df, save_directory="chunk_models")

 Creating 14 chunks for 1300847 products
 Chunk size: 100000 products each

 PROCESSING CHUNK 1/14
 Chunk 1: Products 0 to 99999
 Chunk contains 100000 unique products
 Chunk contains 201366 total reviews/records
  Training chunk 1...
Training Fast Content-Based Recommender...
Extracting features for products...
Total unique products in dataset: 100000
Processing product 1/100000 (0.0%)
Processing product 50001/100000 (50.0%)
Extracted features for 100000 unique products
Preparing content features...
Creating TF-IDF features...
Preparing numerical features...
Content features shape: (100000, 1004)
Content features sparsity: 0.9599
Training completed!
⚡ Pre-computing similarities for chunk 1...

Pre-computing top-50 similarities for 100000 products...
✅ CRASH RECOVERY enabled - progress will be saved!
Starting New computation...
Estimated memory needed: ~0.1 GB
🔄 Progress: 0.0% (0/100000) - ETA: 0.0 min
Memory usage high: 93.6%
🔄 Progress: 5.0% (5000/100000) - ETA: 30.4 min
Checkpoint s

In [14]:
chunked_system = ChunkedContentRecommender()
chunked_system.load_chunks("chunk_models")

 Loading 14 chunk models...
 Loading chunk 1...
Model loaded from chunk_models/chunk_000.pkl
Pre-computed similarities loaded - recommendations will be FAST!
 Loading chunk 2...
Model loaded from chunk_models/chunk_001.pkl
Pre-computed similarities loaded - recommendations will be FAST!
 Loading chunk 3...
Model loaded from chunk_models/chunk_002.pkl
Pre-computed similarities loaded - recommendations will be FAST!
 Loading chunk 4...
Model loaded from chunk_models/chunk_003.pkl
Pre-computed similarities loaded - recommendations will be FAST!
 Loading chunk 5...
Model loaded from chunk_models/chunk_004.pkl
Pre-computed similarities loaded - recommendations will be FAST!
 Loading chunk 6...
Model loaded from chunk_models/chunk_005.pkl
Pre-computed similarities loaded - recommendations will be FAST!
 Loading chunk 7...
Model loaded from chunk_models/chunk_006.pkl
Pre-computed similarities loaded - recommendations will be FAST!
 Loading chunk 8...
Model loaded from chunk_models/chunk_007.p

[<__main__.FastContentBasedRecommender at 0x22b7d5bab10>,
 <__main__.FastContentBasedRecommender at 0x22b9fbb13d0>,
 <__main__.FastContentBasedRecommender at 0x22b8cb0b350>,
 <__main__.FastContentBasedRecommender at 0x22e1e974710>,
 <__main__.FastContentBasedRecommender at 0x22e5bc08990>,
 <__main__.FastContentBasedRecommender at 0x22e9a2d8790>,
 <__main__.FastContentBasedRecommender at 0x22ef732cad0>,
 <__main__.FastContentBasedRecommender at 0x22f32e2c6d0>,
 <__main__.FastContentBasedRecommender at 0x22f6d564990>,
 <__main__.FastContentBasedRecommender at 0x22fa9480b10>,
 <__main__.FastContentBasedRecommender at 0x22fe80a46d0>,
 <__main__.FastContentBasedRecommender at 0x230378a8d10>,
 <__main__.FastContentBasedRecommender at 0x230769c0b10>,
 <__main__.FastContentBasedRecommender at 0x230b6aa4910>]

In [18]:
import time

start_time = time.time()
recommendations = chunked_system.get_user_recommendations(
    'AGZZXSMMS4WRHHJRBUJZI4FZDHKQ', 
    df, 
    n_recommendations=10
)
end_time = time.time()

print(f"Recommendation time: {end_time - start_time:.4f} seconds")
print("\nChunked System Recommendations:")
for i, (item_id, score) in enumerate(recommendations, 1):
    print(f"{i}. Product: {item_id} - Score: {score:.4f}")

 Searching recommendations across 14 chunks...
 User has 20 items in recent history
Searched 20 relevant chunks
Recommendation time: 0.3472 seconds

Chunked System Recommendations:
1. Product: B07Z3LG2NW - Score: 2.3272
2. Product: B082ZY7LTY - Score: 1.3555
3. Product: B01A885HVA - Score: 1.2938
4. Product: B0814NWRT4 - Score: 1.2729
5. Product: B09229VMXD - Score: 1.2223
6. Product: B07XV75LHK - Score: 1.2093
7. Product: B07Z3NN94Z - Score: 1.1822
8. Product: B08WFBN4WS - Score: 1.1807
9. Product: B09XQ2CYQD - Score: 1.1583
10. Product: B07YFKQ664 - Score: 1.1144


In [19]:
similar_items = chunked_system.get_item_similarities('B07B4JXK8D', n_recommendations=10)

print("\nSimilar Items:")
for i, (item_id, score) in enumerate(similar_items, 1):
    print(f"{i}. {item_id} - Similarity: {score:.4f}")

Finding similarities in chunk 1

Similar Items:
1. B01N0TQ0OH - Similarity: 0.9971
2. B07WTXWC32 - Similarity: 0.9970
3. B0B3DB5HTC - Similarity: 0.9962
4. B081KSD3BK - Similarity: 0.9960
5. B00UXG4WR8 - Similarity: 0.9958
6. B07TK15BQQ - Similarity: 0.9952
7. B07SM1PGJW - Similarity: 0.9952
8. B07RNJY499 - Similarity: 0.9932
9. B081K4Q6KQ - Similarity: 0.9917
10. B092LLM7H3 - Similarity: 0.9917


In [21]:
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle

class EnsembleRecommender:
    def __init__(self, content_model=None, collaborative_model=None):
        self.content_model = content_model
        self.collaborative_model = collaborative_model
        
    def set_content_model(self, content_model):
        self.content_model = content_model
        
    def set_collaborative_model(self, collaborative_model):
        self.collaborative_model = collaborative_model
        
    def get_ensemble_recommendations(self, user_id, df, n_recommendations=10, 
                                   content_weight=0.6, collaborative_weight=0.4):
        print(f"Getting ensemble recommendations for user: {user_id}")
        
        all_recommendations = defaultdict(float)
        
        if self.content_model:
            try:
                content_recs = self.content_model.get_user_recommendations(
                    user_id, df, n_recommendations=20
                )
                
                if content_recs:
                    max_score = max([score for _, score in content_recs])
                    for item_id, score in content_recs:
                        normalized_score = score / max_score if max_score > 0 else 0
                        all_recommendations[item_id] += normalized_score * content_weight
                        
            except Exception as e:
                print(f"Content model error: {e}")
        
        if self.collaborative_model:
            try:
                collaborative_recs = self.collaborative_model.get_user_recommendations(
                    user_id, df, n_recommendations=20
                )
                
                if collaborative_recs:
                    max_score = max([score for _, score in collaborative_recs])
                    for item_id, score in collaborative_recs:
                        normalized_score = score / max_score if max_score > 0 else 0
                        all_recommendations[item_id] += normalized_score * collaborative_weight
                        
            except Exception as e:
                print(f"Collaborative model error: {e}")
        
        sorted_recommendations = sorted(
            all_recommendations.items(),
            key=lambda x: x[1],
            reverse=True
        )
        
        return sorted_recommendations[:n_recommendations]

class CollaborativeModelWrapper:
    def __init__(self, mf_model, user_to_idx, idx_to_item, interaction_matrix, get_smart_recommendations_func):
        self.mf_model = mf_model
        self.user_to_idx = user_to_idx
        self.idx_to_item = idx_to_item
        self.interaction_matrix = interaction_matrix
        self.get_recommendations_func = get_smart_recommendations_func
    
    def get_user_recommendations(self, user_id, df, n_recommendations=10):
        recommendations = self.get_recommendations_func(
            user_id, df, self.mf_model, self.user_to_idx, 
            self.idx_to_item, self.interaction_matrix, n_recommendations
        )
        
        result = []
        for rec in recommendations:
            if len(rec) == 3:
                item_id, score, category = rec
                result.append((item_id, score))
            else:
                result.append(rec)
        return result

In [22]:
ensemble = EnsembleRecommender()

collab_wrapper = CollaborativeModelWrapper(
    mf_model, user_to_idx, idx_to_item, interaction_matrix, get_smart_recommendations
)

ensemble.set_content_model(chunked_system)
ensemble.set_collaborative_model(collab_wrapper)

sample_users = df['user_id'].value_counts().head(5).index.tolist()
test_user = sample_users[0]

ensemble_recs = ensemble.get_ensemble_recommendations(test_user, df, n_recommendations=10)

for i, (item_id, score) in enumerate(ensemble_recs, 1):
    item_info = df[df['parent_asin'] == item_id].iloc[0]
    print(f"{i}. {item_info['title_meta'][:50]}... (Score: {score:.3f})")

Getting ensemble recommendations for user: AGZZXSMMS4WRHHJRBUJZI4FZDHKQ
 Searching recommendations across 14 chunks...
 User has 20 items in recent history
Searched 20 relevant chunks
User history categories: main_category
AMAZON FASHION               3498
Toys & Games                 1676
Amazon Home                  1267
Office Products               619
Tools & Home Improvement      496
Arts, Crafts & Sewing         298
All Electronics               283
All Beauty                    252
Books                         189
Sports & Outdoors             173
Industrial & Scientific       111
Health & Personal Care         92
Musical Instruments            80
Computers                      74
Pet Supplies                   58
Cell Phones & Accessories      42
Camera & Photo                 39
Grocery                        36
Automotive                     20
Baby                           17
                               17
Handmade                       13
Home Audio & Theater         