In [20]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from scipy.spatial.distance import pdist, squareform
import pickle

In [21]:
# Define Function to Clean and Extract Tags
# Load the spaCy "en_core_web_sm" pipeline once at module level so that
# clean_and_extract_tags() can reuse the same object on every call.
nlp = spacy.load("en_core_web_sm")

def clean_and_extract_tags(text):
    """Turn free text into a comma-separated string of keyword tags.

    The text is lower-cased and tokenised with the module-level ``nlp``
    pipeline. A token is kept only when it is purely alphanumeric and is not
    a member of spaCy's ``STOP_WORDS`` set.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            a NaN coming from pandas) is treated as "no text".

    Returns:
        The kept tokens joined with ``', '``, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    keywords = []
    for token in nlp(text.lower()):
        word = token.text
        if word.isalnum() and word not in STOP_WORDS:
            keywords.append(word)
    return ', '.join(keywords)

# Content-Based Hybrid Model

In [4]:
# Content Based Recommendation Class

class DiverseRecommendationSystem:
    """Content-based recommender that blends four item-to-item similarities.

    On construction, four square similarity matrices are computed (one row and
    one column per row of ``df``, in ``df``'s row order):

    * ``content_similarity``  - TF-IDF cosine similarity of name + description + tags
    * ``category_similarity`` - TF-IDF cosine similarity of category + brand
    * ``price_similarity``    - 1 - distance between min-max scaled prices
    * ``rating_similarity``   - 1 - distance between min-max scaled ratings

    ``get_diverse_recommendations`` combines them with caller-supplied weights
    and greedily picks products whose weighted similarity to everything already
    picked stays at or below a threshold.

    Note:
        ``df`` is stored by reference and gains two helper columns,
        ``'combined_features'`` and ``'category_brand'``.
    """

    def __init__(self, df):
        """Store ``df`` and pre-compute all similarity matrices.

        Args:
            df: Product DataFrame. The methods below read the columns
                'Product Id', 'Product Name', 'Product Description',
                'Product Tags', 'Product Category', 'Product Brand',
                'Product Price' and 'Product Rating'.
        """
        self.df = df
        self.prepare_features()

    def _text_column(self, column):
        """Return ``column`` as strings with missing values replaced by ''."""
        return self.df[column].fillna('').astype(str)

    def prepare_features(self):
        """Build the helper text columns and compute the four similarity matrices."""
        # Every text part is NaN-filled first: a single NaN operand would turn
        # the whole concatenated cell into NaN, which TF-IDF cannot vectorise.
        self.df['combined_features'] = (
            self._text_column('Product Name') + ' ' +
            self._text_column('Product Description') + ' ' +
            self._text_column('Product Tags')
        )

        self.df['category_brand'] = (
            self._text_column('Product Category') + ' ' +
            self._text_column('Product Brand')
        )

        self.content_similarity = self.compute_text_similarity()
        self.category_similarity = self.compute_category_similarity()
        self.price_similarity = self.compute_price_similarity()
        self.rating_similarity = self.compute_rating_similarity()

    def _tfidf_cosine_similarity(self, column):
        """Cosine similarity between the TF-IDF vectors of a text column."""
        tfidf = TfidfVectorizer(stop_words='english')
        feature_matrix = tfidf.fit_transform(self.df[column])
        return cosine_similarity(feature_matrix)

    def _scaled_distance_similarity(self, column):
        """Similarity derived from a numeric column.

        Missing values are replaced by 0, the column is min-max scaled, and
        the pairwise Euclidean distance ``d`` is converted to ``1 - d``.
        """
        values = self.df[column].fillna(0).values.reshape(-1, 1)
        scaled = MinMaxScaler().fit_transform(values)
        distances = pdist(scaled, metric='euclidean')
        return 1 - squareform(distances)  # Convert distances to similarities

    def compute_text_similarity(self):
        """Similarity of the combined name/description/tags text."""
        return self._tfidf_cosine_similarity('combined_features')

    def compute_category_similarity(self):
        """Similarity of the combined category/brand text."""
        return self._tfidf_cosine_similarity('category_brand')

    def compute_price_similarity(self):
        """Similarity based on 'Product Price'."""
        return self._scaled_distance_similarity('Product Price')

    def compute_rating_similarity(self):
        """Similarity based on 'Product Rating'."""
        return self._scaled_distance_similarity('Product Rating')

    def _weighted_similarity(self, row, cols, weights):
        """Weighted sum of the four similarity matrices at ``[row, cols]``.

        ``cols`` may be ``slice(None)`` (the full row) or a list of positions.
        ``weights`` is ``(content, category, price, rating)``.
        """
        content_w, category_w, price_w, rating_w = weights
        return (
            content_w * self.content_similarity[row, cols] +
            category_w * self.category_similarity[row, cols] +
            price_w * self.price_similarity[row, cols] +
            rating_w * self.rating_similarity[row, cols]
        )

    def get_diverse_recommendations(self, product_id, n_recommendations=10,
                                  content_weight=0.3,
                                  category_weight=0.4,
                                  price_weight=0.2,
                                  rating_weight=0.0,
                                  diversity_threshold=0.6):
        """Recommend products for ``product_id`` while limiting redundancy.

        Candidates are visited from highest to lowest weighted similarity to
        the query product. A candidate is accepted only if its weighted
        similarity to the query product and to every product accepted so far
        is not greater than ``diversity_threshold``.

        Args:
            product_id: Value to look up in the 'Product Id' column. The first
                matching row is used.
            n_recommendations: Maximum number of ids to return.
            content_weight: Weight of ``content_similarity``.
            category_weight: Weight of ``category_similarity``.
            price_weight: Weight of ``price_similarity``.
            rating_weight: Weight of ``rating_similarity``.
            diversity_threshold: Upper bound on the weighted similarity between
                an accepted candidate and each previously compared product.

        Returns:
            A list of 'Product Id' values (possibly shorter than
            ``n_recommendations``). An empty list is returned when the product
            is unknown, when ``n_recommendations`` is not positive, or when an
            unexpected error occurs (the error is printed).
        """
        try:
            if n_recommendations <= 0:
                return []

            product_ids = self.df['Product Id'].to_numpy()

            # Look the product up by *position*: the similarity matrices are
            # plain arrays in row order, so a DataFrame index label would
            # address the wrong row whenever the index is not 0..n-1.
            matches = np.flatnonzero((self.df['Product Id'] == product_id).to_numpy())
            if matches.size == 0:
                print(f"Error generating recommendations: Product Id {product_id!r} not found")
                return []
            idx = int(matches[0])

            weights = (content_weight, category_weight, price_weight, rating_weight)

            # Compute weighted hybrid similarity of every product to the query
            hybrid_scores = self._weighted_similarity(idx, slice(None), weights)

            recommendations = []
            # NOTE(review): the query product itself is part of the comparison
            # set, so candidates whose weighted similarity to the query exceeds
            # the threshold are rejected too. Confirm this is intended.
            compared_indices = [idx]

            for candidate_idx in np.argsort(hybrid_scores)[::-1]:
                if candidate_idx == idx:
                    continue

                # Check diversity against the query and all accepted products at once
                pairwise = self._weighted_similarity(candidate_idx, compared_indices, weights)
                if np.any(pairwise > diversity_threshold):
                    continue

                recommendations.append(product_ids[candidate_idx])
                compared_indices.append(int(candidate_idx))

                if len(recommendations) >= n_recommendations:
                    break

            return recommendations

        except Exception as e:
            # Best-effort contract kept for callers: report and return nothing.
            print(f"Error generating recommendations: {str(e)}")
            return []

In [18]:
# Load the product catalogue and normalise its text and numeric columns

products = pd.read_csv('final_products.csv')

# Echo each column name with its length (presumably to spot stray whitespace)
print("Columns in the DataFrame:")
for col in products.columns:
    print(f"'{col}' (length: {len(col)})")

text_columns = ['Product Description', 'Product Tags']
num_columns = ['Product Price', 'Product Rating', 'Product Reviews Count']

# Report, then blank out, missing text
print("Checking for NaN values in relevant columns:")
print(products[text_columns].isnull().sum())
products[text_columns] = products[text_columns].fillna('')

# Force the numeric columns to numbers; unparseable entries become NaN
products[num_columns] = products[num_columns].apply(pd.to_numeric, errors='coerce')

# Report, then zero-fill, missing numbers
print("Checking for NaN values in numerical columns:")
print(products[num_columns].isnull().sum())
products[num_columns] = products[num_columns].fillna(0)

# Reduce the free-text columns to comma-separated keyword tags
for column in text_columns:
    products[column] = products[column].map(clean_and_extract_tags)
    

Columns in the DataFrame:
'Unnamed: 0' (length: 10)
'Uniq Id' (length: 7)
'Product Id' (length: 10)
'Product Category' (length: 16)
'Product Brand' (length: 13)
'Product Name' (length: 12)
'Product Price' (length: 13)
'Product Description' (length: 19)
'Product Image Url' (length: 17)
'Product Tags' (length: 12)
'Product Rating' (length: 14)
'Product Reviews Count' (length: 21)
Checking for NaN values in relevant columns:
Product Description    1127
Product Tags              0
dtype: int64
Checking for NaN values in numerical columns:
Product Price              42
Product Rating           2792
Product Reviews Count    1650
dtype: int64


In [54]:
# Build the content-based recommender from the cleaned catalogue, then
# serialise the whole object (DataFrame and similarity matrices) to a pickle file
rec_system = DiverseRecommendationSystem(products)
with open('content_rec.pkl', mode='wb') as model_file:
    pickle.dump(rec_system, file=model_file)

## User may also like

In [29]:
# "May also like": query the content-based recommender with its default weights
product_id = 'f7b3fbeefd6ac9b22403b8a083656cd1'
recommendations = rec_system.get_diverse_recommendations(product_id, n_recommendations=10)

# Show the recommended product ids
print(f"\nDiverse recommendations for Product ID {product_id}:", recommendations)


Diverse recommendations for Product ID f7b3fbeefd6ac9b22403b8a083656cd1: ['fff36074421d36893b704f3325d28518', '5a3aef3e41321dedbedf7b6d701446e7', 'a6a830de8f5cd3e62e3c3024e6035764', '54cf22213ad13a4eab8c6412239e3579', 'd9978813405b8576df47d69dd1832f19', 'b039fc42321106b9f5a60e50613ea990', 'fa7e5052c316b59d3ff43f82de4d37aa', '7f1efeabc9b5e269425a174f69a1cf77', '2ab8b8b553b3d97ea00afa8748c487c5', '66439a0e31a2e45a8644e13b3951d7e2']


## Similar Items

In [55]:
# "Similar items": weight text content most, ignore price and rating,
# and use a lower diversity threshold (0.3) than the default
product_id = 'f7b3fbeefd6ac9b22403b8a083656cd1'
similar_item_settings = dict(
    n_recommendations=15,
    content_weight=0.5,
    category_weight=0.1,
    price_weight=0.0,
    rating_weight=0.0,
    diversity_threshold=0.3,
)
recommendations = rec_system.get_diverse_recommendations(product_id, **similar_item_settings)

# Show the recommended product ids
print(f"\nDiverse recommendations for Product ID {product_id}:", recommendations)



Diverse recommendations for Product ID f7b3fbeefd6ac9b22403b8a083656cd1: ['f6e3334629f2d753f7bd0fbb05223f5e', 'db22de6a3e47eeea083c1fae8ef68657', '6ba0ae7b9b50bb69fb66f4551b3269cf', '5a3aef3e41321dedbedf7b6d701446e7', '8a723fae706cd0135037a1091c5a993a', '2fccae8c299f3c29fe93017ea779a1f4', 'c6f68771315ff9b0404c68297ebbfac1', '7f1efeabc9b5e269425a174f69a1cf77', 'd86b009636f7fb3a2683cbd14a11f17d', '0c1054e3d6b67991ee587bd7ff39e5eb', '05381e1a57648f42615b63379ac2c114', 'd9978813405b8576df47d69dd1832f19', 'bed58b3f1a3246fc34ea71ec91511083', '1f9bafa0fde3499ed9dfa5dae9c6f3a2', '7d1df2ce4daed6ead595c5153731151f']


# Collaborative recommendation

In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load user interactions data
# Raw string literals are used for the Windows paths so that sequences such as
# "\p" and "\w" are not parsed as (invalid) escape sequences.
# NOTE(review): these are absolute, machine-specific paths - adjust as necessary.
# NOTE(review): the matrix is read without index_col; if the CSV's first column
# holds user ids it will be treated as a product column - confirm the file layout.
user_interactions = pd.read_csv(r'D:\python_progs\recommend_system\website_building\balanced_synthetic_interactions.csv')
user_interaction_matrix = pd.read_csv(r'D:\python_progs\recommend_system\website_building\balanced_interaction_matrix.csv')


In [9]:
# Item-to-item cosine similarity: the interaction matrix is transposed so that
# every product becomes a row before the pairwise comparison
product_similarity = cosine_similarity(user_interaction_matrix.transpose())

# Label both axes of the square result with the product column names
product_similarity_df = pd.DataFrame(
    data=product_similarity,
    index=user_interaction_matrix.columns,
    columns=user_interaction_matrix.columns,
)




In [5]:
class CollaborativeRecommendationSystem:
    """Item-based collaborative filter using product-to-product cosine similarity.

    The similarity is computed between the *columns* of the interaction matrix,
    so each column is treated as one product.
    """

    def __init__(self, user_interaction_matrix):
        """Store the interaction matrix and pre-compute product similarities.

        Args:
            user_interaction_matrix: DataFrame whose columns are products.
                Presumably one row per user - confirm against the data source.
        """
        self.user_interaction_matrix = user_interaction_matrix
        self.product_similarity = self.compute_product_similarity()

    def compute_product_similarity(self):
        """Return a square DataFrame of cosine similarities between products.

        Both the index and the columns are the interaction matrix's column labels.
        """
        from sklearn.metrics.pairwise import cosine_similarity
        product_labels = self.user_interaction_matrix.columns
        # Transpose to get products as rows
        similarity = cosine_similarity(self.user_interaction_matrix.T)
        return pd.DataFrame(similarity, index=product_labels, columns=product_labels)

    def get_col_recommendations(self, product_id, n_recommendations=10):
        """Return the products most similar to ``product_id``.

        Args:
            product_id: Column label of the query product.
            n_recommendations: Maximum number of product ids to return.

        Returns:
            Up to ``n_recommendations`` product ids ordered from most to least
            similar, never containing ``product_id`` itself. An empty list is
            returned for an unknown product or a non-positive
            ``n_recommendations``.
        """
        if n_recommendations <= 0 or product_id not in self.product_similarity.columns:
            return []

        similar_scores = self.product_similarity[product_id]
        # Remove the query product by label instead of assuming it sorts first:
        # with tied scores or an all-zero interaction column it may not be the
        # top entry, and positional slicing would then drop a real
        # recommendation and keep the query product.
        other_scores = similar_scores.drop(labels=product_id, errors='ignore')
        top_recommendations = other_scores.sort_values(ascending=False).head(n_recommendations)
        return top_recommendations.index.tolist()

In [43]:
# Build the collaborative recommender from the interaction matrix and
# serialise the whole object to a pickle file
collab_system = CollaborativeRecommendationSystem(user_interaction_matrix)
with open('collab_rec.pkl', mode='wb') as model_file:
    pickle.dump(collab_system, file=model_file)

In [44]:
# Query the collaborative recommender with its default list size
product_id = '742ff862e17f41b3c1611f0e22c1009e'
recommendations = collab_system.get_col_recommendations(product_id)

print(
    "Recommendations for Product ID",
    product_id,
    ":",
    recommendations,
)

Recommendations for Product ID 742ff862e17f41b3c1611f0e22c1009e : ['eb407ddeb992ff7d53e6ee1edb54d7ac', '2ee0076aa415e7d4cc0927f1714bbbef', 'b11b0ef4ad16247f58d0b6f18e6f4b65', '2135948483fa7498ca3d81273897f4a4', 'b498d2c95a40d8f2cb95bb1cf1e1fe15', 'd1116fac5d14f9bc135b3f4cbbddddc4', '62cf58a85b4370f093d569c79994312a', '4003dd622780002d65617e72555f4f38', '262535d639646224749679971d2a28aa', 'ac1d22278f2e7ea03af1f765d9388b6f']


# Hybrid recommendation

In [9]:
import pickle

def load_model(model_path):
    """Read a pickled object from ``model_path`` and return it.

    Args:
        model_path: Path of the pickle file to open in binary mode.

    Returns:
        Whatever object the file contains.
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    return model
    

# Restore both recommenders from their pickle files
# NOTE(review): absolute, machine-specific paths - adjust for your environment.
rec_system = load_model(
    r'D:\python_progs\recommend_system\website_building\content_rec.pkl'
)
collab_system = load_model(
    r'D:\python_progs\recommend_system\website_building\collab_rec.pkl'
)
print("Models loaded successfully.")

Models loaded successfully.


In [18]:
def hybrid_recommendations(product_id, n_recommendations=15, random_state=None,
                           content_model=None, collab_model=None):
    """Merge content-based and collaborative recommendations for a product.

    The two recommendation lists are concatenated, de-duplicated, shuffled and
    truncated to ``n_recommendations`` entries.

    Args:
        product_id: Id of the product to recommend for.
        n_recommendations: Maximum number of ids to return (default 15).
        random_state: Seed forwarded to the shuffle. ``None`` (default) gives
            a different order on every call; pass an int for reproducible output.
        content_model: Object providing ``get_diverse_recommendations``.
            Defaults to the module-level ``rec_system``.
        collab_model: Object providing ``get_col_recommendations``.
            Defaults to the module-level ``collab_system``.

    Returns:
        A list of product ids without duplicates and without ``product_id``.
    """
    if content_model is None:
        content_model = rec_system
    if collab_model is None:
        collab_model = collab_system

    # Get recommendations from content-based and collaborative filtering models
    content_based_rec = content_model.get_diverse_recommendations(product_id)
    collaborative_filtering_rec = collab_model.get_col_recommendations(product_id)

    # dict.fromkeys removes duplicates while keeping first-seen order
    merged = dict.fromkeys([*content_based_rec, *collaborative_filtering_rec])
    # Never recommend the product the user is already looking at
    merged.pop(product_id, None)

    if not merged or n_recommendations <= 0:
        return []

    # Shuffle so neither model's results systematically come first
    shuffled = pd.Series(list(merged), dtype=object).sample(frac=1, random_state=random_state)
    return shuffled.head(n_recommendations).tolist()



In [19]:
# Query the hybrid recommender (content-based + collaborative) for one product
product_id = '742ff862e17f41b3c1611f0e22c1009e'
recommendations = hybrid_recommendations(product_id)

print(
    "Recommendations for Product ID",
    product_id,
    ":",
    recommendations,
)

Recommendations for Product ID 742ff862e17f41b3c1611f0e22c1009e : ['eb407ddeb992ff7d53e6ee1edb54d7ac', 'e49c35631b91a1cd4fa8c15ad84f77c4', 'd1116fac5d14f9bc135b3f4cbbddddc4', '6ebdb6c57db3dbf89962af50393011c4', 'ac1d22278f2e7ea03af1f765d9388b6f', 'b559a45841fe01d6cb6a972a2512c2c1', '62cf58a85b4370f093d569c79994312a', '2135948483fa7498ca3d81273897f4a4', '262535d639646224749679971d2a28aa', '2ee0076aa415e7d4cc0927f1714bbbef', '998612eb7af80112dae05de1d0931651', 'b498d2c95a40d8f2cb95bb1cf1e1fe15', 'b11b0ef4ad16247f58d0b6f18e6f4b65', '4003dd622780002d65617e72555f4f38', '2a6bf71eb3889c98402085e37b3d62b4']
