In [1]:
import os
import glob
import pandas as pd

# Step 1: Folder path
folder_path = r"C:\Users\91945\OneDrive\Desktop\product recomendation"

# Step 2: Find all CSV files
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

print("CSV files found:", len(csv_files))
for f in csv_files[:10]:
    print("-", os.path.basename(f))

# Step 3: Load each CSV
dataframes = {}

for file_path in csv_files:
    file_name = os.path.basename(file_path)
    category_name = os.path.splitext(file_name)[0]
    
    df = pd.read_csv(file_path, encoding="utf-8", low_memory=False)
    dataframes[category_name] = df

print("\nLoaded Categories:")
print(list(dataframes.keys())[:20])


CSV files found: 141
- Air Conditioners.csv
- All Appliances.csv
- All Books.csv
- All Car and Motorbike Products.csv
- All Electronics.csv
- All English.csv
- All Exercise and Fitness.csv
- All Grocery and Gourmet Foods.csv
- All Hindi.csv
- All Home and Kitchen.csv

Loaded Categories:
['Air Conditioners', 'All Appliances', 'All Books', 'All Car and Motorbike Products', 'All Electronics', 'All English', 'All Exercise and Fitness', 'All Grocery and Gourmet Foods', 'All Hindi', 'All Home and Kitchen', 'All Movies and TV Shows', 'All Music', 'All Pet Supplies', 'All Sports Fitness and Outdoors', 'All Video Games', 'Amazon Fashion', 'Amazon Pharmacy', 'Amazon-Products', 'Baby Bath Skin and Grooming', 'Baby Fashion']


In [2]:
sample_categories = ["Air Conditioners", "All Appliances", "Baby Products"]

for cat in sample_categories:
    print("\n==============================")
    print("Category:", cat)
    print("Total rows:", len(dataframes[cat]))
    print("Columns:")
    print(list(dataframes[cat].columns))



Category: Air Conditioners
Total rows: 720
Columns:
['name', 'main_category', 'sub_category', 'image', 'link', 'ratings', 'no_of_ratings', 'discount_price', 'actual_price']

Category: All Appliances
Total rows: 9576
Columns:
['name', 'main_category', 'sub_category', 'image', 'link', 'ratings', 'no_of_ratings', 'discount_price', 'actual_price']

Category: Baby Products
Total rows: 1056
Columns:
['name', 'main_category', 'sub_category', 'image', 'link', 'ratings', 'no_of_ratings', 'discount_price', 'actual_price']


In [3]:
merged_list = []

for category_name, df in dataframes.items():
    df = df.copy()
    df["category_file"] = category_name
    merged_list.append(df)

all_products = pd.concat(merged_list, ignore_index=True)

print("Final merged shape:", all_products.shape)
print("Columns:", list(all_products.columns))
all_products.head()


Final merged shape: (1636942, 13)
Columns: ['name', 'main_category', 'sub_category', 'image', 'link', 'ratings', 'no_of_ratings', 'discount_price', 'actual_price', 'category_file', 'Unnamed: 0', 'discount_percent', 'combined_text']


Unnamed: 0.1,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,category_file,Unnamed: 0,discount_percent,combined_text
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/31UISB90sY...,https://www.amazon.in/Lloyd-Inverter-Convertib...,4.2,2255,"₹32,999","₹58,990",Air Conditioners,,,
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Convertible-Anti-Viru...,4.2,2948,"₹46,490","₹75,990",Air Conditioners,,,
2,LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Inverter-Convertible-...,4.2,1206,"₹34,490","₹61,990",Air Conditioners,,,
3,LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Convertible-Anti-Viru...,4.0,69,"₹37,990","₹68,990",Air Conditioners,,,
4,Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/41lrtqXPiW...,https://www.amazon.in/Carrier-Inverter-Split-C...,4.1,630,"₹34,490","₹67,790",Air Conditioners,,,


In [4]:
# remove extra unnamed columns

all_products = all_products.loc[:, ~all_products.columns.str.contains('^Unnamed')]


In [5]:
def clean_price(x):
    if pd.isna(x):
        return None
    x = str(x).replace("₹", "").replace(",", "").strip()
    return float(x) if x.replace('.', '', 1).isdigit() else None

all_products["discount_price"] = all_products["discount_price"].apply(clean_price)
all_products["actual_price"] = all_products["actual_price"].apply(clean_price)


In [6]:
def clean_number(x):
    if pd.isna(x):
        return None
    x = str(x).replace(",", "").strip()
    return int(x) if x.isdigit() else None

all_products["no_of_ratings"] = all_products["no_of_ratings"].apply(clean_number)


In [7]:
def clean_rating(x):
    try:
        return float(x)
    except:
        return None

all_products["ratings"] = all_products["ratings"].apply(clean_rating)


In [8]:
all_products = all_products[all_products["name"].notna()]


In [9]:
all_products.drop_duplicates(subset=["name", "link"], inplace=True)


In [10]:
print("Final shape after cleaning:", all_products.shape)
all_products.head()


Final shape after cleaning: (551585, 12)


Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,category_file,discount_percent,combined_text
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/31UISB90sY...,https://www.amazon.in/Lloyd-Inverter-Convertib...,4.2,2255.0,32999.0,58990.0,Air Conditioners,,
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Convertible-Anti-Viru...,4.2,2948.0,46490.0,75990.0,Air Conditioners,,
2,LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Inverter-Convertible-...,4.2,1206.0,34490.0,61990.0,Air Conditioners,,
3,LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctD...,https://www.amazon.in/LG-Convertible-Anti-Viru...,4.0,69.0,37990.0,68990.0,Air Conditioners,,
4,Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/41lrtqXPiW...,https://www.amazon.in/Carrier-Inverter-Split-C...,4.1,630.0,34490.0,67790.0,Air Conditioners,,


In [11]:
df = all_products.copy()

df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')
df['no_of_ratings'] = pd.to_numeric(df['no_of_ratings'], errors='coerce')

df['score'] = df['ratings'] * df['no_of_ratings']

def get_top_products(category_file=None, n=10):
    temp = df.copy()
    
    if category_file is not None:
        temp = temp[temp['category_file'] == category_file]
    
    temp = temp.sort_values(['score', 'no_of_ratings'], ascending=False)
    
    return temp[['name', 'main_category', 'sub_category',
                 'ratings', 'no_of_ratings',
                 'discount_price', 'actual_price',
                 'category_file']].head(n)

get_top_products(category_file='Air Conditioners', n=10)

Unnamed: 0,name,main_category,sub_category,ratings,no_of_ratings,discount_price,actual_price,category_file
237,Ontel Arctic Air Freedom Portable Personal Air...,appliances,Air Conditioners,3.8,9577.0,,5065.0,Air Conditioners
12,Panasonic 1.5 Ton 5 Star Wi-Fi Inverter Smart ...,appliances,Air Conditioners,4.3,5073.0,45990.0,63400.0,Air Conditioners
245,"LG 1.5 Ton 5 Star Inverter Split AC (Copper, S...",appliances,Air Conditioners,4.4,3562.0,53775.0,67990.0,Air Conditioners
33,"Whirlpool 1.5 Ton 5 Star, Flexicool Inverter S...",appliances,Air Conditioners,4.0,3604.0,38990.0,70600.0,Air Conditioners
14,"Whirlpool 1.5 Ton 3 Star, Flexicool Inverter S...",appliances,Air Conditioners,3.9,3670.0,31990.0,62000.0,Air Conditioners
1,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...,appliances,Air Conditioners,4.2,2948.0,46490.0,75990.0,Air Conditioners
77,"AmazonBasics 1 Ton 3 Star Split Ac (White, Cop...",appliances,Air Conditioners,3.7,3157.0,25990.0,49000.0,Air Conditioners
44,Blue Star 0.8 Ton 3 Star Inverter Split Ac (Co...,appliances,Air Conditioners,4.2,2722.0,28990.0,41500.0,Air Conditioners
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,4.2,2255.0,32999.0,58990.0,Air Conditioners
23,Panasonic 1 Ton 5 Star Wi-Fi Inverter Smart Sp...,appliances,Air Conditioners,4.3,2178.0,39990.0,54600.0,Air Conditioners


In [12]:
# Replace NaN with actual_price (means no discount)
all_products['discount_price'] = all_products['discount_price'].fillna(all_products['actual_price'])

In [13]:
all_products['discount_percent'] = (
    (all_products['actual_price'] - all_products['discount_price']) / 
    all_products['actual_price']
) * 100

In [14]:
all_products['discount_percent'] = all_products['discount_percent'].fillna(0)

all_products[['discount_price', 'actual_price', 'discount_percent']].head()

Unnamed: 0,discount_price,actual_price,discount_percent
0,32999.0,58990.0,44.06001
1,46490.0,75990.0,38.820897
2,34490.0,61990.0,44.361994
3,37990.0,68990.0,44.934048
4,34490.0,67790.0,49.122289


In [15]:
all_products['combined_text'] = (
    all_products['name'].astype(str) + " " +
    all_products['main_category'].astype(str) + " " +
    all_products['sub_category'].astype(str)
)


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(all_products['combined_text'])


In [18]:
# Similar Products Function

def get_similar_products(product_index, top_n=10):
    cosine_sim = cosine_similarity(tfidf_matrix[product_index], tfidf_matrix).flatten()
    
    related_indices = cosine_sim.argsort()[-top_n-1:-1][::-1]
    
    return all_products.iloc[related_indices][[
        'name', 'main_category', 'sub_category',
        'ratings', 'no_of_ratings', 'discount_price',
        'actual_price', 'category_file'
    ]]


In [19]:
get_similar_products(0, top_n=10)


Unnamed: 0,name,main_category,sub_category,ratings,no_of_ratings,discount_price,actual_price,category_file
747,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,All Appliances,4.2,2255.0,32999.0,58990.0,All Appliances
368633,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Kitchen & Home Appliances,4.2,2255.0,32999.0,58990.0,Amazon-Products
272544,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Heating & Cooling Appliances,4.2,2255.0,32999.0,58990.0,Amazon-Products
23363,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,home & kitchen,All Home & Kitchen,4.2,2262.0,32999.0,58990.0,All Home and Kitchen
69,Lloyd 2.0 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,,,46000.0,77990.0,Air Conditioners
523,Havells-Lloyd 1.5 Ton 3 Star Inverter Split AC...,appliances,Air Conditioners,,,34900.0,58990.0,Air Conditioners
59,Lloyd 2.0 Ton 5 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,4.0,49.0,52090.0,85990.0,Air Conditioners
6,Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,4.2,1097.0,29999.0,49990.0,Air Conditioners
7,Lloyd 1.5 Ton 5 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,4.3,1494.0,39990.0,67990.0,Air Conditioners
16,Lloyd 1.0 Ton 5 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,4.1,88.0,34000.0,57990.0,Air Conditioners


In [20]:
def search_product(
    query,
    max_results=10,
    main_category=None,      # e.g. "appliances"
    sub_category=None,       # e.g. "Air Conditioners"
    min_price=None,          # min price (on selected price_col)
    max_price=None,          # max price
    price_col="discount_price",  # "discount_price" or "actual_price"
    min_rating=None,         # e.g. 4.0
    max_rating=None          # optional
):
    """
    Search products by name + optional filters:
    - main_category
    - sub_category
    - price range (on price_col)
    - rating range
    """

    # --- 1) Basic name search ---
    mask = all_products['name'].str.contains(query, case=False, na=False)

    # --- 2) Category filters 
    if main_category is not None:
        mask &= all_products['main_category'].str.lower().eq(str(main_category).lower())

    if sub_category is not None:
        mask &= all_products['sub_category'].str.lower().eq(str(sub_category).lower())

    # --- 3) Price filters 
    if price_col not in all_products.columns:
        raise ValueError(f"Column '{price_col}' not found in all_products")

    # Min price
    if min_price is not None:
        mask &= all_products[price_col] >= float(min_price)

    # Max price
    if max_price is not None:
        mask &= all_products[price_col] <= float(max_price)

    # --- 4) Rating filters
    if 'ratings' in all_products.columns:
        ratings_series = all_products['ratings'].fillna(0)

        if min_rating is not None:
            mask &= ratings_series >= float(min_rating)

        if max_rating is not None:
            mask &= ratings_series <= float(max_rating)

    # --- 5) Final results ---
    results = all_products[mask].head(max_results)

    if results.empty:
        print("No matching products found for given query + filters.")
        return

    # Important columns to show
    cols_to_show = [
        'name',
        'main_category',
        'sub_category',
        'discount_price',
        'actual_price'
    ]
    # Only keep columns that actually exist
    cols_to_show = [c for c in cols_to_show if c in results.columns]

    display(results[cols_to_show])
    print("\nNote: Use the left-side index number for recommendations.")

    
    return results


In [21]:
# 1. Total missing values per column
all_products.isna().sum()


name                     0
main_category            0
sub_category             0
image                    0
link                     0
ratings             182027
no_of_ratings       182090
discount_price       17813
actual_price         17813
category_file            0
discount_percent         0
combined_text            0
dtype: int64

In [22]:
all_products.shape

(551585, 12)

In [23]:
# STEP 2 — Fix price columns

# Case 1: discount_price NaN & actual_price available
mask1 = all_products['discount_price'].isna() & all_products['actual_price'].notna()
all_products.loc[mask1, 'discount_price'] = all_products.loc[mask1, 'actual_price']

# Case 2: actual_price NaN & discount_price available
mask2 = all_products['actual_price'].isna() & all_products['discount_price'].notna()
all_products.loc[mask2, 'actual_price'] = all_products.loc[mask2, 'discount_price']

# Case 3: both NaN → drop rows
mask3 = all_products['discount_price'].isna() & all_products['actual_price'].isna()
all_products = all_products[~mask3]

# Check again
all_products[['discount_price', 'actual_price']].isna().sum()


discount_price    0
actual_price      0
dtype: int64

In [24]:
# Fix ratings and number of ratings

# Convert to numeric safely
all_products['ratings'] = pd.to_numeric(all_products['ratings'], errors='coerce')
all_products['no_of_ratings'] = pd.to_numeric(all_products['no_of_ratings'], errors='coerce')

# Fill missing values with 0
all_products['ratings'] = all_products['ratings'].fillna(0)
all_products['no_of_ratings'] = all_products['no_of_ratings'].fillna(0)

# Check 
all_products[['ratings', 'no_of_ratings']].isna().sum()


ratings          0
no_of_ratings    0
dtype: int64

In [25]:
# Ensure column names are consistent
all_products.columns = all_products.columns.str.strip()

# Save cleaned dataset with all columns
all_products.to_csv("cleaned_products.csv", index=False)

print("Clean dataset saved successfully as cleaned_products.csv")
print("Final dataset shape:", all_products.shape)
print("Columns:", list(all_products.columns))


Clean dataset saved successfully as cleaned_products.csv
Final dataset shape: (533772, 12)
Columns: ['name', 'main_category', 'sub_category', 'image', 'link', 'ratings', 'no_of_ratings', 'discount_price', 'actual_price', 'category_file', 'discount_percent', 'combined_text']


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

def build_tfidf_engine(df):
    """
    Build a TF-IDF representation for product name + category fields.
    Returns:
        vectorizer: fitted TfidfVectorizer
        tfidf_matrix: sparse matrix (n_products x n_features)
    """
    # Combine text fields into a single corpus
    corpus = (
        df['name'].fillna('') + ' ' +
        df['main_category'].fillna('') + ' ' +
        df['sub_category'].fillna('')
    )

    vectorizer = TfidfVectorizer(
        max_features=50000,
        ngram_range=(1, 2),
        stop_words='english'
    )
    tfidf_matrix = vectorizer.fit_transform(corpus)

    return vectorizer, tfidf_matrix

# Build the engine once (global variables used by recommend_by_index)
tfidf_vectorizer, tfidf_matrix = build_tfidf_engine(all_products)

print("TF-IDF engine built. Shape:", tfidf_matrix.shape)


TF-IDF engine built. Shape: (533772, 50000)


In [27]:
def recommend_by_index(
    product_index,
    top_n=10,
    use_price_rating_weights=True,
    price_filter_mode="none",
):
    """
    Recommend similar products for a given product index.

    Parameters
    ----------
    product_index : int
        Index of the product in all_products.
    top_n : int, default=10
        Number of recommendations to return.
    use_price_rating_weights : bool, default=True
        If True, reweight similarity scores using ratings, number of ratings
        and discount percentage.
    price_filter_mode : {"none", "similar", "cheaper", "cheaper_20"}, default="none"
        Automatic price filter based on the selected product's discount_price.
        - "none"        : no price filter
        - "similar"     : keep products within ±30% of the selected product's price
        - "cheaper"     : keep products with price <= selected product's price
        - "cheaper_20"  : keep products with price <= 80% of selected product's price
    """

    if product_index not in all_products.index:
        raise ValueError("Invalid product index")

    # ------------------------------------------------------------------
    # 1) Base cosine similarity from TF-IDF
    # ------------------------------------------------------------------
    cosine_similarities = linear_kernel(
        tfidf_matrix[product_index:product_index + 1],
        tfidf_matrix
    ).flatten()

    scores = cosine_similarities.copy()

    # ------------------------------------------------------------------
    # 2) Optional reweighting using rating / number of ratings / discount%
    # ------------------------------------------------------------------
    if use_price_rating_weights:
        # Ratings (normalized)
        ratings = all_products['ratings'].fillna(0).astype(float).values
        r_min, r_max = ratings.min(), ratings.max()
        ratings_norm = (ratings - r_min) / (r_max - r_min + 1e-6)

        # Number of ratings (log + normalized)
        num_ratings = all_products['no_of_ratings'].fillna(0).astype(float).values
        num_ratings_log = np.log1p(num_ratings)
        nr_min, nr_max = num_ratings_log.min(), num_ratings_log.max()
        num_ratings_norm = (num_ratings_log - nr_min) / (nr_max - nr_min + 1e-6)

        # Discount percentage (normalized)
        actual = all_products['actual_price'].replace(0, np.nan).astype(float)
        discount = all_products['discount_price'].astype(float)
        discount_pct = ((actual - discount) / actual).replace(
            [np.inf, -np.inf], np.nan
        ).fillna(0).values
        d_min, d_max = discount_pct.min(), discount_pct.max()
        discount_norm = (discount_pct - d_min) / (d_max - d_min + 1e-6)

        # Final weight: you can tune these coefficients
        weight = 1.0 + 0.4 * ratings_norm + 0.3 * num_ratings_norm + 0.3 * discount_norm
        scores = scores * weight

    # ------------------------------------------------------------------
    # 3) (Disabled) Optional brand-level boost to avoid brand bias
    # ------------------------------------------------------------------
    # If you want brand-based boosting later, you can uncomment this block.
    # Right now it is disabled intentionally to avoid bias towards any brand.
    #
    # if "brand" in all_products.columns:
    #     base_brand = all_products.loc[product_index, "brand"]
    #     same_brand = (all_products["brand"] == base_brand).values
    #     brand_boost = np.where(same_brand, 1.1, 1.0)
    #     scores = scores * brand_boost

    # ------------------------------------------------------------------
    # 4) Convert to Series and drop the selected product itself
    # ------------------------------------------------------------------
    score_series = pd.Series(scores, index=all_products.index)
    score_series = score_series.drop(product_index, errors="ignore")

    # ------------------------------------------------------------------
    # 5) Automatic price-range filter
    # ------------------------------------------------------------------
    try:
        base_price = float(all_products.loc[product_index, "discount_price"])
    except Exception:
        base_price = np.nan

    if price_filter_mode != "none" and not np.isnan(base_price):
        price_series = all_products["discount_price"].astype(float)

        if price_filter_mode == "similar":
            # ±30% range around base price
            low = 0.7 * base_price
            high = 1.3 * base_price
            mask = (price_series >= low) & (price_series <= high)

        elif price_filter_mode == "cheaper":
            # All products cheaper or equal
            mask = price_series <= base_price

        elif price_filter_mode == "cheaper_20":
            # At most 80% of base price (20% or more cheaper)
            high = 0.8 * base_price
            mask = price_series <= high

        else:
            mask = None

        if mask is not None:
            score_series = score_series[mask]

    # ------------------------------------------------------------------
    # 6) Rank and return top N
    # ------------------------------------------------------------------
    top_indices = score_series.sort_values(ascending=False).head(top_n).index

    cols = [
        "name", "main_category", "sub_category",
        "discount_price", "actual_price",
        "ratings", "no_of_ratings"
    ]
    return all_products.loc[top_indices, cols]


In [28]:
def search_products_with_filters(
    query=None,
    main_category=None,
    sub_category=None,
    min_discount_price=None,
    max_discount_price=None,
    min_actual_price=None,
    max_actual_price=None,
    min_rating=None,
    max_rating=None,
    max_results=20
):
    # Start from full data
    df = all_products.copy()

    # ---- Name search ----
    if query and query.strip():
        df = df[df['name'].str.contains(query.strip(), case=False, na=False)]

    # ---- Category filters ----
    if main_category and main_category.strip():
        df = df[df['main_category'].str.contains(main_category.strip(), case=False, na=False)]

    if sub_category and sub_category.strip():
        df = df[df['sub_category'].str.contains(sub_category.strip(), case=False, na=False)]

    # ---- Price filters (discount_price) ----
    if min_discount_price is not None:
        df = df[df['discount_price'] >= min_discount_price]

    if max_discount_price is not None:
        df = df[df['discount_price'] <= max_discount_price]

    # ---- Price filters (actual_price) ----
    if min_actual_price is not None:
        df = df[df['actual_price'] >= min_actual_price]

    if max_actual_price is not None:
        df = df[df['actual_price'] <= max_actual_price]

    # ---- Rating filters ----
    if 'ratings' in df.columns:
        if min_rating is not None:
            df = df[df['ratings'] >= min_rating]
        if max_rating is not None:
            df = df[df['ratings'] <= max_rating]

    # Final results
    if df.empty:
        print("No products found for this search + filters. Try changing filters or query.")
        return df

    cols_to_show = [c for c in [
        'name', 'main_category', 'sub_category',
        'discount_price', 'actual_price', 'ratings', 'no_of_ratings'
    ] if c in df.columns]

    display(df[cols_to_show].head(max_results))
    print("\nNote: Use the left-side index number for recommendations.")
    return df


In [29]:
from difflib import get_close_matches

def recommend_from_query(query, max_search_results=20, top_n=10):
    """
    Search for products by a free-text query, apply optional filters,
    let the user choose a product index, and then show similar products.

    Parameters
    ----------
    query : str
        Free-text search query (product name, brand, etc.).
    max_search_results : int, optional
        Maximum number of products to display in the search results.
    top_n : int, optional
        Number of recommendations to show for the selected product.
    """

    # -----------------------------
    # 1) BASIC TEXT SEARCH (contains)
    # -----------------------------
    mask = all_products['name'].str.contains(query, case=False, na=False)
    results = all_products[mask]

    # -----------------------------
    # 2) FUZZY SEARCH FALLBACK
    #    (when no direct/partial match is found)
    # -----------------------------
    if results.empty:
        print("No exact/partial matches found. Trying fuzzy search...")

        product_names = all_products['name'].astype(str).unique()

        # Get up to 50 names that are closest to the query
        close_names = get_close_matches(query, product_names, n=50, cutoff=0.4)

        if close_names:
            results = all_products[all_products['name'].isin(close_names)]
        else:
            print("No similar product name found. Please try a different keyword.")
            return

    # Limit the initial result set
    results = results.head(max_search_results)

    if results.empty:
        print("No products found. Try a slightly different query.")
        return

    # --------------------------------------------------
    # 3) MAIN CATEGORY FILTER (menu selection)
    # --------------------------------------------------
    main_cats = sorted(results['main_category'].dropna().unique())
    if len(main_cats) > 1:
        print("\nAvailable MAIN CATEGORIES in current results:")
        for i, cat in enumerate(main_cats, 1):
            print(f"{i}. {cat}")

        choice = input(
            "Select a main category (enter number, or press Enter to skip): "
        ).strip()

        if choice:
            try:
                idx = int(choice) - 1
                if 0 <= idx < len(main_cats):
                    chosen_main = main_cats[idx]
                    results = results[results['main_category'] == chosen_main]
                else:
                    print("Invalid choice. Skipping main_category filter.")
            except ValueError:
                print("Invalid input. Skipping main_category filter.")

    # --------------------------------------------------
    # 4) SUB CATEGORY FILTER (based on filtered results)
    # --------------------------------------------------
    sub_cats = sorted(results['sub_category'].dropna().unique())
    if len(sub_cats) > 1:
        print("\nAvailable SUB CATEGORIES in current results:")
        for i, cat in enumerate(sub_cats, 1):
            print(f"{i}. {cat}")

        choice = input(
            "Select a sub category (enter number, or press Enter to skip): "
        ).strip()

        if choice:
            try:
                idx = int(choice) - 1
                if 0 <= idx < len(sub_cats):
                    chosen_sub = sub_cats[idx]
                    results = results[results['sub_category'] == chosen_sub]
                else:
                    print("Invalid choice. Skipping sub_category filter.")
            except ValueError:
                print("Invalid input. Skipping sub_category filter.")

    # --------------------------------------------------
    # 5) RATING FILTER OPTIONS
    #    (uses cleaned numeric ratings column)
    # --------------------------------------------------
    rating_options = {
        "1": (1.0, 3.0),
        "2": (1.0, 5.0),
        "3": (2.0, 3.0),
        "4": (3.0, 4.0),
        "5": (4.0, 5.0),
        "6": (2.0, 5.0),
        "7": None,  # no ratings filter
    }

    print("\nRating filter options:")
    print("1. 1.0 to 3.0")
    print("2. 1.0 to 5.0")
    print("3. 2.0 to 3.0")
    print("4. 3.0 to 4.0")
    print("5. 4.0 to 5.0")
    print("6. 2.0 to 5.0")
    print("7. No ratings filter")

    r_choice = input("Select rating filter (1–7, or press Enter to skip): ").strip()

    if r_choice in rating_options and rating_options[r_choice] is not None:
        low, high = rating_options[r_choice]
        results = results[
            (results['ratings'].notna()) &
            (results['ratings'] >= low) &
            (results['ratings'] <= high)
        ]
    elif r_choice and r_choice not in rating_options:
        print("Invalid choice. Skipping rating filter.")

    # --------------------------------------------------
    # 6) PRICE RANGE FILTER (on discount_price)
    # --------------------------------------------------
    print("\nPrice filter (based on discount_price):")
    min_price = input("Minimum price (press Enter for no minimum): ").strip()
    max_price = input("Maximum price (press Enter for no maximum): ").strip()

    try:
        if min_price:
            min_p = float(min_price)
            results = results[results['discount_price'] >= min_p]
    except ValueError:
        print("Minimum price was invalid. Ignoring min price filter.")

    try:
        if max_price:
            max_p = float(max_price)
            results = results[results['discount_price'] <= max_p]
    except ValueError:
        print("Maximum price was invalid. Ignoring max price filter.")

    # If everything gets filtered out, stop early
    if results.empty:
        print("\nNo products found after applying filters. Try relaxing the filters.")
        return

    # --------------------------------------------------
    # 7) SHOW FILTERED SEARCH RESULTS
    # --------------------------------------------------
    print("\nFiltered Search Results:")
    display(
        results[[
            'name',
            'main_category',
            'sub_category',
            'ratings',
            'no_of_ratings',
            'discount_price',
            'actual_price'
        ]]
    )

    # --------------------------------------------------
    # 8) ASK USER FOR INDEX AND SHOW RECOMMENDATIONS
    # --------------------------------------------------
    print("\nUse the LEFT-SIDE index to choose a product from the table above.")
    try:
        chosen_index = int(
            input("Enter the index of the product you want recommendations for: ")
        )
    except ValueError:
        print("Index must be numeric. Please run the function again.")
        return

    # Safety check: index must exist (either in filtered results or global dataset)
    if chosen_index not in results.index and chosen_index not in all_products.index:
        print("Invalid index. Please run again and choose a valid one.")
        return

    # Call the existing recommendation function
    recs = recommend_by_index(chosen_index, top_n=top_n)

    print(f"\nRecommendations similar to index {chosen_index}:")
    display(recs)


In [30]:
recommend_from_query("Lloyd 1.5 Ton", max_search_results=20, top_n=10)



Available MAIN CATEGORIES in current results:
1. appliances
2. home & kitchen


Select a main category (enter number, or press Enter to skip):  1



Available SUB CATEGORIES in current results:
1. Air Conditioners
2. All Appliances
3. Heating & Cooling Appliances


Select a sub category (enter number, or press Enter to skip):  1



Rating filter options:
1. 1.0 to 3.0
2. 1.0 to 5.0
3. 2.0 to 3.0
4. 3.0 to 4.0
5. 4.0 to 5.0
6. 2.0 to 5.0
7. No ratings filter


Select rating filter (1–7, or press Enter to skip):  6



Price filter (based on discount_price):


Minimum price (press Enter for no minimum):  10000
Maximum price (press Enter for no maximum):  50000



Filtered Search Results:


Unnamed: 0,name,main_category,sub_category,ratings,no_of_ratings,discount_price,actual_price
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,4.2,2255.0,32999.0,58990.0
7,Lloyd 1.5 Ton 5 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,4.3,1494.0,39990.0,67990.0
52,Lloyd 1.5 Ton 5 Star Fixed Speed Window Ac (Co...,appliances,Air Conditioners,4.2,474.0,32999.0,54990.0
72,Lloyd 1.5 Ton 4 Star Fixed Speed Window Ac (Co...,appliances,Air Conditioners,3.6,23.0,29799.0,50990.0
91,Lloyd 1.5 Ton 2 Star Fixed Speed Split AC (Cop...,appliances,Air Conditioners,4.1,234.0,33900.0,59990.0
303,Lloyd 1.5 Ton 5 Star Split Inverter AC - White...,appliances,Air Conditioners,4.0,5.0,39499.0,60990.0



Use the LEFT-SIDE index to choose a product from the table above.


Enter the index of the product you want recommendations for:  91



Recommendations similar to index 91:


Unnamed: 0,name,main_category,sub_category,discount_price,actual_price,ratings,no_of_ratings
92,Carrier 1 Ton 3 Star Fixed Speed Split AC (Cop...,appliances,Air Conditioners,31990.0,54590.0,3.9,134.0
5016,Carrier 1 Ton 3 Star Fixed Speed Split AC (Cop...,appliances,All Appliances,31990.0,54590.0,3.9,134.0
371955,Carrier 1 Ton 3 Star Fixed Speed Split AC (Cop...,appliances,Kitchen & Home Appliances,31990.0,54590.0,3.9,134.0
273350,Carrier 1 Ton 3 Star Fixed Speed Split AC (Cop...,appliances,Heating & Cooling Appliances,31990.0,54590.0,3.9,134.0
42,Carrier 1.5 Ton 3 Star Fixed Speed Window AC( ...,appliances,Air Conditioners,30990.0,45090.0,4.3,201.0
1905,Carrier 1.5 Ton 3 Star Fixed Speed Window AC( ...,appliances,All Appliances,30990.0,45090.0,4.3,201.0
176,Carrier 1 Ton 3 Star Fixed Speed Window AC(Cop...,appliances,Air Conditioners,28990.0,39090.0,4.0,1.0
369589,Carrier 1.5 Ton 3 Star Fixed Speed Window AC( ...,appliances,Kitchen & Home Appliances,30990.0,45090.0,4.3,201.0
272795,Carrier 1.5 Ton 3 Star Fixed Speed Window AC( ...,appliances,Heating & Cooling Appliances,30990.0,45090.0,4.3,201.0
138,Blue Star 2.0 Ton 3 Star Fixed Speed Split AC ...,appliances,Air Conditioners,54990.0,64000.0,4.8,10.0


In [31]:
# Normal recommendations (no price filter)
recommend_by_index(7, top_n=10)

# Only products in a similar price band (±30%)
recommend_by_index(7, top_n=10, price_filter_mode="similar")

# Only cheaper or equal-priced alternatives
recommend_by_index(7, top_n=10, price_filter_mode="cheaper")

# Only clearly cheaper (≤ 80% of original price)
recommend_by_index(7, top_n=10, price_filter_mode="cheaper_20")


Unnamed: 0,name,main_category,sub_category,discount_price,actual_price,ratings,no_of_ratings
6,Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,29999.0,49990.0,4.2,1097.0
891,Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,All Appliances,29999.0,49990.0,4.2,1097.0
368741,Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Kitchen & Home Appliances,29999.0,49990.0,4.2,1097.0
272569,Lloyd 1.0 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Heating & Cooling Appliances,29999.0,49990.0,4.2,1097.0
70,Lloyd 1.0 Ton 2 Star Fixed Speed Split AC (Cop...,appliances,Air Conditioners,27499.0,46990.0,3.9,451.0
3186,Lloyd 1.0 Ton 2 Star Fixed Speed Split AC (Cop...,appliances,All Appliances,27499.0,46990.0,3.9,451.0
370512,Lloyd 1.0 Ton 2 Star Fixed Speed Split AC (Cop...,appliances,Kitchen & Home Appliances,27499.0,46990.0,3.9,451.0
273015,Lloyd 1.0 Ton 2 Star Fixed Speed Split AC (Cop...,appliances,Heating & Cooling Appliances,27499.0,46990.0,3.9,451.0
5,Voltas 1.4 Ton 3 Star Inverter Split AC(Copper...,appliances,Air Conditioners,31990.0,70990.0,4.0,1666.0
47,Cruise 1.5 Ton 3 Star Inverter Split AC with 7...,appliances,Air Conditioners,29990.0,52900.0,4.3,23.0


In [32]:
import pickle
from scipy.sparse import save_npz

all_products.to_csv("cleaned_products.csv", index=False)
pickle.dump(tfidf_vectorizer, open("tfidf_model.pkl", "wb"))
save_npz("tfidf_matrix.npz", tfidf_matrix)

print("Export completed!")


Export completed!
