In [None]:
# 📘 Cell 1: Install required packages
# Reasoning: Ensures that all necessary libraries for model training are installed.
!pip install pandas numpy scikit-learn matplotlib seaborn --quiet

In [None]:
# 📘 Cell 2: Import libraries
# Reasoning: Import standard Python libraries for data loading, analysis, and model building.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pickle
# Use seaborn's "whitegrid" style for the plots drawn later in this notebook.
sns.set(style="whitegrid")

In [None]:
# 📘 Cell 3: Load the cleaned dataset
# Reasoning: The CSV produced by the Data Analytics notebook is the training input here.
DATA_PATH = "data/cleaned_dataset.csv"
df = pd.read_csv(DATA_PATH)

dataset_shape = df.shape
print("✅ Cleaned dataset loaded successfully!")
print("Shape:", dataset_shape)

# Bare last expression so the notebook renders a preview of the first rows.
df.head()

In [None]:
# 📘 Cell 4: Identify text column to use for model
# Reasoning: Pick the first column, in order of preference, that exists in the dataset.
text_col = next(
    (name for name in ('clean_description', 'description', 'title') if name in df.columns),
    None,
)

# Fail fast: every later cell needs a text column to vectorize.
if text_col is None:
    raise ValueError("❌ No suitable text column found. Ensure your dataset has 'description' or 'clean_description'.")

print(f"✅ Using text column: '{text_col}'")

In [None]:
# 📘 Cell 5: TF-IDF Vectorization
# Reasoning: Convert text data into numerical features suitable for similarity-based recommendation.
# Empty text fields are read back from CSV as NaN, which TfidfVectorizer refuses to
# process, so missing values become "" and everything is coerced to str first.
# `df` itself is left untouched so re-running this cell is idempotent.
corpus = df[text_col].fillna("").astype(str)
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf.fit_transform(corpus)
# Row i of the matrix must describe row i of df: later cells index both by position.
assert tfidf_matrix.shape[0] == len(df), "TF-IDF matrix and dataframe are out of sync"
print("✅ TF-IDF matrix created successfully!")
print("Matrix shape:", tfidf_matrix.shape)

In [None]:
# 📘 Cell 6: Train Nearest Neighbors model
# Reasoning: Brute-force search with the cosine metric finds items whose text features are similar.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(tfidf_matrix)
print("✅ Nearest Neighbors model trained successfully!")

In [None]:
# 📘 Cell 7: Define recommendation function
# Reasoning: Function to get top-k similar items given a product index.
def get_recommendations(index, top_k=5):
    """Return up to `top_k` items most similar to the item at row position `index`.

    Neighbours are obtained from the fitted `model`, queried with row `index`
    of `tfidf_matrix`; titles and brands are looked up positionally in `df`.

    Args:
        index: Integer row position of the query item. Negative values count
            from the end, as with ordinary Python indexing.
        top_k: Maximum number of recommendations to return.

    Returns:
        A list of dicts in the order returned by `model.kneighbors`, each with
        the keys "recommended_index" (int), "title", "brand" ('N/A' when the
        column is missing) and "distance" (float rounded to 3 decimals).
        The list is shorter than `top_k` when the dataset does not contain
        that many other items.
    """
    n_items = tfidf_matrix.shape[0]
    # Normalise so the self-match test below also works for negative positions.
    query_pos = index if index >= 0 else index + n_items
    # One extra neighbour leaves room to drop the query item itself, but never
    # ask for more neighbours than there are fitted rows (kneighbors raises).
    n_neighbors = min(top_k + 1, n_items)
    distances, indices = model.kneighbors(tfidf_matrix[query_pos], n_neighbors=n_neighbors)
    recs = []
    for i, dist in zip(indices[0], distances[0]):
        if len(recs) >= top_k:
            break
        # Drop the query item by identity, not by rank: when several items share
        # the same text (distance ties) it is not guaranteed to come back first.
        if i == query_pos:
            continue
        row = df.iloc[i]
        recs.append({
            "recommended_index": int(i),  # plain int keeps the dict JSON-serialisable
            "title": row.get('title', 'N/A'),
            "brand": row.get('brand', 'N/A'),  # .get already covers a missing column
            "distance": round(float(dist), 3)
        })
    return recs

In [None]:
# 📘 Cell 8: Test recommendation on a random product
# Reasoning: Verify that the recommender model works properly.
# A fixed seed keeps the sampled product, and therefore the output below,
# identical across "Restart Kernel → Run All".
SAMPLE_SEED = 42
sample_index = int(np.random.default_rng(SAMPLE_SEED).integers(len(df)))
sample_text = df[text_col].iloc[sample_index]
# A missing description is a float NaN, which cannot be sliced; show it as empty text.
sample_text = "" if pd.isna(sample_text) else str(sample_text)
print("🔹 Original Product Description:")
print(sample_text[:400], "...\n")
print("🔹 Top 5 Similar Recommendations:")
recs = get_recommendations(sample_index, top_k=5)
for r in recs:
    print(f"- {r['title']} | Brand: {r['brand']} | Similarity: {1 - r['distance']:.2f}")

In [None]:
# 📘 Cell 9: Evaluate model (semantic relevance check)
# Reasoning: Measure how often similar products belong to the same category.
if 'category' in df.columns:
    def precision_at_k(idx, k=5):
        """Fraction of the `k` nearest neighbours of item `idx` that share its category.

        Args:
            idx: Integer row position of the query item (negative values count
                from the end).
            k: Number of neighbours to score; also the denominator, so a dataset
                with fewer than `k` other items cannot reach a precision of 1.

        Returns:
            Number of same-category neighbours divided by `k`.
        """
        n_items = tfidf_matrix.shape[0]
        query_pos = idx if idx >= 0 else idx + n_items
        # Ask for one spare neighbour (the query item itself) without exceeding
        # the number of fitted rows, which would make kneighbors raise.
        _, indices = model.kneighbors(tfidf_matrix[query_pos], n_neighbors=min(k + 1, n_items))
        # Exclude the query item explicitly: with duplicate texts it is not
        # guaranteed to be the first neighbour returned.
        neighbour_positions = [i for i in indices[0] if i != query_pos][:k]
        target_category = df['category'].iloc[query_pos]
        similar_categories = df['category'].iloc[neighbour_positions]
        same = (similar_categories == target_category).sum()
        return same / k
    # Score only the first rows (at most 100) to keep the cell fast.
    precision_scores = [precision_at_k(i) for i in range(min(100, len(df)))]
    print(f"Average Precision@5 (Category-based): {np.mean(precision_scores):.3f}")
else:
    print("⚠️ 'category' column not found — skipping category-based evaluation.")

In [None]:
# 📘 Cell 10: Save trained model and vectorizer
# Reasoning: Persist both fitted objects so a FastAPI or Streamlit app can reuse them for predictions.
artifacts = (
    ("data/tfidf_vectorizer.pkl", tfidf),
    ("data/recommender_model.pkl", model),
)
for artifact_path, fitted_object in artifacts:
    # Binary mode is required by pickle; the context manager closes the file.
    with open(artifact_path, "wb") as f:
        pickle.dump(fitted_object, f)
print("✅ Saved TF-IDF vectorizer and Nearest Neighbors model to 'data/' folder.")

In [None]:
# 📘 Cell 11: Recommend products based on user text query
# Reasoning: Allows users to get recommendations by entering their own text.
def recommend_from_text(query, top_k=5):
    """Return up to `top_k` items whose text is closest to a free-text `query`.

    The query is vectorized with the fitted `tfidf` and matched against the
    fitted `model`; titles and brands are looked up positionally in `df`.

    Args:
        query: Free-text search string.
        top_k: Maximum number of recommendations to return.

    Returns:
        A list of dicts in the order returned by `model.kneighbors`, each with
        the keys "recommended_index" (int), "title", "brand" ('N/A' when the
        column is missing) and "distance" (float rounded to 3 decimals).
        The list is empty when `top_k` is not positive, when `df` has no rows,
        or when no term of the query is in the fitted vocabulary.
    """
    vec = tfidf.transform([query])
    # kneighbors raises when asked for more neighbours than fitted rows.
    n_neighbors = min(top_k, len(df))
    # nnz == 0 means the query produced an all-zero vector (no known terms);
    # a zero vector has no direction, so a cosine ranking against it is meaningless.
    if n_neighbors <= 0 or vec.nnz == 0:
        return []
    distances, indices = model.kneighbors(vec, n_neighbors=n_neighbors)
    recs = []
    for i, dist in zip(indices[0], distances[0]):
        row = df.iloc[i]
        recs.append({
            # Same key as get_recommendations, so callers can map back to df rows.
            "recommended_index": int(i),
            "title": row.get('title', 'N/A'),
            "brand": row.get('brand', 'N/A'),  # .get already covers a missing column
            "distance": round(float(dist), 3)
        })
    return recs
# Demo: run one hard-coded query through the text recommender and print the hits.
query = "wooden office chair"
print(f"🔹 Recommendations for: '{query}'")
query_recs = recommend_from_text(query, top_k=5)
for r in query_recs:
    # Printed similarity is 1 minus the distance reported by the recommender.
    print(f"- {r['title']} | Similarity: {1 - r['distance']:.2f}")

In [None]:
# 📘 Cell 12: Visualize product similarity (optional)
# Reasoning: Create a heatmap showing pairwise cosine similarity between sample products.
HEATMAP_SAMPLE_SIZE = 10
# df.sample raises when asked for more rows than exist, so cap at the dataset size.
sample_df = df.sample(min(HEATMAP_SAMPLE_SIZE, len(df)), random_state=42)
# Missing text (NaN) is not a valid document for the vectorizer; treat it as empty.
sample_vectors = tfidf.transform(sample_df[text_col].fillna("").astype(str))
similarity_matrix = cosine_similarity(sample_vectors)
# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    similarity_matrix,
    cmap='viridis',
    annot=True,
    fmt='.2f',
    # Label each cell with the sampled row's index label so it can be traced back to df.
    xticklabels=sample_df.index,
    yticklabels=sample_df.index,
    ax=ax,
)
ax.set_title("Cosine Similarity between Sample Products")
ax.set_xlabel("Dataset row")
ax.set_ylabel("Dataset row")
plt.show()

In [None]:
# 📘 Cell 13: Summary
# Reasoning: Recap the steps this notebook carried out and point to the next one.
summary_text = """
✅ Model Training Summary
-------------------------
1. Loaded cleaned dataset from data/cleaned_dataset.csv
2. Converted text data into TF-IDF embeddings
3. Trained Nearest Neighbors model for content-based recommendations
4. Tested the model using random samples and text queries
5. Evaluated semantic relevance (Precision@K)
6. Saved trained models for deployment (FastAPI/Streamlit)
7. Visualized product similarity for interpretability

🚀 Next Step: Integrate these models into your FastAPI or Streamlit app for live recommendations!
"""
print(summary_text)