# importing packages

In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
import numpy as np


# Train CSV File

In [None]:
# Load the training table. Later cells read its 'image_link' and 'sample_id' columns.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — adjust when running elsewhere.
train_df = pd.read_csv('/content/train.csv')

# Load pretrained ResNet50 and remove the last classification layer





In [None]:
# Load pretrained ResNet50 and remove the last classification layer.
# `pretrained=True` is deprecated in recent torchvision releases in favour of the
# explicit weights enum; IMAGENET1K_V1 is the same checkpoint the old flag downloads
# (resnet50-0676ba61.pth). Fall back to the legacy flag on older torchvision builds.
if hasattr(models, "ResNet50_Weights"):
    model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
else:
    model = models.resnet50(pretrained=True)

# Dropping the final child (the fully connected classifier) leaves the pooled
# 2048-channel feature map; callers squeeze it down to a 2048-dim vector.
model = torch.nn.Sequential(*(list(model.children())[:-1]))  # output = 2048-dim
model.eval()  # inference mode: freezes batch-norm statistics

# Define image transform (resize, normalize, etc.) using the ImageNet channel statistics
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 394MB/s]


# Train Image_Feature Extraction function

In [None]:
def extract_image_feature(img_url):
    """Download one image and return its 2048-dim ResNet50 feature vector.

    Args:
        img_url: URL of the image to fetch.

    Returns:
        A 1-D numpy array of length 2048. When the download, decoding or the
        forward pass fails for any reason, an all-zero float32 vector is
        returned instead so that batch extraction can continue.
    """
    try:
        response = requests.get(img_url, timeout=5)
        # Treat HTTP error statuses (404, 500, ...) as failures explicitly instead
        # of relying on the image decoder to choke on an error page.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img_t = transform(img).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            feat = model(img_t).squeeze().numpy()
        return feat
    except Exception:
        # Deliberate best-effort fallback. float32 matches the dtype of real features,
        # so stacking successes and failures does not upcast the whole matrix to float64.
        return np.zeros(2048, dtype=np.float32)


# Extracting and storing image features from the image URLs

In [None]:
# Run the extractor over every image URL in the training table (with a progress
# bar) and stack the per-image vectors into a single 2-D array.
image_features = np.array([
    extract_image_feature(url)
    for url in tqdm(train_df['image_link'], desc="Extracting train image features")
])
print("Train image features shape:", image_features.shape)


Extracting train image features:  41%|████      | 30539/75000 [1:26:27<1:53:39,  6.52it/s]

# Save as .npy and .csv files for model training


In [None]:
# Persist the raw feature matrix in numpy's binary format.
np.save('train_image_features.npy', image_features)

# (Optional) Save as CSV if you want to merge later: one row per sample, one
# column per feature dimension, plus the sample_id taken from the training table.
df = pd.DataFrame(image_features).assign(sample_id=train_df['sample_id'])
df.to_csv('train_image_feature.csv', index=False)


# AUTOMATIC SEMANTIC EMBEDDING & CATEGORIZATION PIPELINE


In [None]:


!pip install -q sentence-transformers umap-learn hdbscan plotly matplotlib seaborn pandas numpy torch tqdm scikit-learn nltk


import os
import re
import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from collections import Counter
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data (tokenizer models and stop-word lists). A failure here is
# reported but not fatal, so the rest of the notebook can still run offline.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception as exc:  # was a bare `except: pass`, which also swallowed KeyboardInterrupt
    print(f"Warning: NLTK resource download failed: {exc}")

# Silence noisy library warnings in the notebook output.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# TEXT CLEANING FUNCTION
def clean_text(text):
    """Normalise a product description before it is embedded.

    The text is lower-cased, URLs are dropped, every character other than
    letters, digits, whitespace, '-' and '.' becomes a space, and tokens that
    are either e-commerce boilerplate or at most two characters long are
    removed.

    Returns:
        The cleaned text, or "no description" when the input is missing or
        nothing survives the cleaning.
    """
    placeholder = "no description"
    if pd.isna(text):
        return placeholder

    lowered = str(text).lower()
    without_urls = re.sub(r"http\S+", " ", lowered)
    sanitized = re.sub(r"[^a-zA-Z0-9\s\-\.]", " ", without_urls)

    # Catalogue boilerplate that carries no category signal.
    boilerplate = {
        'bullet', 'point', 'item', 'name', 'value', 'product', 'com', 'description',
        'feature', 'benefit', 'specification', 'detail', 'information', 'please',
        'buy', 'purchase', 'order', 'shipping', 'delivery', 'price', 'sale',
        'brand', 'new', 'free', 'best', 'quality', 'high', 'premium'
    }

    kept_tokens = []
    for token in sanitized.split():
        if len(token) <= 2 or token in boilerplate:
            continue
        kept_tokens.append(token)

    cleaned = re.sub(r"\s+", " ", " ".join(kept_tokens)).strip()
    return cleaned or placeholder

# EMBEDDING EXTRACTOR

class SemanticEmbedder:
    """Wrap a SentenceTransformer model to turn product texts into embeddings."""

    # Fix: the constructor was spelled `_init_`, so Python never invoked it and
    # `SemanticEmbedder("...")` raised TypeError (the class took no arguments).
    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2"):
        """Load `model_name`, placing it on the GPU when one is available.

        If loading fails for any reason, the smaller "all-MiniLM-L6-v2" model
        is loaded on the CPU instead; an error from that fallback propagates.
        """
        print(f" Loading model: {model_name}")
        try:
            # Using a larger model for better semantic understanding
            self.model = SentenceTransformer(model_name)
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = self.model.to(self.device)
            print(f" Using device: {self.device}")
        except Exception as e:
            print(f" Error loading model: {e}")
            # Fallback to smaller model
            self.model = SentenceTransformer("all-MiniLM-L6-v2")
            self.device = "cpu"
            self.model = self.model.to(self.device)

    def get_embeddings(self, texts, batch_size=32):
        """Generate embeddings for a list of texts.

        Args:
            texts: Iterable of raw texts; each one is passed through `clean_text`.
            batch_size: Batch size handed to the model's `encode` call.

        Returns:
            The array produced by `encode` (numpy output with normalised
            embeddings is requested).

        Raises:
            Exception: any error raised by `encode` is logged and re-raised.
        """
        print(f" Processing {len(texts)} texts...")

        # Clean texts with progress bar
        cleaned_texts = []
        for text in tqdm(texts, desc="Cleaning texts"):
            cleaned_texts.append(clean_text(text))

        print(" Generating embeddings...")
        try:
            embeddings = self.model.encode(
                cleaned_texts,
                batch_size=batch_size,
                show_progress_bar=True,
                convert_to_numpy=True,
                normalize_embeddings=True,
                device=self.device
            )
            print(f" Generated embeddings shape: {embeddings.shape}")
            return embeddings
        except Exception as e:
            print(f" Error generating embeddings: {e}")
            raise

# CATEGORIZATION WITH BETTER CLUSTERING
class ImprovedAutoCategorizer:
    """Discover product categories without labels: UMAP reduction + HDBSCAN clustering.

    `fit` tries several UMAP and HDBSCAN configurations, keeps the best one, and
    trains a nearest-neighbour index plus per-cluster centroids on the reduced
    space. `predict` assigns new embeddings to the discovered clusters using
    HDBSCAN's approximate prediction, nearest neighbours, centroids, or a hybrid.
    Label -1 always denotes noise / unassigned.
    """

    # Names accepted by `predict(method=...)`.
    _PREDICTION_METHODS = ('hdbscan', 'nearest_neighbor', 'centroid', 'hybrid')

    # Fix: the constructor was spelled `_init_`, so it was never called.
    def __init__(self, dataset_size=None):
        """Derive UMAP/HDBSCAN hyper-parameters from the expected dataset size.

        Args:
            dataset_size: Number of training samples, or None to use fixed defaults.
        """
        self.dataset_size = dataset_size

        # More conservative UMAP parameters for better separation
        self.umap_params = {
            'n_neighbors': min(20, max(10, dataset_size // 50)) if dataset_size else 15,
            'n_components': min(50, max(15, dataset_size // 20)) if dataset_size else 25,
            'min_dist': 0.05,
            'metric': "cosine",
            'random_state': 42,
            'low_memory': False
        }

        # More conservative HDBSCAN parameters
        if dataset_size:
            min_cluster_size = max(10, dataset_size // 100)
            # Fix: a stray trailing "5" on this line was a syntax error.
            min_samples = max(5, dataset_size // 200)
        else:
            min_cluster_size = 15
            min_samples = 5

        self.hdbscan_params = {
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'cluster_selection_epsilon': 0.1,
            'metric': "euclidean",
            'cluster_selection_method': 'eom',
            'prediction_data': True
        }

        self.umap_reducer = None
        self.clusterer = None
        self.labels_ = None
        self.embedding_2d = None
        self.category_interpretations = {}
        self.nn_classifier = None
        self.nn_embeddings = None
        self.nn_labels = None
        self.cluster_centroids = None

        print(f" Improved parameters for dataset size {dataset_size}:")
        print(f"   - min_cluster_size: {self.hdbscan_params['min_cluster_size']}")
        print(f"   - min_samples: {self.hdbscan_params['min_samples']}")

    def fit(self, embeddings, product_descriptions=None):
        """Fit the categorization model with improved clustering.

        Args:
            embeddings: 2-D array of sample embeddings.
            product_descriptions: Optional texts aligned with `embeddings`; when
                given (and more than one label exists) category interpretations
                are generated from them.

        Returns:
            The cluster label array for the training samples (-1 = noise).
        """
        print(" Starting Improved Automatic Categorization")
        print("=" * 50)

        # Step 1: UMAP with multiple attempts if needed
        print(" Step 1: UMAP Dimensionality Reduction")
        best_embeddings_reduced = None
        best_score = -1

        # Try different UMAP parameters to find best separation
        umap_configs = [
            self.umap_params,
            {**self.umap_params, 'min_dist': 0.01, 'n_neighbors': 15},
            {**self.umap_params, 'min_dist': 0.1, 'n_neighbors': 25}
        ]

        for i, config in enumerate(umap_configs):
            print(f"   - Trying UMAP config {i+1}: n_neighbors={config['n_neighbors']}, min_dist={config['min_dist']}")
            try:
                umap_reducer = umap.UMAP(**config)
                embeddings_reduced = umap_reducer.fit_transform(embeddings)

                # Quick clustering to evaluate this configuration
                test_clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=self.hdbscan_params['min_cluster_size'],
                    min_samples=self.hdbscan_params['min_samples']
                )
                test_labels = test_clusterer.fit_predict(embeddings_reduced)

                # Score based on number of clusters and noise ratio
                n_clusters = len(set(test_labels)) - (1 if -1 in test_labels else 0)
                noise_ratio = np.sum(test_labels == -1) / len(test_labels)

                if n_clusters > 0:
                    score = n_clusters * (1 - noise_ratio)
                    if score > best_score:
                        best_score = score
                        best_embeddings_reduced = embeddings_reduced
                        self.umap_reducer = umap_reducer
                        print(f"      Good configuration: {n_clusters} clusters, {noise_ratio:.1%} noise")
            except Exception as e:
                print(f"      UMAP config {i+1} failed: {e}")
                continue

        if best_embeddings_reduced is None:
            print(" All UMAP configurations failed, using default")
            self.umap_reducer = umap.UMAP(**self.umap_params)
            best_embeddings_reduced = self.umap_reducer.fit_transform(embeddings)

        embeddings_reduced = best_embeddings_reduced
        print(f"Final reduced embeddings shape: {embeddings_reduced.shape}")

        # Step 2:HDBSCAN with multiple attempts
        print("\n Step 2: HDBSCAN Clustering")
        best_labels = None
        best_clusterer = None
        best_silhouette = -1

        # Fix: `self.dataset_size // 200` raised TypeError when the categorizer was
        # built without a dataset size; fall back to the actual number of samples.
        n_points = self.dataset_size or len(embeddings)

        hdbscan_configs = [
            self.hdbscan_params,
            {**self.hdbscan_params, 'min_cluster_size': max(5, n_points // 200)},
            {**self.hdbscan_params, 'cluster_selection_epsilon': 0.05}
        ]

        for i, config in enumerate(hdbscan_configs):
            print(f"   - Trying HDBSCAN config {i+1}: min_cluster_size={config['min_cluster_size']}")
            try:
                clusterer = hdbscan.HDBSCAN(**config)
                labels = clusterer.fit_predict(embeddings_reduced)

                # Calculate quality metrics
                non_noise_mask = labels != -1
                non_noise_labels = labels[non_noise_mask]

                # Silhouette is only defined for at least two clusters.
                if len(np.unique(non_noise_labels)) >= 2:
                    non_noise_embeddings = embeddings_reduced[non_noise_mask]
                    silhouette = silhouette_score(non_noise_embeddings, non_noise_labels)

                    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
                    noise_ratio = np.sum(labels == -1) / len(labels)

                    print(f"      {n_clusters} clusters, noise: {noise_ratio:.1%}, silhouette: {silhouette:.3f}")

                    if silhouette > best_silhouette and n_clusters >= 2:
                        best_silhouette = silhouette
                        best_labels = labels
                        best_clusterer = clusterer
                else:
                    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
                    print(f"     Only {n_clusters} clusters found")

            except Exception as e:
                print(f" HDBSCAN config {i+1} failed: {e}")
                continue

        if best_labels is None:
            print("All HDBSCAN configurations failed, using last attempt")
            self.clusterer = hdbscan.HDBSCAN(**self.hdbscan_params)
            self.labels_ = self.clusterer.fit_predict(embeddings_reduced)
        else:
            self.clusterer = best_clusterer
            self.labels_ = best_labels

        # Final cluster statistics
        n_clusters = len(set(self.labels_)) - (1 if -1 in self.labels_ else 0)
        n_noise = np.sum(self.labels_ == -1)
        coverage = (len(self.labels_) - n_noise) / len(self.labels_) * 100

        print(f"\n FINAL CLUSTERING RESULTS:")
        print(f"   - Clusters found: {n_clusters}")
        print(f"   - Noise points: {n_noise} ({n_noise/len(self.labels_)*100:.1f}%)")
        print(f"   - Coverage: {coverage:.1f}%")

        # Step 3: Train classifier
        print("\n Step 3: Training Classifier")
        self._train_nn_classifier(embeddings_reduced)

        # Step 4: 2D visualization (fitted on the original embeddings, independent of step 1)
        print("\n Step 4: Generating 2D Visualization")
        try:
            umap_2d = umap.UMAP(n_components=2, random_state=42, metric='cosine', min_dist=0.1)
            self.embedding_2d = umap_2d.fit_transform(embeddings)
            print(" 2D coordinates generated")
        except Exception as e:
            print(f" 2D visualization failed: {e}")

        # Step 5: Improved interpretations
        if product_descriptions is not None and len(np.unique(self.labels_)) > 1:
            print("\n Step 5: Generating Improved Category Interpretations")
            try:
                self._generate_improved_interpretations(embeddings, product_descriptions)
                print(" Category interpretations generated")
            except Exception as e:
                print(f" Interpretation generation failed: {e}")
        else:
            print("\n Step 5: Skipping interpretations")

        return self.labels_

    def _train_nn_classifier(self, embeddings_reduced):
        """Build the nearest-neighbour index and cluster centroids from non-noise points."""
        non_noise_mask = self.labels_ != -1
        non_noise_embeddings = embeddings_reduced[non_noise_mask]
        non_noise_labels = self.labels_[non_noise_mask]

        if len(non_noise_labels) == 0:
            print(" No non-noise clusters found for NN classifier")
            self.nn_classifier = None
            return

        # Calculate cluster centroids
        self.cluster_centroids = {}
        unique_labels = np.unique(non_noise_labels)
        for label in unique_labels:
            mask = non_noise_labels == label
            cluster_points = non_noise_embeddings[mask]
            self.cluster_centroids[label] = np.mean(cluster_points, axis=0)

        # Train KNN classifier. Cap k at the number of indexed points so that
        # `kneighbors` cannot fail when fewer than 5 samples were clustered.
        n_neighbors = min(5, len(non_noise_embeddings))
        self.nn_classifier = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
        self.nn_classifier.fit(non_noise_embeddings)
        self.nn_embeddings = non_noise_embeddings
        self.nn_labels = non_noise_labels

        print(f" Trained NN classifier on {len(non_noise_labels)} non-noise samples")

    def _generate_improved_interpretations(self, embeddings, descriptions):
        """Populate `category_interpretations` with a name, keywords and a sample per cluster.

        Args:
            embeddings: Embeddings aligned with `self.labels_`.
            descriptions: Texts aligned with `embeddings`; either an object with
                `.iloc` (positional indexing is used) or an indexable sequence.
        """
        # Start from a clean slate so a re-fit does not keep stale cluster entries.
        self.category_interpretations = {}

        unique_labels = np.unique(self.labels_)
        if -1 in unique_labels:
            unique_labels = unique_labels[unique_labels != -1]

        print(f" Analyzing {len(unique_labels)} categories...")

        # Extended stop words
        extended_stop_words = {
            "the", "and", "for", "with", "this", "that", "are", "from", "has", "have",
            "product", "description", "item", "name", "value", "bullet", "point", "com",
            "your", "our", "will", "can", "one", "using", "use", "made", "make", "includes",
            "include", "features", "feature", "benefits", "benefit", "specifications"
        }

        for label in unique_labels:
            mask = self.labels_ == label
            category_embeddings = embeddings[mask]

            if hasattr(descriptions, 'iloc'):
                category_desc = descriptions.iloc[np.where(mask)[0]]
            else:
                category_desc = [descriptions[i] for i in np.where(mask)[0]]

            if len(category_embeddings) == 0:
                continue

            # Find representative product (closest to centroid)
            centroid = np.mean(category_embeddings, axis=0)
            distances = np.linalg.norm(category_embeddings - centroid, axis=1)
            rep_idx = np.argmin(distances)

            # Extract meaningful keywords
            all_text = " ".join([str(desc) for desc in category_desc])

            # Use multiple techniques to extract keywords
            words = re.findall(r"\b[a-zA-Z]{3,15}\b", all_text.lower())
            words = [w for w in words if w not in extended_stop_words and not w.isdigit()]

            # Get frequency-based keywords: a word must occur at least twice and
            # at least once per ten descriptions of the cluster.
            word_freq = Counter(words)
            common_words = [w for w, count in word_freq.most_common(10) if count >= max(2, len(category_desc) * 0.1)]

            # Get representative description
            if hasattr(category_desc, 'iloc'):
                rep_desc = category_desc.iloc[rep_idx]
            else:
                rep_desc = category_desc[rep_idx]

            # Create a meaningful category name from top keywords
            category_name = " & ".join(common_words[:3]).title() if common_words else f"Category {label}"

            short_desc = str(rep_desc)
            if len(short_desc) > 150:
                short_desc = short_desc[:147] + "..."

            self.category_interpretations[label] = {
                "size": len(category_embeddings),
                "category_name": category_name,
                "representative_description": short_desc,
                "top_keywords": common_words[:5],
                "sample_size": len(category_embeddings)
            }

    def predict(self, embeddings, method='hybrid'):
        """Assign cluster labels to new embeddings.

        Args:
            embeddings: 2-D array of embeddings in the same space used for `fit`.
            method: One of 'hdbscan', 'nearest_neighbor', 'centroid', 'hybrid'.

        Returns:
            Array of labels (-1 = noise). If training found no clusters, or the
            prediction itself fails, every sample is labelled -1.

        Raises:
            ValueError: if the model is not fitted or `method` is unknown.
        """
        if self.umap_reducer is None or self.clusterer is None:
            raise ValueError("Model must be fitted before prediction")

        # Fix: the unknown-method error used to be raised inside the try block
        # below, where it was swallowed and silently turned into all-noise labels.
        if method not in self._PREDICTION_METHODS:
            raise ValueError(f"Unknown prediction method: {method}")

        unique_labels = np.unique(self.labels_)
        non_noise_labels = unique_labels[unique_labels != -1]

        if len(non_noise_labels) == 0:
            print(" No clusters found in training, returning all noise labels")
            return np.array([-1] * len(embeddings))

        print(f" Predicting categories for {len(embeddings)} samples using {method} method...")

        try:
            embeddings_reduced = self.umap_reducer.transform(embeddings)

            if method == 'hdbscan':
                return self._predict_hdbscan(embeddings_reduced)
            elif method == 'nearest_neighbor':
                return self._predict_nearest_neighbor(embeddings_reduced)
            elif method == 'centroid':
                return self._predict_centroid(embeddings_reduced)
            else:  # 'hybrid' (method was validated above)
                return self._predict_hybrid(embeddings_reduced)

        except Exception as e:
            print(f" Prediction failed: {e}")
            return np.array([-1] * len(embeddings))

    def _predict_hdbscan(self, embeddings_reduced):
        """Use HDBSCAN's approximate_predict method; weak assignments become noise."""
        try:
            labels, strengths = hdbscan.approximate_predict(self.clusterer, embeddings_reduced)
            min_strength = 0.3
            labels[strengths < min_strength] = -1

            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            print(f"HDBSCAN prediction: {n_clusters} clusters found")
            return labels
        except Exception as e:
            print(f" HDBSCAN prediction failed: {e}")
            return np.array([-1] * len(embeddings_reduced))

    def _predict_nearest_neighbor(self, embeddings_reduced):
        """Majority vote among the nearest training points; far-away samples become noise."""
        if self.nn_classifier is None:
            return np.array([-1] * len(embeddings_reduced))

        distances, indices = self.nn_classifier.kneighbors(embeddings_reduced)
        neighbor_labels = self.nn_labels[indices]

        labels = []
        for i in range(len(embeddings_reduced)):
            unique, counts = np.unique(neighbor_labels[i], return_counts=True)
            if len(unique) > 0:
                majority_label = unique[np.argmax(counts)]
                # Check if the distance is reasonable
                avg_distance = np.mean(distances[i])
                if avg_distance < 1.0:
                    labels.append(majority_label)
                else:
                    labels.append(-1)
            else:
                labels.append(-1)

        labels = np.array(labels)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        print(f" NN prediction: {n_clusters} clusters found")
        return labels

    def _predict_centroid(self, embeddings_reduced):
        """Assign to nearest cluster centroid, or -1 when the nearest one is 1.0 or farther."""
        if not self.cluster_centroids:
            return np.array([-1] * len(embeddings_reduced))

        labels = []
        for point in embeddings_reduced:
            min_dist = float('inf')
            best_label = -1

            for label, centroid in self.cluster_centroids.items():
                dist = np.linalg.norm(point - centroid)
                if dist < min_dist:
                    min_dist = dist
                    best_label = label

            # Use distance threshold
            if min_dist < 1.0:
                labels.append(best_label)
            else:
                labels.append(-1)

        labels = np.array(labels)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        print(f" Centroid prediction: {n_clusters} clusters found")
        return labels

    def _predict_hybrid(self, embeddings_reduced):
        """Hybrid approach: HDBSCAN if it covers >50%, else NN if it covers more, else centroids."""
        print(" Trying hybrid prediction approach...")

        # Try HDBSCAN first
        hdbscan_labels = self._predict_hdbscan(embeddings_reduced)
        hdbscan_coverage = np.sum(hdbscan_labels != -1) / len(hdbscan_labels)

        if hdbscan_coverage > 0.5:
            print(f" Using HDBSCAN prediction (coverage: {hdbscan_coverage:.1%})")
            return hdbscan_labels

        # Otherwise try NN
        nn_labels = self._predict_nearest_neighbor(embeddings_reduced)
        nn_coverage = np.sum(nn_labels != -1) / len(nn_labels)

        if nn_coverage > hdbscan_coverage:
            print(f" Using NN prediction (coverage: {nn_coverage:.1%})")
            return nn_labels

        # Finally try centroid
        centroid_labels = self._predict_centroid(embeddings_reduced)
        print(f" Using centroid prediction")
        return centroid_labels

# IMPROVED MAIN PIPELINE
def improved_main_pipeline(train_path="dataset/train.csv",
                           test_path="dataset/test.csv",
                           model_name="sentence-transformers/all-mpnet-base-v2"):
    """Run the full embedding -> clustering -> prediction -> reporting pipeline.

    Relies on helpers that are not defined in this cell and are expected to
    exist elsewhere in the notebook: `save_embeddings`, `save_categories`,
    `analyze_clustering_quality`, `plot_clusters` and `plot_category_sizes`.

    Args:
        train_path: CSV with a "catalog_content" column used for fitting.
        test_path: CSV with a "catalog_content" column to categorize.
        model_name: SentenceTransformer model passed to `SemanticEmbedder`.

    Returns:
        `(categorizer, train_labels, test_labels)` on success, or None when any
        step fails (the error and traceback are printed).
    """
    print(" STARTING IMPROVED SEMANTIC EMBEDDING & CATEGORIZATION PIPELINE")
    print("=" * 70)
    print(" MODE: Improved clustering with meaningful categories")
    print("=" * 70)

    try:
        # Step 0: Load data
        print("\n STEP 1: Loading Data")
        print("-" * 40)

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Handle missing values
        train_df["catalog_content"] = train_df["catalog_content"].fillna("No description")
        test_df["catalog_content"] = test_df["catalog_content"].fillna("No description")

        print(f" Data loaded:")
        print(f"   - Training samples: {len(train_df)}")
        print(f"   - Test samples: {len(test_df)}")

        # Step 1: Extract embeddings with improved model
        print("\n STEP 2: Extracting Improved Semantic Embeddings")
        print("-" * 40)

        embedder = SemanticEmbedder(model_name)

        print(" Generating TRAIN embeddings...")
        train_embeddings = embedder.get_embeddings(train_df["catalog_content"])

        print(" Generating TEST embeddings...")
        test_embeddings = embedder.get_embeddings(test_df["catalog_content"])

        # Save embeddings
        save_embeddings(train_embeddings, "improved_train_embeddings.npy")
        save_embeddings(test_embeddings, "improved_test_embeddings.npy")

        # Step 2: Improved categorization
        print("\n STEP 3: Improved Automatic Categorization")
        print("-" * 40)

        categorizer = ImprovedAutoCategorizer(dataset_size=len(train_df))
        train_labels = categorizer.fit(train_embeddings, train_df["catalog_content"])

        # Predict categories for test data
        print("\n STEP 3b: Predicting Categories for Test Data")
        print("-" * 40)

        test_labels = categorizer.predict(test_embeddings, method='hybrid')

        # Analyze clustering quality (skipped when every sample is noise)
        unique_train_labels = np.unique(train_labels)
        if len(unique_train_labels) > 1 or (len(unique_train_labels) == 1 and unique_train_labels[0] != -1):
            analyze_clustering_quality(train_embeddings, train_labels)
        else:
            print("No clusters found for quality analysis")

        # Print improved category summary
        if categorizer.category_interpretations:
            print_improved_category_summary(categorizer.category_interpretations, len(train_labels))
        else:
            print("No category interpretations available")

        # Save results
        save_categories(train_labels, categorizer.category_interpretations, "improved_train_categories.npy")
        save_categories(test_labels, categorizer.category_interpretations, "improved_test_categories.npy")

        # Step 3: Visualizations
        print("\n STEP 4: Creating Visualizations")
        print("-" * 40)

        if categorizer.embedding_2d is not None:
            plot_clusters(categorizer.embedding_2d, train_labels, "Improved Train Data - Product Categories")

            # Generate 2D for test data (a separate projection fitted on the test set)
            umap_2d_test = umap.UMAP(n_components=2, random_state=42, metric='cosine', min_dist=0.1)
            test_embedding_2d = umap_2d_test.fit_transform(test_embeddings)
            plot_clusters(test_embedding_2d, test_labels, "Improved Test Data - Product Categories")

        plot_category_sizes(train_labels, "improved_train_category_sizes.png")
        plot_category_sizes(test_labels, "improved_test_category_sizes.png")

        print("\n IMPROVED PIPELINE COMPLETED SUCCESSFULLY!")
        print("=" * 70)

        # Final statistics: the same report for both splits, differing only in wording.
        print(f"\n IMPROVED RESULTS SUMMARY:")
        for heading, labels, verb in (("TRAIN DATA", train_labels, "discovered"),
                                      ("TEST DATA", test_labels, "assigned")):
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = np.sum(labels == -1)
            coverage = (len(labels) - n_noise) / len(labels) * 100

            print(f"   {heading}:")
            print(f"     - Total products: {len(labels)}")
            print(f"     - Categories {verb}: {n_clusters}")
            print(f"     - Noise products: {n_noise} ({n_noise/len(labels)*100:.1f}%)")
            print(f"     - Categorized products: {len(labels)-n_noise} ({coverage:.1f}%)")

        return categorizer, train_labels, test_labels

    except Exception as e:
        print(f"\n PIPELINE FAILED: {e}")
        import traceback
        traceback.print_exc()
        # Explicit: callers must check for None before unpacking the result.
        return None

# IMPROVED ANALYSIS FUNCTIONS
def print_improved_category_summary(interpretations, total_samples):
    """Print the 15 largest discovered categories, biggest first.

    Args:
        interpretations: Mapping of cluster label to an info dict providing
            'size', 'category_name', 'representative_description' and
            'top_keywords'.
        total_samples: Denominator used for each category's percentage share.
    """
    if not interpretations:
        print(" No category interpretations available")
        return

    print(f"\n MEANINGFUL CATEGORIES DISCOVERED")
    print("=" * 60)

    # Only the info dicts are needed for the report; order them by size, descending.
    by_size = sorted(interpretations.values(), key=lambda entry: entry['size'], reverse=True)

    for rank, entry in enumerate(by_size[:15], start=1):
        share = (entry['size'] / total_samples) * 100
        keyword_list = ', '.join(entry['top_keywords'])
        # One call emits the three detail lines plus the trailing blank line.
        print(
            f"{rank:2d}. {entry['category_name']} (Size: {entry['size']:4d} products, {share:.1f}%)",
            f"     Sample: {entry['representative_description']}",
            f"     Keywords: {keyword_list}",
            "",
            sep="\n",
        )

# Keep the existing utility functions (save_embeddings, load_embeddings, etc.)

# Fix: `_name_` / "_main_" (single underscores) raised NameError; the module
# attribute is `__name__` and the entry-point value is "__main__".
if __name__ == "__main__":
    improved_main_pipeline()

# Structure Feature from train_embedding.npy

In [None]:
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from typing import Tuple



# Input tables, resolved relative to the notebook's working directory.
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'


def load_data():
    """Read the train and test CSV files into two DataFrames.

    Returns:
        `(df_train, df_test)`.

    Raises:
        FileNotFoundError: if either file is missing. The previous version only
            printed a message and implicitly returned None, which then failed
            with an unrelated TypeError when the caller unpacked the result.
    """
    try:
        df_train = pd.read_csv(TRAIN_FILE)
        df_test = pd.read_csv(TEST_FILE)
    except FileNotFoundError:
        print(f"Error: Could not find '{TRAIN_FILE}' or '{TEST_FILE}'. Please check paths.")
        raise

    return df_train, df_test


# Counters initialised to zero here; nothing in this cell updates them.
N_brands = 0
N_categories = 0

# --- Build TF-IDF representation of clusters ---


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the saved category payload: a pickled dict stored inside a 0-d object array.
# SECURITY: allow_pickle=True executes arbitrary code from the file when loading —
# only use it with files produced by this project.
data = np.load("all_categories.npy", allow_pickle=True).item()
interpretations = data["interpretations"]

# Build one text per cluster (its representative description followed by its
# keywords), keeping ids and texts in the same order so row i of the TF-IDF
# matrix corresponds to cluster_ids[i].
cluster_ids = []
cluster_texts = []
for cluster_id, v in interpretations.items():
    cluster_ids.append(cluster_id)
    cluster_texts.append(f"{v['representative_description']} {' '.join(v['top_keywords'])}")

vectorizer = TfidfVectorizer(stop_words='english')
cluster_vectors = vectorizer.fit_transform(cluster_texts)
import re

def extract_brand(content: str) -> str:
    """Guess the brand from the "Item Name:" line of a catalog entry.

    The item name is cut at the first dash, comma or colon, and the leading run
    of words that start with a letter is returned with trailing stop words
    removed.

    Args:
        content: Raw catalog text expected to contain an "Item Name: ..." line.

    Returns:
        The extracted brand words joined by spaces, or 'Unknown' when the input
        is not a string, has no item name, or yields no usable words.
    """
    # Same defensive guard as the sibling extractors: missing values arrive as NaN/None.
    if not isinstance(content, str):
        return 'Unknown'

    # 1. Extract 'Item Name' (up to the end of its line; the name may also be
    #    the last line of the text, without a trailing newline)
    item_name_match = re.search(r'Item Name:\s*(.*?)(?:\n|$)', content, re.DOTALL)
    if not item_name_match:
        return 'Unknown'

    item_name = item_name_match.group(1).strip()
    if not item_name:
        return 'Unknown'

    # 2. Drop everything from the first dash/comma/colon onwards
    item_name = re.sub(r'[-—–,:].*$', '', item_name).strip()

    # 3. Split into words
    words = item_name.split()
    if not words:
        return 'Unknown'

    # 4. Collect the leading words that start with a letter; stop at the first
    #    word that starts with a digit or symbol. (The former stop-word branch
    #    was unreachable: every stop word starts with a letter.)
    stop_words = {'and', 'or', 'the', 'for', 'a', 'an'}
    brand_words = []
    for word in words:
        if not word[0].isalpha():
            break
        brand_words.append(word)

    # Remove trailing stop words
    while brand_words and brand_words[-1].lower() in stop_words:
        brand_words.pop()

    # A name made only of stop words used to yield an empty string.
    if not brand_words:
        return 'Unknown'

    return " ".join(brand_words)


def extract_ipq(content: str) -> float:
    """Extract a numeric quantity value from catalog text.

    Preference order: the number following "Value:", then the first number
    directly followed by one of oz / g / lb / count / pack (case-insensitive).

    Args:
        content: Raw catalog text.

    Returns:
        The matched number as a float, or 1.0 when the input is not a string
        or nothing matches.
    """
    # Consistent with extract_quantity / extract_unit_value: NaN/None must not
    # crash re.search with a TypeError.
    if not isinstance(content, str):
        return 1.0

    value_match = re.search(r'Value:\s*(\d+\.?\d*)', content, re.IGNORECASE)
    if value_match:
        return float(value_match.group(1))

    # NOTE(review): units are matched as prefixes, so "5 grams" matches via "g" —
    # but so does "5 great"; tighten only together with explicit long unit names.
    unit_match = re.search(r'(\d+\.?\d*)\s*(oz|g|lb|count|pack)', content, re.IGNORECASE)
    if unit_match:
        return float(unit_match.group(1))

    return 1.0

def extract_quantity(content: str) -> float:
    """Extract a pack/count quantity from catalog text.

    Patterns are tried in order — "pack of N", "N-pack", "N pack", "N count",
    "x N" — and the first pattern that matches anywhere in the text wins.

    Args:
        content: Raw catalog text.

    Returns:
        The matched count as a float, or 1.0 when the input is not a string or
        no pattern matches.
    """
    if not isinstance(content, str):
        return 1.0

    patterns = [
        r'pack\s*of\s*(\d+)',
        r'(\d+)\s*-\s*pack',
        r'(\d+)\s*pack',
        r'(\d+)\s*count',
        # The multiplier "x" must not be the tail of a word: previously
        # "box 12" or "Max 100" were read as quantities 12 and 100.
        r'(?<![a-z])x\s*(\d+)'
    ]
    for pat in patterns:
        m = re.search(pat, content, re.IGNORECASE)
        if m:
            return float(m.group(1))
    return 1.0

def extract_unit_value(content: str) -> float:
    """Extract the first "<number> <unit>" measure and convert it to a base unit.

    Volumes are converted to millilitres and weights to grams using the factors
    in the table below ("oz" uses 28.35, i.e. the weight ounce).

    Args:
        content: Raw catalog text.

    Returns:
        The converted value, or 0.0 when the input is not a string or no
        measure is found.
    """
    if not isinstance(content, str):
        return 0.0

    # Match number + unit (oz, g, ml, etc.). Longer unit names come first in the
    # alternation: with the old order "l" was tried before "lb", so "5 lb" was
    # parsed as 5 litres (5000) instead of 5 pounds.
    match = re.search(r'(\d+\.?\d*)\s*(ml|kg|lb|oz|l|g)', content, re.IGNORECASE)
    if not match:
        return 0.0

    value, unit = float(match.group(1)), match.group(2).lower()

    conversions = {
        'ml': 1,
        'l': 1000,
        'oz': 28.35,
        'g': 1,
        'kg': 1000,
        'lb': 453.59
    }
    return value * conversions.get(unit, 1.0)


def extract_category(content: str) -> str:
    """Assign a short, human-readable category label to a catalog entry.

    The text is vectorised and compared against precomputed cluster vectors by
    cosine similarity. The best-matching cluster's top keywords, minus generic
    "nuisance" words, form the label.

    Depends on names defined elsewhere in the notebook: ``vectorizer``,
    ``cluster_vectors``, ``cluster_ids``, ``interpretations`` (each entry
    exposing ``"top_keywords"``) and ``cosine_similarity``.

    Args:
        content: Raw catalog text.

    Returns:
        One of, in order of preference:
        * up to three meaningful cluster keywords, title-cased;
        * the first raw cluster keyword (prefixed ``"Product - "`` and
          upper-cased when it has two characters or fewer);
        * ``"Content - "`` plus the first two non-nuisance words of the text;
        * ``"Miscellaneous"`` when the input is empty, not a string, or
          nothing usable is found.
    """
    if not content or not isinstance(content, str):
        return "Miscellaneous"

    text_lower = content.lower().strip()

    # Generic words that carry no category information.
    NUISANCE_WORDS = {
        'point', 'bullet', 'value', 'our', 'your', 'for', 'item', 'name',
        'product', 'description', 'pack', 'unit', 'count', 'ounce', 'oz',
        'fl', 'fluid', 'gallon', 'quart', 'liter', 'ml', 'g', 'kg', 'lb',
        'can', 'bottle', 'case', 'box', 'set', 'bag', 'mix', 'style',
        'assortment', 'free', 'natural', 'pure', 'food', 'drink', 'premium',
        'great', 'delicious', 'gift', 'kit', 'original', 'favorite'
    }

    # Retrieve the cluster whose vector is most similar to this text.
    text_vector = vectorizer.transform([text_lower])
    sims = cosine_similarity(text_vector, cluster_vectors).flatten()
    best_cluster_idx = sims.argmax()
    best_cluster_id = cluster_ids[best_cluster_idx]

    # Keywords describing the winning cluster.
    top_words = interpretations[best_cluster_id]["top_keywords"]

    # Tier 1: keep only keywords that are not generic and use the top three.
    meaningful_words = [word for word in top_words if word.lower() not in NUISANCE_WORDS]
    if meaningful_words:
        return " ".join(meaningful_words[:3]).title()

    # Tier 2: every keyword was generic, so fall back to the first raw keyword.
    if top_words:
        fallback_word = top_words[0]
        # Very short words are usually abbreviations; mark them explicitly.
        if len(fallback_word) > 2:
            return fallback_word.title()
        return "Product - " + fallback_word.upper()

    # Tier 3: the cluster has no keywords at all, so label from the text itself.
    content_words = [word.strip() for word in text_lower.split()
                     if word.strip() and word.lower() not in NUISANCE_WORDS]
    if content_words:
        return "Content - " + " ".join(content_words[:2]).title()

    return "Miscellaneous"

# --- MAIN PROCESSING PIPELINE ---


In [None]:
import json
def prepare_structured_features(df_train: pd.DataFrame, df_test: pd.DataFrame):
    """Derive, encode and scale the structured features for both splits.

    Both frames are modified in place and also returned. As a side effect the
    module-level globals ``N_brands`` and ``N_categories`` are set to the
    number of classes seen by the respective label encoder.

    Steps:
        1. Apply the ``extract_*`` helpers to ``catalog_content``.
        2. Label-encode ``brand`` and ``category`` using the union of labels
           from train and test, so every test label has a code.
        3. Standard-scale ``ipq``, ``quantity`` and ``unit_value``; the scaler
           is fitted on train only and NaNs are replaced by 0 first.

    Args:
        df_train: Training frame with a ``catalog_content`` column.
        df_test: Test frame with a ``catalog_content`` column.

    Returns:
        The ``(df_train, df_test)`` pair with the new columns added.
    """
    global N_brands, N_categories

    print("1. Extracting Brand, IPQ, Category, Quantity, and Unit...")
    column_extractors = (
        ('brand', extract_brand),
        ('ipq', extract_ipq),
        ('category', extract_category),
        ('quantity', extract_quantity),
        ('unit_value', extract_unit_value),
    )
    for frame in (df_train, df_test):
        for column, extractor in column_extractors:
            frame[column] = frame['catalog_content'].apply(extractor)

    print("2. Encoding categorical features...")
    encoders = {}
    for column in ('brand', 'category'):
        # Fit on labels from both splits so transform() never meets an unseen class.
        known_labels = pd.concat([df_train[column], df_test[column]]).unique()
        encoders[column] = LabelEncoder().fit(known_labels)
        for frame in (df_train, df_test):
            frame[f'{column}_encoded'] = encoders[column].transform(frame[column])

    N_brands = len(encoders['brand'].classes_)
    N_categories = len(encoders['category'].classes_)

    print("3. Scaling numerical features (IPQ, Quantity, Unit)...")
    raw_numeric_cols = ['ipq', 'quantity', 'unit_value']
    scaled_numeric_cols = ['ipq_scaled', 'quantity_scaled', 'unit_scaled']
    num_scaler = StandardScaler()

    # Statistics come from the training split only; test is merely transformed.
    df_train[scaled_numeric_cols] = num_scaler.fit_transform(df_train[raw_numeric_cols].fillna(0))
    df_test[scaled_numeric_cols] = num_scaler.transform(df_test[raw_numeric_cols].fillna(0))

    print(f"\nEncoding complete. Found {N_brands} brands and {N_categories} categories.")

    return df_train, df_test


def save_structured_output(df_train: pd.DataFrame, df_test: pd.DataFrame,
                           N_brands: int, N_categories: int):
    """Persist the structured features and their metadata to the working directory.

    Files written:
        * ``structured_features_train.npy`` / ``structured_features_test.npy``:
          the five feature columns as arrays.
        * ``structured_model_config.json``: class counts and the column order.
        * ``structured_train_data.json``: the training features plus the raw
          ``brand`` and ``category`` labels as records. Skipped with a warning
          if any of those columns is missing from ``df_train``.

    Args:
        df_train: Processed training frame.
        df_test: Processed test frame.
        N_brands: Number of distinct brand classes.
        N_categories: Number of distinct category classes.
    """
    def _dump_json(path, payload, **dump_options):
        # Small local helper so both JSON exports share the same open/encoding logic.
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=4, **dump_options)

    # Column order here is the order the saved arrays use.
    feature_cols = ['brand_encoded', 'category_encoded', 'ipq_scaled', 'quantity_scaled', 'unit_scaled']

    train_features = df_train[feature_cols].values
    np.save('./structured_features_train.npy', train_features)
    print(f" Saved training structured features → structured_features_train.npy (Shape: {train_features.shape})")

    test_features = df_test[feature_cols].values
    np.save('./structured_features_test.npy', test_features)
    print(f"Saved test structured features → structured_features_test.npy (Shape: {test_features.shape})")

    _dump_json(
        './structured_model_config.json',
        {
            "N_brands": N_brands,
            "N_categories": N_categories,
            "feature_order": feature_cols
        },
    )
    print(" Saved model configuration → structured_model_config.json")

    readable_cols = feature_cols + ['brand', 'category']
    if not set(readable_cols).issubset(df_train.columns):
        print(" Warning: Some readable columns not found; skipping JSON export.")
    else:
        train_records = df_train[readable_cols].to_dict(orient='records')
        _dump_json('./structured_train_data.json', train_records, ensure_ascii=False)
        print(f" Saved readable structured data → structured_train_data.json ({len(train_records)} records)")

    print("\n All structured outputs saved successfully.")

# --- EXECUTION BLOCK ---


In [None]:

if __name__ == '__main__':
    # Load the raw frames, then derive/encode/scale the structured columns.
    df_train, df_test = load_data()
    df_train_processed, df_test_processed = prepare_structured_features(df_train, df_test)

    # Quick look at the encoded columns next to their human-readable source labels.
    preview_columns = ['brand_encoded', 'category_encoded', 'ipq_scaled', 'brand', 'category']
    print("\n--- Training Data Sample (Final Structured Inputs) ---")
    print(df_train_processed[preview_columns].head())

    # Sizes a downstream embedding layer would need (set as globals by the pipeline).
    ipq_scaled_shape = df_train_processed['ipq_scaled'].values.shape
    for summary_line in (
        "\n--- Keras/TF Inputs for Structured Branch ---",
        f"Total Unique Brands (Embedding input_dim): {N_brands}",
        f"Total Unique Categories (Embedding input_dim): {N_categories}",
        f"IPQ Scaled Array Shape: {ipq_scaled_shape}",
    ):
        print(summary_line)

    # Persist arrays and metadata for the training cells further down.
    save_structured_output(
        df_train_processed,
        df_test_processed,
        N_brands=N_brands,
        N_categories=N_categories,
    )

In [None]:
import numpy as np
from pprint import pprint

# Load the saved object.
# NOTE(security): allow_pickle=True unpickles the file and can execute arbitrary
# code embedded in it, so only load files you produced or trust.
loaded = np.load("all_categories.npy", allow_pickle=True)

# np.save stores a dict as a 0-d object array and .item() unwraps it. Calling
# .item() on an array with more than one element raises, so check ndim first.
payload = loaded.item() if loaded.ndim == 0 else loaded
if not isinstance(payload, dict):
    raise TypeError(
        "all_categories.npy was expected to contain a pickled dict, "
        f"got {type(payload).__name__} (array shape {loaded.shape})"
    )
data = payload

# See what keys exist
print("Keys in file:", list(data.keys()))

# Display a short summary for each key instead of dumping whole values
for key, value in data.items():
    print(f"\n--- {key} ---")
    if isinstance(value, np.ndarray):
        print(f"Shape: {value.shape}, dtype: {value.dtype}")
    elif isinstance(value, dict):
        print(f"Dictionary with {len(value)} entries. Example:")
        pprint(list(value.items())[:5])
    else:
        print("Type:", type(value))
        print("Example:", value)


# --- Training the model from train.csv, train_embeddings.npy, structured_features_train.npy, and merged_image_features_75k.npy ---

In [None]:
# Full Pipeline: Supervised Autoencoder + Stacking Ensemble

import os, gc, joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Seed NumPy and PyTorch so simulated features, weight init and shuffling repeat.
for set_seed in (np.random.seed, torch.manual_seed):
    set_seed(42)

# Load the training table; fail early with a clear message if it is absent.
train_path = "train.csv"
if os.path.exists(train_path):
    df_train = pd.read_csv(train_path)
else:
    raise FileNotFoundError(" train.csv not found")
print(f" Loaded train.csv: {df_train.shape}")

#  Load / Simulate Features
def load_or_simulate(path, shape):
    """Load a feature matrix from ``path`` or fabricate a random stand-in.

    Args:
        path: Location of a ``.npy`` file.
        shape: Shape of the random array to generate when the file is missing.

    Returns:
        A float32 array: the file contents if ``path`` exists, otherwise
        uniform random values in [0, 1) of the requested shape.
    """
    filename = os.path.basename(path)
    if not os.path.exists(path):
        # Placeholder data keeps the pipeline runnable; it carries no signal.
        print(f"{filename} not found — simulating features")
        return np.random.rand(*shape).astype(np.float32)

    features = np.load(path)
    print(f" Loaded {filename}: {features.shape}")
    return features.astype(np.float32)

# One row per training sample in every modality; the widths below are only used
# when a file is missing and a random placeholder has to be generated.
n_train_rows = len(df_train)
X_structured = load_or_simulate("structured_features_train.npy", (n_train_rows, 5))
X_text = load_or_simulate("train_embeddings.npy", (n_train_rows, 384))
X_image = load_or_simulate("merged_image_features_75k.npy", (n_train_rows, 2048))

# Standardise each modality with its own scaler; the fitted scalers are saved
# later so inference can apply the identical transform.
scaler_struct = StandardScaler().fit(X_structured)
X_structured_scaled = scaler_struct.transform(X_structured)

scaler_text = StandardScaler().fit(X_text)
X_text_scaled = scaler_text.transform(X_text)

scaler_image = StandardScaler().fit(X_image)
X_image_scaled = scaler_image.transform(X_image)

# Tensors for the supervised autoencoder: scaled image features as input and
# log1p(price) as an (N, 1) regression target.
X_image_tensor = torch.from_numpy(X_image_scaled).float()
log_price_column = np.log1p(df_train["price"].values).astype(np.float32).reshape(-1, 1)
y_tensor = torch.from_numpy(log_price_column)

class SupervisedAutoencoder(nn.Module):
    """Autoencoder with an extra regression head on the bottleneck.

    The encoder compresses ``input_dim`` features to ``bottleneck_dim``, the
    decoder reconstructs the input from that code, and a single linear layer
    predicts one scalar target from the same code.

    Args:
        input_dim: Width of the input feature vector.
        bottleneck_dim: Width of the compressed representation.
    """

    # The constructor must be spelled __init__ (double underscores). With
    # `_init_` Python never calls it, so no layers are registered and
    # SupervisedAutoencoder(input_dim=...) raises a TypeError.
    def __init__(self, input_dim=2048, bottleneck_dim=512):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, bottleneck_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, input_dim)
        )
        self.regressor = nn.Linear(bottleneck_dim, 1)

    def forward(self, x):
        """Return ``(reconstruction, prediction, bottleneck_code)`` for ``x``."""
        z = self.encoder(x)
        x_hat = self.decoder(z)
        y_pred = self.regressor(z)
        return x_hat, y_pred, z

# Hyperparameters of the supervised autoencoder.
bottleneck_dim, epochs, batch_size = 512, 50, 256
learning_rate = 1e-3
alpha = 0.5  # weight of the reconstruction loss; (1 - alpha) goes to price regression

# The model is built before any batch is drawn so the seeded RNG is consumed in
# the same order on every run.
model = SupervisedAutoencoder(
    input_dim=X_image_scaled.shape[1],
    bottleneck_dim=bottleneck_dim,
)
criterion_recon, criterion_reg = nn.MSELoss(), nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

dataset = TensorDataset(X_image_tensor, y_tensor)
loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Training loop: minimise a weighted sum of reconstruction and regression MSE.
model.train()
for epoch in range(epochs):
    total_loss = 0
    for x_batch, y_batch in loader:
        optimizer.zero_grad()
        reconstruction, price_pred, _ = model(x_batch)
        recon_term = criterion_recon(reconstruction, x_batch)
        reg_term = criterion_reg(price_pred, y_batch)
        loss = alpha*recon_term + (1-alpha)*reg_term
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a true per-sample mean.
        total_loss += loss.item() * x_batch.size(0)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(dataset):.6f}")

# Encode every image feature vector into its bottleneck code (third output).
model.eval()
with torch.no_grad():
    X_image_bottleneck = model(X_image_tensor)[2].numpy().astype(np.float32)

print(f" Image embeddings reduced: {X_image_scaled.shape} → {X_image_bottleneck.shape}")

# Fuse the modalities column-wise: structured | text | image bottleneck.
X_full = np.concatenate(
    [X_structured_scaled, X_text_scaled, X_image_bottleneck], axis=1
).astype(np.float32)
y = np.log1p(df_train["price"].values.astype(np.float32))
print(f" Combined features shape: {X_full.shape}")

# The wide image matrices are no longer needed; release them.
del X_image_scaled, X_image_tensor
gc.collect()

#  Define Base Models
# Three first-level regressors; their out-of-fold predictions become the inputs
# of the meta model.
base_models = {
    "lgb": lgb.LGBMRegressor(
        n_estimators=200, num_leaves=31, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=1, verbose=-1
    ),
    "xgb": xgb.XGBRegressor(
        n_estimators=200, max_depth=5, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, tree_method='hist',
        random_state=42, n_jobs=1, verbosity=0
    ),
    "mlp": MLPRegressor(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        batch_size=512,
        learning_rate_init=0.001,
        max_iter=300,
        random_state=42,
        verbose=False
    )
}

#  K-Fold Stacking
# meta_features[i, j] holds model j's prediction for row i, produced by a fit
# that never saw row i.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
meta_features = np.zeros((len(X_full), len(base_models)), dtype=np.float32)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full), 1):
    print(f"\n===== Fold {fold} =====")
    X_train, X_val = X_full[train_idx], X_full[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    for i, (name, model_) in enumerate(base_models.items()):
        print(f"→ Training {name.upper()}...")
        model_.fit(X_train, y_train)
        preds_val = model_.predict(X_val)
        meta_features[val_idx, i] = preds_val
        rmse = np.sqrt(mean_squared_error(y_val, preds_val))
        print(f"   RMSE ({name}): {rmse:.4f}")
    gc.collect()

# After the loop each estimator only holds the fit from the last fold (80% of
# the rows). These objects are what gets saved and used at inference, so refit
# them on the complete training set.
print("\n===== Refit on full training set =====")
for name, model_ in base_models.items():
    print(f"→ Refitting {name.upper()} on all rows...")
    model_.fit(X_full, y)
gc.collect()

#  Train Meta Model
meta_params = dict(n_estimators=300, learning_rate=0.03, num_leaves=31, random_state=42)

# Scoring the meta model on the rows it was trained on gives an optimistic
# figure. Estimate its error from out-of-fold predictions instead, reusing the
# same KFold splitter as the base models.
# NOTE(review): the supervised autoencoder above was fitted on all rows with
# their targets, so even these scores may be somewhat optimistic.
meta_oof_preds = np.zeros(len(y), dtype=np.float64)
for meta_train_idx, meta_val_idx in kf.split(meta_features):
    fold_meta_model = lgb.LGBMRegressor(**meta_params)
    fold_meta_model.fit(meta_features[meta_train_idx], y[meta_train_idx])
    meta_oof_preds[meta_val_idx] = fold_meta_model.predict(meta_features[meta_val_idx])

# The model that is saved is trained on every row.
meta_model = lgb.LGBMRegressor(**meta_params)
meta_model.fit(meta_features, y)
print("Meta model trained successfully")

#  Evaluate (out-of-fold; RMSE and R² in log1p space, SMAPE in price space)
meta_preds = meta_oof_preds
rmse = np.sqrt(mean_squared_error(y, meta_preds))
r2 = r2_score(y, meta_preds)
pred_price, true_price = np.expm1(meta_preds), np.expm1(y)
# Absolute values in the denominator keep SMAPE well-defined if a prediction
# maps to a negative price.
smape_val = 100*np.mean(2*np.abs(pred_price-true_price)/(np.abs(pred_price)+np.abs(true_price)+1e-8))

print(f"\n Validation Results:\nRMSE: {rmse:.4f}\nR²: {r2:.4f}\nSMAPE: {smape_val:.2f}%")

# Save Models
# Everything inference needs goes into one bundle: the fitted scalers, the
# autoencoder weights, the base estimators and the meta model.
final_model = {
    "scalers": {"structured": scaler_struct, "text": scaler_text, "image": scaler_image},
    "autoencoder": model.state_dict(),
    "base_models": base_models,
    "meta_model": meta_model
}
joblib.dump(final_model, "stacked_model_supervised_autoencoder.pkl")
print(" Models saved successfully!")

# **--- Now test the model using the stacked_model_supervised_autoencoder.pkl file, test_embeddings.npy, test.csv, and structured_features_test.npy ---**

# -- Then write the predictions to Super_test_out.csv. --

# Inference Pipeline: Predict Product Price (Test Data)


In [None]:
# Inference Pipeline: Predict Product Price (Test Data)

import os, gc, joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

# Load Trained Model and Scalers
# NOTE(security): joblib.load unpickles the file, which can run arbitrary code.
# Only load bundles produced by the training cell or another trusted source.
model_bundle = joblib.load(r'/content/stacked_model_supervised_autoencoder.pkl')

# Unpack the per-modality scalers fitted during training.
scalers = model_bundle["scalers"]
scaler_struct, scaler_text, scaler_image = (
    scalers[modality] for modality in ("structured", "text", "image")
)

# Unpack the autoencoder weights and both levels of the stacking ensemble.
autoencoder_state, base_models, meta_model = (
    model_bundle[part] for part in ("autoencoder", "base_models", "meta_model")
)

print(" Loaded all models successfully!")

#  Define Autoencoder Class (same as used in training)
class SupervisedAutoencoder(nn.Module):
    """Autoencoder with an extra regression head on the bottleneck.

    The layer layout has to match the network that produced the saved
    ``state_dict``; otherwise ``load_state_dict`` rejects the weights.

    Args:
        input_dim: Width of the input feature vector.
        bottleneck_dim: Width of the compressed representation.
    """

    # The constructor must be spelled __init__ (double underscores). With
    # `_init_` Python never calls it, so no layers are registered and
    # SupervisedAutoencoder(input_dim=...) raises a TypeError.
    def __init__(self, input_dim=2048, bottleneck_dim=512):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, bottleneck_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, input_dim)
        )
        self.regressor = nn.Linear(bottleneck_dim, 1)

    def forward(self, x):
        """Return ``(reconstruction, prediction, bottleneck_code)`` for ``x``."""
        z = self.encoder(x)
        x_hat = self.decoder(z)
        y_pred = self.regressor(z)
        return x_hat, y_pred, z

# Recreate the model and load trained weights
# Rebuild the network with the dimensions used here (2048 in, 512 bottleneck),
# restore the saved weights, and switch to evaluation mode.
autoencoder_dims = {"input_dim": 2048, "bottleneck_dim": 512}
autoencoder = SupervisedAutoencoder(**autoencoder_dims)
autoencoder.load_state_dict(autoencoder_state)
autoencoder.eval()

print("Autoencoder loaded and ready!")

#  Load Test Dataset
test_path = 'test.csv'
if os.path.exists(test_path):
    df_test = pd.read_csv(test_path)
else:
    raise FileNotFoundError(" test.csv not found")
print(f" Loaded test.csv: {df_test.shape}")

#  Load Test Features
def load_or_simulate(path, shape):
    """Load a feature matrix from ``path`` or fabricate a random stand-in.

    Args:
        path: Location of a ``.npy`` file.
        shape: Shape of the random array to generate when the file is missing.

    Returns:
        A float32 array: the file contents if ``path`` exists, otherwise
        uniform random values in [0, 1) of the requested shape.
    """
    filename = os.path.basename(path)
    if not os.path.exists(path):
        # Placeholder data keeps the pipeline runnable; predictions made from
        # it are meaningless.
        print(f" {filename} not found — simulating features")
        return np.random.rand(*shape).astype(np.float32)

    features = np.load(path)
    print(f" Loaded {filename}: {features.shape}")
    return features.astype(np.float32)

# Load the three test-side feature matrices (one row per test sample). The
# widths are only used if a file is missing and a random placeholder is made.
X_structured_test = load_or_simulate(r'/content/drive/MyDrive/amazon/structured_features_test.npy', (len(df_test), 5))
X_text_test = load_or_simulate(r'/content/drive/MyDrive/amazon/test_embeddings.npy', (len(df_test), 384))
X_image_test = load_or_simulate(r'/content/Test_merged_image_features_75k.npy', (len(df_test), 2048))

#  Scale Test Features (using training scalers)
# transform() only: the statistics were learned on the training data.
X_structured_scaled = scaler_struct.transform(X_structured_test)
X_text_scaled = scaler_text.transform(X_text_test)
X_image_scaled = scaler_image.transform(X_image_test)

#  Encode Image Features via Autoencoder (bottleneck)
X_image_tensor = torch.from_numpy(X_image_scaled).float()
with torch.no_grad():
    # The third output of forward() is the bottleneck code.
    _, _, X_image_bottleneck = autoencoder(X_image_tensor)
    X_image_bottleneck = X_image_bottleneck.numpy().astype(np.float32)

print(f"Image bottleneck embeddings: {X_image_bottleneck.shape}")


#  Combine All Features
# Column order must match training: structured | text | image bottleneck.
X_test_full = np.hstack([X_structured_scaled, X_text_scaled, X_image_bottleneck]).astype(np.float32)
print(f" Final combined test features: {X_test_full.shape}")

# Predict Using Base Models → Generate Meta Features
# One column per base model, in the dict's iteration order.
meta_features_test = np.zeros((len(X_test_full), len(base_models)), dtype=np.float32)

for i, (name, model_) in enumerate(base_models.items()):
    print(f"→ Predicting with {name.upper()}...")
    preds_test = model_.predict(X_test_full)
    meta_features_test[:, i] = preds_test

print(f" Meta-level features created: {meta_features_test.shape}")


#  Predict Final Prices Using Meta Model
# The models predict log1p(price); invert with expm1 and clip at zero because
# a price cannot be negative.
meta_preds_test = meta_model.predict(meta_features_test)
final_price_preds = np.expm1(meta_preds_test)
final_price_preds = np.maximum(final_price_preds, 0)

# Save Output File
submission = pd.DataFrame({
    "sample_id": df_test["sample_id"],
    "price": final_price_preds
})

# Keep the file name in one place so the log line cannot drift from the file
# actually written (it previously reported "test_out.csv").
submission_path = "Super_test_out.csv"
submission.to_csv(submission_path, index=False)
print(f" {submission_path} saved successfully!")
print(submission.head())