In [1]:
import pandas as pd
import numpy as np
import heapq
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, silhouette_score
from sklearn.utils import resample
from scipy.sparse import issparse
import joblib
import warnings
warnings.filterwarnings('ignore')

# ====================== ENHANCED CUSTOM MODELS ======================

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=5, min_samples_leaf=2, max_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.tree_ = None
        self.feature_importances_ = None

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values
            
        n_features = X.shape[1]
        if self.max_features is None:
            self.max_features = n_features
        elif self.max_features == 'sqrt':
            self.max_features = int(np.sqrt(n_features))
        elif self.max_features == 'log2':
            self.max_features = int(np.log2(n_features))
            
        self.feature_importances_ = np.zeros(n_features)
        self.tree_ = self._build_tree(X, y, depth=0)
        self.feature_importances_ /= self.feature_importances_.sum()
    
    def _build_tree(self, X, y, depth):
        num_samples, n_features = X.shape
        
        # Enhanced stopping conditions
        if (self.max_depth is not None and depth >= self.max_depth) or \
           num_samples < self.min_samples_split or \
           num_samples < 2 * self.min_samples_leaf or \
           len(np.unique(y)) == 1 or \
           np.var(y) < 1e-7:
            return np.mean(y)

        # Feature sampling for better generalization
        if self.max_features < n_features:
            feature_indices = np.random.choice(n_features, self.max_features, replace=False)
        else:
            feature_indices = np.arange(n_features)

        best_split = self._find_best_split(X, y, feature_indices)
        if best_split is None:
            return np.mean(y)
        
        # Update feature importance
        self.feature_importances_[best_split['feature']] += best_split['importance']
        
        left_indices = X[:, best_split['feature']] <= best_split['value']
        right_indices = ~left_indices
        
        # Ensure minimum samples in each leaf
        if np.sum(left_indices) < self.min_samples_leaf or np.sum(right_indices) < self.min_samples_leaf:
            return np.mean(y)
        
        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        
        return {
            'feature': best_split['feature'],
            'value': best_split['value'],
            'left': left_tree,
            'right': right_tree
        }
    
    def _find_best_split(self, X, y, feature_indices):
        best_split = None
        best_score = float('inf')
        current_mse = np.var(y)

        for feature in feature_indices:
            unique_values = np.unique(X[:, feature])
            if len(unique_values) == 1:
                continue
                
            # Smarter split point selection
            if len(unique_values) > 20:
                # Use quantiles for continuous features
                split_points = np.percentile(unique_values, [10, 25, 50, 75, 90])
            else:
                # Use midpoints for discrete features
                split_points = [(unique_values[i] + unique_values[i+1]) / 2 
                               for i in range(len(unique_values)-1)]
            
            for value in split_points:
                left_indices = X[:, feature] <= value
                right_indices = ~left_indices
                
                if np.sum(left_indices) < self.min_samples_leaf or np.sum(right_indices) < self.min_samples_leaf:
                    continue
                
                left_y = y[left_indices]
                right_y = y[right_indices]
                
                # Weighted MSE
                left_weight = len(left_y) / len(y)
                right_weight = len(right_y) / len(y)
                weighted_mse = left_weight * np.var(left_y) + right_weight * np.var(right_y)
                
                if weighted_mse < best_score:
                    importance = current_mse - weighted_mse  # Information gain
                    best_split = {
                        'feature': feature, 
                        'value': value, 
                        'importance': importance
                    }
                    best_score = weighted_mse
        
        return best_split

    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
            
        return np.array([self._predict(sample, self.tree_) for sample in X])
    
    def _predict(self, sample, tree):
        if not isinstance(tree, dict):
            return tree
        
        if sample[tree['feature']] <= tree['value']:
            return self._predict(sample, tree['left'])
        return self._predict(sample, tree['right'])


class RandomForest:
    def __init__(self, n_estimators=150, max_depth=15, max_features='sqrt', 
                 min_samples_split=5, min_samples_leaf=2, bootstrap=True, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.trees = []
        self.feature_indices = []
        self.feature_importances_ = None
        
    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values
            
        if self.random_state is not None:
            np.random.seed(self.random_state)
            
        n_samples, n_features = X.shape
        
        # Better max_features calculation
        if self.max_features == 'sqrt':
            max_feats = max(1, int(np.sqrt(n_features)))
        elif self.max_features == 'log2':
            max_feats = max(1, int(np.log2(n_features)))
        elif isinstance(self.max_features, float):
            max_feats = max(1, int(self.max_features * n_features))
        else:
            max_feats = self.max_features or n_features
            
        feature_importance_sum = np.zeros(n_features)
        
        for i in range(self.n_estimators):
            # Bootstrap sampling
            if self.bootstrap:
                indices = np.random.choice(n_samples, n_samples, replace=True)
                X_sample, y_sample = X[indices], y[indices]
            else:
                X_sample, y_sample = X, y
                
            # Random feature selection
            feature_idx = np.random.choice(n_features, max_feats, replace=False)
            
            # Create and train tree
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=max_feats
            )
            
            X_sub = X_sample[:, feature_idx]
            tree.fit(X_sub, y_sample)
            
            self.trees.append(tree)
            self.feature_indices.append(feature_idx)
            
            # Accumulate feature importance
            if tree.feature_importances_ is not None:
                feature_importance_sum[feature_idx] += tree.feature_importances_
            
        # Normalize feature importance
        self.feature_importances_ = feature_importance_sum / self.n_estimators
        if self.feature_importances_.sum() > 0:
            self.feature_importances_ = self.feature_importances_ / self.feature_importances_.sum()
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
            
        all_preds = np.zeros((self.n_estimators, X.shape[0]))
        
        for i, (tree, feat_idx) in enumerate(zip(self.trees, self.feature_indices)):
            X_sub = X[:, feat_idx]
            all_preds[i] = tree.predict(X_sub)
            
        return np.mean(all_preds, axis=0)


class CustomKNN:
    def __init__(self, k=7, metric='hybrid', weights='distance'):
        self.k = k
        self.metric = metric
        self.weights = weights
        self.X_train = None
        self.y_train = None
        self.feature_weights = None
        
    def set_feature_weights(self, weights):
        """Set importance weights for different features"""
        self.feature_weights = np.array(weights) if weights is not None else None
    
    def _cosine_similarity(self, a, b, weights=None):
        if weights is not None:
            a = a * weights
            b = b * weights
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0
        return np.dot(a, b) / (norm_a * norm_b)
    
    def _euclidean_distance(self, a, b, weights=None):
        if weights is not None:
            a = a * weights
            b = b * weights
        return np.sqrt(np.sum((a - b) ** 2))
    
    def _manhattan_distance(self, a, b, weights=None):
        if weights is not None:
            a = a * weights
            b = b * weights
        return np.sum(np.abs(a - b))
    
    def _hybrid_similarity(self, a, b, weights=None):
        """Combines multiple similarity metrics for better accuracy"""
        cos_sim = self._cosine_similarity(a, b, weights)
        euc_dist = self._euclidean_distance(a, b, weights)
        # Convert euclidean to similarity
        euc_sim = 1 / (1 + euc_dist)
        
        # Weighted combination - cosine is better for high-dimensional sparse data
        return 0.7 * cos_sim + 0.3 * euc_sim
    
    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        self.X_train = X
        self.y_train = y.values if isinstance(y, pd.Series) else y
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
            
        predictions = []
        for sample in X:
            if self.metric == 'hybrid':
                similarities = [self._hybrid_similarity(sample, x, self.feature_weights) for x in self.X_train]
                neighbor_indices = np.argpartition(similarities, -self.k)[-self.k:]
                neighbor_similarities = [similarities[i] for i in neighbor_indices]
            elif self.metric == 'cosine':
                similarities = [self._cosine_similarity(sample, x, self.feature_weights) for x in self.X_train]
                neighbor_indices = np.argpartition(similarities, -self.k)[-self.k:]
                neighbor_similarities = [similarities[i] for i in neighbor_indices]
            else:  # euclidean or manhattan
                if self.metric == 'manhattan':
                    distances = [self._manhattan_distance(sample, x, self.feature_weights) for x in self.X_train]
                else:
                    distances = [self._euclidean_distance(sample, x, self.feature_weights) for x in self.X_train]
                neighbor_indices = np.argpartition(distances, self.k)[:self.k]
                neighbor_similarities = [1/(1+distances[i]) for i in neighbor_indices]  # Convert to similarities
            
            neighbor_values = self.y_train[neighbor_indices]
            
            # Apply distance weighting if requested
            if self.weights == 'distance':
                total_similarity = sum(neighbor_similarities)
                if total_similarity > 0:
                    weights_array = np.array(neighbor_similarities) / total_similarity
                    prediction = np.average(neighbor_values, weights=weights_array)
                else:
                    prediction = np.mean(neighbor_values)
            else:
                prediction = np.mean(neighbor_values)
                
            predictions.append(prediction)
            
        return np.array(predictions)
    
    def get_similar_laptops(self, X_input, df, top_n=5, price_range_factor=0.3):
        """Enhanced similarity search with price filtering and diversity"""
        if isinstance(X_input, pd.DataFrame):
            X_input = X_input.values
        if issparse(X_input):
            X_input = X_input.toarray()
            
        similarities = []
        for i, sample in enumerate(self.X_train):
            if self.metric == 'hybrid':
                sim = self._hybrid_similarity(X_input[0], sample, self.feature_weights)
            else:
                sim = self._cosine_similarity(X_input[0], sample, self.feature_weights)
            similarities.append((sim, i))
        
        # Sort by similarity (descending)
        similarities.sort(reverse=True)
        
        # Get diverse recommendations
        recommendations = []
        seen_companies = set()
        seen_types = set()
        
        for sim_score, idx in similarities:
            if len(recommendations) >= top_n:
                break
                
            laptop = df.iloc[idx].copy()
            company = laptop.get('Company', 'Unknown')
            type_name = laptop.get('TypeName', 'Unknown')
            
            # Diversity constraints
            company_count = sum(1 for r in recommendations if r.get('Company') == company)
            type_key = f"{company}-{type_name}"
            
            if company_count >= 2 or type_key in seen_types:
                continue
                
            seen_companies.add(company)
            seen_types.add(type_key)
            
            # Build comprehensive laptop info
            ram = laptop.get('Ram', 0)
            ssd = laptop.get('SSD', 0)
            hdd = laptop.get('HDD', 0)
            cpu = laptop.get('Cpu brand', 'Unknown')
            gpu = laptop.get('Gpu brand', 'Unknown')
            weight = laptop.get('Weight', 0)
            price = laptop.get('Price', 0)
            
            # Storage description
            storage_parts = []
            if ssd > 0:
                storage_parts.append(f"{int(ssd)}GB SSD")
            if hdd > 0:
                storage_parts.append(f"{int(hdd)}GB HDD")
            storage = " + ".join(storage_parts) if storage_parts else "Storage info unavailable"
            
            # Feature classification
            features = []
            if laptop.get('Touchscreen', 0):
                features.append('Touchscreen')
            if laptop.get('Ips', 0):
                features.append('IPS Display')
            
            # Performance classification
            if ram >= 16 and ssd >= 512:
                features.append('High Performance')
            elif ram >= 8 and ssd >= 256:
                features.append('Mid Performance')
            else:
                features.append('Basic Performance')
            
            laptop_info = {
                'Company': company,
                'TypeName': type_name,
                'Title': f"{company} {type_name}",
                'Ram': f"{int(ram)}GB",
                'Storage': storage,
                'Cpu_brand': cpu,
                'Gpu_brand': gpu,
                'Weight': f"{weight:.1f}kg" if weight > 0 else "Weight N/A",
                'Price': float(price),
                'Similarity': f"{sim_score:.3f}",
                'Features': ', '.join(features) if features else 'Standard Features',
                'Touchscreen': 'Yes' if laptop.get('Touchscreen', 0) else 'No',
                'Ips': 'Yes' if laptop.get('Ips', 0) else 'No',
                'os': laptop.get('os', 'Unknown OS')
            }
            
            recommendations.append(laptop_info)
        
        return recommendations


class CustomKMeans:
    def __init__(self, n_clusters=5, max_iters=300, random_state=None, init='k-means++'):
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        self.random_state = random_state
        self.init = init
        self.centroids = None
        self.labels_ = None
        self.inertia_ = None
        
        # Enhanced cluster names based on laptop characteristics
        self.cluster_names = {
            0: "Budget-Friendly Everyday Laptops",
            1: "Mid-Range Professional Laptops", 
            2: "Premium Business Workstations",
            3: "Gaming & High-Performance Systems",
            4: "Ultraportable & Designer Laptops"
        }
        
    def _kmeans_plus_plus_init(self, X):
        """K-means++ initialization for better cluster centroids"""
        n_samples, n_features = X.shape
        centroids = np.zeros((self.n_clusters, n_features))
        
        # Choose first centroid randomly
        centroids[0] = X[np.random.choice(n_samples)]
        
        for k in range(1, self.n_clusters):
            # Calculate distances to nearest centroid
            distances = np.array([min([np.sum((x - c) ** 2) for c in centroids[:k]]) for x in X])
            
            # Choose next centroid with probability proportional to squared distance
            probabilities = distances / distances.sum()
            cumulative_probabilities = probabilities.cumsum()
            r = np.random.random()
            
            for j, p in enumerate(cumulative_probabilities):
                if r < p:
                    centroids[k] = X[j]
                    break
                    
        return centroids
        
    def fit(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
            
        if self.random_state is not None:
            np.random.seed(self.random_state)
            
        n_samples, n_features = X.shape
        
        # Initialize centroids
        if self.init == 'k-means++':
            self.centroids = self._kmeans_plus_plus_init(X)
        else:
            idx = np.random.choice(n_samples, self.n_clusters, replace=False)
            self.centroids = X[idx]
        
        prev_inertia = float('inf')
        
        for iteration in range(self.max_iters):
            # Assign points to nearest centroid
            old_labels = self.labels_.copy() if self.labels_ is not None else np.zeros(n_samples)
            distances = self._calculate_distances(X)
            self.labels_ = np.argmin(distances, axis=1)
            
            # Calculate inertia
            self.inertia_ = np.sum([distances[i, self.labels_[i]] for i in range(n_samples)])
            
            # Check for convergence
            if np.all(old_labels == self.labels_) or abs(prev_inertia - self.inertia_) < 1e-6:
                break
                
            prev_inertia = self.inertia_
            
            # Update centroids
            new_centroids = np.zeros_like(self.centroids)
            for k in range(self.n_clusters):
                if np.sum(self.labels_ == k) > 0:
                    new_centroids[k] = np.mean(X[self.labels_ == k], axis=0)
                else:
                    # If cluster is empty, reinitialize
                    new_centroids[k] = X[np.random.choice(n_samples)]
            
            self.centroids = new_centroids
        
        return self
    
    def _calculate_distances(self, X):
        """Calculate distances from all points to all centroids"""
        distances = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            distances[:, k] = np.sum((X - self.centroids[k]) ** 2, axis=1)
        return distances
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        distances = self._calculate_distances(X)
        return np.argmin(distances, axis=1)
    
    def get_cluster_examples(self, cluster_id, df, X_all=None, top_n=5):
        """Get diverse and representative examples from the cluster"""
        try:
            if X_all is not None:
                cluster_labels = self.predict(X_all)
            else:
                cluster_labels = self.labels_
                
            cluster_mask = cluster_labels == cluster_id
            cluster_df = df[cluster_mask].copy()
            
            if len(cluster_df) == 0:
                return []
            
            # Enhanced diversity scoring with more factors
            cluster_df['diversity_score'] = (
                cluster_df['Ram'] * 0.25 +                    # Memory importance
                cluster_df.get('SSD', 0) * 0.0001 +           # SSD storage
                cluster_df.get('ppi', 100) * 0.008 +          # Display quality
                (cluster_df['Weight'] * -3) +                 # Lighter is better
                cluster_df.get('Touchscreen', 0) * 8 +        # Premium features
                cluster_df.get('Ips', 0) * 8 +                # Display quality
                np.random.normal(0, 2, len(cluster_df))       # Add variety
            )
            
            # Sort by diversity and select varied examples
            cluster_df_sorted = cluster_df.sort_values(['diversity_score', 'Price'], 
                                                      ascending=[False, True])
            
            examples = []
            seen_companies = set()
            
            for _, laptop in cluster_df_sorted.iterrows():
                if len(examples) >= top_n:
                    break
                    
                company = laptop.get('Company', 'Unknown')
                
                # Ensure brand diversity
                if len(seen_companies) < 3 or company not in seen_companies:
                    seen_companies.add(company)
                    
                    type_name = laptop.get('TypeName', 'Laptop')
                    ram = laptop.get('Ram', 0)
                    ssd = laptop.get('SSD', 0)
                    hdd = laptop.get('HDD', 0)
                    cpu = laptop.get('Cpu brand', 'Unknown')
                    gpu = laptop.get('Gpu brand', 'Unknown')
                    weight = laptop.get('Weight', 0)
                    price = laptop.get('Price', 0)
                    
                    # Build storage info
                    storage_parts = []
                    if ssd > 0:
                        storage_parts.append(f"{int(ssd)}GB SSD")
                    if hdd > 0:
                        storage_parts.append(f"{int(hdd)}GB HDD")
                    storage = " + ".join(storage_parts) if storage_parts else "No storage info"
                    
                    # Enhanced feature list
                    features = []
                    if laptop.get('Touchscreen', 0):
                        features.append('Touchscreen')
                    if laptop.get('Ips', 0):
                        features.append('IPS Display')
                    if ram >= 16:
                        features.append('High Memory')
                    if ssd >= 512:
                        features.append('Fast Storage')
                    if weight < 2.0:
                        features.append('Lightweight')
                        
                    features_text = ', '.join(features) if features else 'Standard Features'
                    
                    example = {
                        'Company': company,
                        'TypeName': type_name,
                        'Title': f"{company} {type_name}",
                        'Ram': f"{int(ram)}GB",
                        'Storage': storage,
                        'Cpu_brand': cpu,
                        'Gpu_brand': gpu,
                        'Weight': f"{weight:.1f}kg" if weight > 0 else "Weight N/A",
                        'Price': f"₹{price:,.2f}",  # Fixed currency symbol
                        'Features': features_text,
                        'Touchscreen': 'Yes' if laptop.get('Touchscreen', 0) else 'No',
                        'Ips': 'Yes' if laptop.get('Ips', 0) else 'No',
                        'os': laptop.get('os', 'Unknown OS')
                    }
                    
                    examples.append(example)
            
            return examples
            
        except Exception as e:
            print(f"Error getting cluster examples: {e}")
            return []


# ====================== ENHANCED DATA PIPELINE ======================

print("Loading and preprocessing data...")
df = pd.read_csv('laptop_data.csv')
df.drop(columns=["Unnamed: 0"], inplace=True)

# Data cleaning with better error handling
df["Ram"] = df["Ram"].str.replace("GB", "").astype("int")
df["Weight"] = df["Weight"].str.replace("kg", "").astype("float")

# Remove outliers
df = df[(df['Price'] > 1000) & (df['Price'] < 500000)]  # Reasonable price range
df = df[(df['Weight'] > 0.5) & (df['Weight'] < 5.0)]    # Reasonable weight range
df = df[(df['Ram'] >= 2) & (df['Ram'] <= 64)]           # Reasonable RAM range

# Feature engineering
df["Touchscreen"] = df["ScreenResolution"].apply(lambda x: 1 if "Touchscreen" in x else 0)
df["Ips"] = df["ScreenResolution"].apply(lambda x: 1 if "IPS" in x else 0)

# Enhanced resolution processing
temp = df["ScreenResolution"].str.split("x", n=1, expand=True)
df["X_res"] = temp[0].str.replace(',', '').str.findall(r'(\d+\.?\d+)').apply(lambda x: x[0] if x else 1920).astype(int)
df["Y_res"] = temp[1].fillna('1080').astype(int)
df['ppi'] = (((df['X_res']**2) + (df['Y_res']**2))**0.5/df['Inches']).astype('float')
df.drop(columns=["ScreenResolution", "X_res", "Y_res", "Inches"], inplace=True)

# Enhanced CPU processing
df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]))
def fetch_processor(text):
    if text in ['Intel Core i7', 'Intel Core i5', 'Intel Core i3']:
        return text
    elif 'Intel' in text:
        return 'Other Intel Processor'
    else:
        return 'AMD Processor'
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

# Enhanced Memory processing
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
df["Memory"] = df["Memory"].str.replace('GB', '').str.replace('TB', '000')
new = df["Memory"].str.split("+", n=1, expand=True)

# Better memory parsing
df["first"] = new[0].str.strip()
df["second"] = new[1].fillna("0").str.strip()

# Extract numbers and storage types
def extract_storage(storage_str):
    if pd.isna(storage_str) or storage_str == "0":
        return 0, 'None'
    
    # Extract number
    numbers = re.findall(r'\d+', str(storage_str))
    if not numbers:
        return 0, 'None'
    
    size = int(numbers[0])
    storage_type = 'HDD' if 'HDD' in str(storage_str) else ('SSD' if 'SSD' in str(storage_str) else 'Flash')
    return size, storage_type

import re
df[['first_size', 'first_type']] = df['first'].apply(lambda x: pd.Series(extract_storage(x)))
df[['second_size', 'second_type']] = df['second'].apply(lambda x: pd.Series(extract_storage(x)))

# Calculate HDD and SSD
df["HDD"] = 0
df["SSD"] = 0

# First storage device
df.loc[df['first_type'] == 'HDD', 'HDD'] += df['first_size']
df.loc[df['first_type'].isin(['SSD', 'Flash']), 'SSD'] += df['first_size']

# Second storage device
df.loc[df['second_type'] == 'HDD', 'HDD'] += df['second_size']
df.loc[df['second_type'].isin(['SSD', 'Flash']), 'SSD'] += df['second_size']

df.drop(columns=['first', 'second', 'first_size', 'first_type', 'second_size', 'second_type', 'Memory'], inplace=True)

# Enhanced GPU processing
df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])
df = df[df['Gpu brand'] != 'ARM']  # Remove ARM GPUs
df.drop(columns=['Gpu'], inplace=True)

# Enhanced OS processing
def cat_os(inp):
    if inp in ['Windows 10', 'Windows 7', 'Windows 10 S']:
        return 'Windows'
    elif inp in ['macOS', 'Mac OS X']:
        return 'Mac'
    else:
        return 'Others/No OS/Linux'
df['os'] = df['OpSys'].apply(cat_os)
df.drop(columns=['OpSys'], inplace=True)

# Additional feature engineering
df['price_per_ram'] = df['Price'] / df['Ram']
df['storage_total'] = df['HDD'] + df['SSD']
df['ssd_ratio'] = df['SSD'] / (df['SSD'] + df['HDD'] + 1)  # +1 to avoid division by zero
df['is_gaming'] = df['TypeName'].apply(lambda x: 1 if 'Gaming' in str(x) else 0)
df['is_ultrabook'] = df['TypeName'].apply(lambda x: 1 if 'Ultrabook' in str(x) else 0)

print(f"Dataset shape after preprocessing: {df.shape}")
print(f"Price range: ${df['Price'].min():.2f} - ${df['Price'].max():.2f}")

# Features and target
X = df.drop(columns=['Price'])
y = np.log(df['Price'])  # Log transformation for better distribution

# Enhanced preprocessing pipeline with robust scaling
cat_cols = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']
num_cols = ['Ram', 'Weight', 'Touchscreen', 'Ips', 'ppi', 'HDD', 'SSD', 
            'price_per_ram', 'storage_total', 'ssd_ratio', 'is_gaming', 'is_ultrabook']

preprocessor = ColumnTransformer([
    ('num', RobustScaler(), num_cols),  # RobustScaler is better for outliers
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
], remainder='drop')

# Train/validation/test split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.18, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples") 
print(f"Test set: {X_test.shape[0]} samples")

# Transform data
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)

if issparse(X_train_transformed):
    X_train_transformed = X_train_transformed.toarray()
if issparse(X_val_transformed):
    X_val_transformed = X_val_transformed.toarray()
if issparse(X_test_transformed):
    X_test_transformed = X_test_transformed.toarray()

print(f"Feature dimensions after preprocessing: {X_train_transformed.shape[1]}")

# ====================== ENHANCED MODEL TRAINING ======================

print("\nTraining enhanced Random Forest...")
rf_model = RandomForest(
    n_estimators=200, 
    max_depth=20, 
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
rf_model.fit(X_train_transformed, y_train)

# Evaluate on validation set
y_val_pred_rf = rf_model.predict(X_val_transformed)
val_mse_rf = mean_squared_error(y_val, y_val_pred_rf)
val_mae_rf = mean_absolute_error(y_val, y_val_pred_rf)
val_r2_rf = r2_score(y_val, y_val_pred_rf)

# Evaluate on test set
y_test_pred_rf = rf_model.predict(X_test_transformed)
test_mse_rf = mean_squared_error(y_test, y_test_pred_rf)
test_mae_rf = mean_absolute_error(y_test, y_test_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)

print("\nRandom Forest Performance:")
print(f"Validation - MSE: {val_mse_rf:.4f}, MAE: {val_mae_rf:.4f}, R²: {val_r2_rf:.4f}")
print(f"Test - MSE: {test_mse_rf:.4f}, MAE: {test_mae_rf:.4f}, R²: {test_r2_rf:.4f}")

# Feature importance
if hasattr(rf_model, 'feature_importances_') and rf_model.feature_importances_ is not None:
    feature_names = num_cols + list(preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols))
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10))

print("\nTraining enhanced KNN...")
knn_model = CustomKNN(k=7, metric='hybrid', weights='distance')

# Set feature weights based on RF importance
if hasattr(rf_model, 'feature_importances_') and rf_model.feature_importances_ is not None:
    # Normalize importances and boost important features
    weights = rf_model.feature_importances_.copy()
    weights = weights / weights.max()  # Normalize to [0, 1]
    weights = weights * 2 + 0.5  # Scale to [0.5, 2.5] range
    knn_model.set_feature_weights(weights)
    print("Feature weights set based on Random Forest importance")

knn_model.fit(X_train_transformed, y_train)

# Evaluate KNN
y_val_pred_knn = knn_model.predict(X_val_transformed)
val_mse_knn = mean_squared_error(y_val, y_val_pred_knn)
val_mae_knn = mean_absolute_error(y_val, y_val_pred_knn)
val_r2_knn = r2_score(y_val, y_val_pred_knn)

y_test_pred_knn = knn_model.predict(X_test_transformed)
test_mse_knn = mean_squared_error(y_test, y_test_pred_knn)
test_mae_knn = mean_absolute_error(y_test, y_test_pred_knn)
test_r2_knn = r2_score(y_test, y_test_pred_knn)

print("\nKNN Performance:")
print(f"Validation - MSE: {val_mse_knn:.4f}, MAE: {val_mae_knn:.4f}, R²: {val_r2_knn:.4f}")
print(f"Test - MSE: {test_mse_knn:.4f}, MAE: {test_mae_knn:.4f}, R²: {test_r2_knn:.4f}")

print("\nTraining enhanced K-Means...")
kmeans_model = CustomKMeans(n_clusters=5, max_iters=300, random_state=42, init='k-means++')
kmeans_model.fit(X_train_transformed)

# Evaluate K-Means
train_silhouette = silhouette_score(X_train_transformed, kmeans_model.labels_)
print(f"\nK-Means Performance:")
print(f"Silhouette Score: {train_silhouette:.4f}")
print(f"Inertia: {kmeans_model.inertia_:.2f}")

# Analyze clusters
print("\nCluster Analysis:")
train_cluster_labels = kmeans_model.predict(X_train_transformed)
for i in range(5):
    cluster_mask = train_cluster_labels == i
    cluster_data = X_train[cluster_mask]
    avg_price = np.exp(y_train[cluster_mask]).mean()  # Convert back from log
    avg_ram = cluster_data['Ram'].mean()
    avg_weight = cluster_data['Weight'].mean()
    common_type = cluster_data['TypeName'].mode().iloc[0] if len(cluster_data) > 0 else 'Unknown'
    
    print(f"Cluster {i} ({kmeans_model.cluster_names.get(i, f'Cluster {i}')}): "
          f"{cluster_mask.sum()} laptops, "
          f"Avg Price: ${avg_price:.0f}, "
          f"Avg RAM: {avg_ram:.1f}GB, "
          f"Avg Weight: {avg_weight:.1f}kg, "
          f"Common Type: {common_type}")

# ====================== MODEL ENSEMBLE (BONUS) ======================

print("\nCreating ensemble predictions...")
# Combine RF and KNN predictions with weights
ensemble_val_pred = 0.7 * y_val_pred_rf + 0.3 * y_val_pred_knn
ensemble_test_pred = 0.7 * y_test_pred_rf + 0.3 * y_test_pred_knn

ensemble_val_r2 = r2_score(y_val, ensemble_val_pred)
ensemble_test_r2 = r2_score(y_test, ensemble_test_pred)
ensemble_val_mae = mean_absolute_error(y_val, ensemble_val_pred)
ensemble_test_mae = mean_absolute_error(y_test, ensemble_test_pred)

print(f"Ensemble Validation - R²: {ensemble_val_r2:.4f}, MAE: {ensemble_val_mae:.4f}")
print(f"Ensemble Test - R²: {ensemble_test_r2:.4f}, MAE: {ensemble_test_mae:.4f}")

# ====================== ENHANCED MODEL SAVING ======================

print("\nSaving enhanced models...")

# Add metadata for better tracking
model_metadata = {
    'training_date': pd.Timestamp.now().isoformat(),
    'dataset_size': len(df),
    'feature_count': X_train_transformed.shape[1],
    'rf_performance': {
        'test_r2': test_r2_rf,
        'test_mae': test_mae_rf,
        'val_r2': val_r2_rf
    },
    'knn_performance': {
        'test_r2': test_r2_knn,
        'test_mae': test_mae_knn,
        'val_r2': val_r2_knn
    },
    'ensemble_performance': {
        'test_r2': ensemble_test_r2,
        'test_mae': ensemble_test_mae,
        'val_r2': ensemble_val_r2
    },
    'kmeans_performance': {
        'silhouette_score': train_silhouette,
        'inertia': kmeans_model.inertia_
    }
}

joblib.dump({
    'df': df,
    'preprocessor': preprocessor,
    'random_forest': rf_model,
    'knn': knn_model,
    'kmeans': kmeans_model,
    'metadata': model_metadata,
    'feature_names': num_cols + list(preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols))
}, 'laptop_models_full_custom.pkl')

print("✅ Enhanced models saved successfully!")
print(f"📊 Final Model Performance Summary:")
print(f"   Random Forest R²: {test_r2_rf:.4f}")
print(f"   KNN R²: {test_r2_knn:.4f}")
print(f"   Ensemble R²: {ensemble_test_r2:.4f}")
print(f"   K-Means Silhouette: {train_silhouette:.4f}")

# ====================== SAVE FEATURE WEIGHTS FOR APP ======================

print("\nGenerating feature configuration for app...")
if hasattr(rf_model, 'feature_importances_') and rf_model.feature_importances_ is not None:
    # Create feature weight mapping for app.py
    feature_config = {
        'weights': rf_model.feature_importances_.tolist(),
        'feature_count': len(rf_model.feature_importances_),
        'top_features': importance_df.head(10).to_dict('records')
    }
    
    with open('feature_config.json', 'w') as f:
        import json
        json.dump(feature_config, f, indent=2)
    
    print("💾 Feature configuration saved to feature_config.json")

print("\n🎉 Training completed successfully!")
print("📁 Files generated:")
print("   - laptop_models_full_custom.pkl (main model file)")
print("   - feature_config.json (feature weights for app)")
print("\nYou can now use these models in your Flask app with improved accuracy!")

Loading and preprocessing data...
Dataset shape after preprocessing: (1302, 18)
Price range: $9270.72 - $324954.72
Training set: 906 samples
Validation set: 200 samples
Test set: 196 samples
Feature dimensions after preprocessing: 48

Training enhanced Random Forest...

Random Forest Performance:
Validation - MSE: 0.1076, MAE: 0.2583, R²: 0.7178
Test - MSE: 0.1125, MAE: 0.2676, R²: 0.6927

Top 10 Most Important Features:
                    feature  importance
7             price_per_ram    0.082344
9                 ssd_ratio    0.072143
4                       ppi    0.070447
1                    Weight    0.066750
0                       Ram    0.058035
6                       SSD    0.056005
8             storage_total    0.039717
34        TypeName_Notebook    0.032712
40  Cpu brand_Intel Core i7    0.031299
39  Cpu brand_Intel Core i5    0.030330

Training enhanced KNN...
Feature weights set based on Random Forest importance

KNN Performance:
Validation - MSE: 0.0281, MAE: 0.1153

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, silhouette_score
from sklearn.ensemble import RandomForestRegressor  # Fallback
from sklearn.utils import resample
from scipy.sparse import issparse
from sklearn.decomposition import PCA
import joblib
import logging

# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ====================== CUSTOM MODELS ======================

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=10, min_impurity_decrease=0.001):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_impurity_decrease = min_impurity_decrease
        self.tree_ = None
        self.feature_importances_ = None

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values
        self.feature_importances_ = np.zeros(X.shape[1])
        self.tree_ = self._build_tree(X, y, depth=0)
        self.feature_importances_ /= np.sum(self.feature_importances_) + 1e-10
        return self
    
    def _build_tree(self, X, y, depth):
        num_samples = X.shape[0]
        
        if (self.max_depth is not None and depth >= self.max_depth) or \
           num_samples < self.min_samples_split or \
           len(np.unique(y)) == 1:
            return {'value': np.mean(y), 'size': num_samples}

        best_split = self._find_best_split(X, y)
        if best_split is None or best_split['gain'] < self.min_impurity_decrease:
            return {'value': np.mean(y), 'size': num_samples}
        
        left_indices = X[:, best_split['feature']] <= best_split['value']
        right_indices = ~left_indices
        
        if np.sum(left_indices) < 2 or np.sum(right_indices) < 2:
            return {'value': np.mean(y), 'size': num_samples}
        
        self.feature_importances_[best_split['feature']] += best_split['gain'] * num_samples
        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        
        return {
            'feature': best_split['feature'],
            'value': best_split['value'],
            'left': left_tree,
            'right': right_tree,
            'size': num_samples
        }
    
    def _find_best_split(self, X, y):
        best_split = None
        best_mse = float('inf')
        best_gain = 0
        num_features = X.shape[1]
        total_var = np.var(y) * len(y)

        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            split_points = np.percentile(unique_values, [25, 50, 75]) if len(unique_values) > 10 else unique_values
            
            for value in split_points:
                left_indices = X[:, feature] <= value
                right_indices = ~left_indices
                
                if np.sum(left_indices) < 2 or np.sum(right_indices) < 2:
                    continue
                
                left_y = y[left_indices]
                right_y = y[right_indices]
                
                mse = (np.var(left_y) * len(left_y) + np.var(right_y) * len(right_y)) / len(y)
                gain = total_var - mse
                
                if mse < best_mse:
                    best_split = {'feature': feature, 'value': value, 'gain': gain}
                    best_mse = mse
        
        return best_split

    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        return np.array([self._predict(sample, self.tree_) for sample in X])
    
    def _predict(self, sample, tree):
        if 'value' in tree:
            return tree['value']
        if sample[tree['feature']] <= tree['value']:
            return self._predict(sample, tree['left'])
        return self._predict(sample, tree['right'])


class RandomForest:
    def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []
        self.feature_importances_ = None

    def fit(self, X, y):
        logging.info("Training Random Forest...")
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values
            
        n_features = X.shape[1]
        self.feature_importances_ = np.zeros(n_features)
        max_feats = int(np.sqrt(n_features)) if self.max_features == 'sqrt' else self.max_features
        
        for _ in range(self.n_estimators):
            X_sample, y_sample = resample(X, y)
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=10, min_impurity_decrease=0.001)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
            self.feature_importances_ += tree.feature_importances_
        
        self.feature_importances_ /= self.n_estimators + 1e-10
        return self
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
            
        all_preds = np.zeros((self.n_estimators, X.shape[0]))
        for i, tree in enumerate(self.trees):
            all_preds[i] = tree.predict(X)
        predictions = np.mean(all_preds, axis=0)
        logging.info(f"Random Forest predictions sample: {predictions[:5]}")
        return predictions


class CustomKNN:
    def __init__(self, k=5, metric='cosine'):
        self.k = k
        self.metric = metric
        self.X_train = None
        self.y_train = None
        
    def _cosine_similarity(self, a, b):
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0
        return np.dot(a, b) / (norm_a * norm_b)
    
    def _euclidean_distance(self, a, b):
        return np.sqrt(np.sum((a - b) ** 2))
    
    def fit(self, X, y):
        logging.info("Training KNN...")
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        self.X_train = X
        self.y_train = y.values if isinstance(y, pd.Series) else y
        return self
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
            
        predictions = []
        for sample in X:
            if self.metric == 'cosine':
                distances = np.array([self._cosine_similarity(sample, x) for x in self.X_train])
                neighbors = np.argpartition(distances, -self.k)[-self.k:]
                weights = distances[neighbors]
            else:
                distances = np.array([self._euclidean_distance(sample, x) for x in self.X_train])
                neighbors = np.argpartition(distances, self.k)[:self.k]
                weights = 1 / (distances[neighbors] + 1e-10)
            
            prediction = np.average(self.y_train[neighbors], weights=weights)
            predictions.append(prediction)
            
        return np.array(predictions)
    
    def get_similar_laptops(self, X_input, df, top_n=5):
        logging.info("Generating laptop recommendations...")
        if isinstance(X_input, pd.DataFrame):
            X_input = X_input.values
        if issparse(X_input):
            X_input = X_input.toarray()
            
        similarities = []
        for i, sample in enumerate(self.X_train):
            sim = self._cosine_similarity(X_input[0], sample)
            similarities.append((sim, i))
        
        similarities.sort(reverse=True)
        top_indices = [idx for _, idx in similarities[:top_n]]
        
        recommendations = []
        for i, idx in enumerate(top_indices):
            laptop = df.iloc[idx].copy()
            
            company = laptop.get('Company', 'Unknown')
            type_name = laptop.get('TypeName', 'Laptop')
            ram = laptop.get('Ram', 0)
            ssd = laptop.get('SSD', 0)
            hdd = laptop.get('HDD', 0)
            cpu = laptop.get('Cpu brand', 'Unknown')
            gpu = laptop.get('Gpu brand', 'Unknown')
            weight = laptop.get('Weight', 0)
            price = laptop.get('Price', 0)
            
            storage_parts = []
            if ssd > 0:
                storage_parts.append(f"{ssd}GB SSD")
            if hdd > 0:
                storage_parts.append(f"{hdd}GB HDD")
            storage = " + ".join(storage_parts) if storage_parts else "Storage info unavailable"
            
            laptop_info = {
                'Company': company,
                'TypeName': type_name,
                'Title': f"{company} {type_name}",
                'Ram': f"{ram}GB",
                'Storage': storage,
                'Cpu_brand': cpu,
                'Gpu_brand': gpu,
                'Weight': f"{weight:.1f}kg" if weight > 0 else "Weight N/A",
                'Price': f"RS {price:,.2f}",
                'Similarity': f"{similarities[i][0]:.2%}"
            }
            
            if laptop.get('Touchscreen', 0):
                laptop_info['Features'] = ['Touchscreen']
            if laptop.get('Ips', 0):
                laptop_info['Features'] = laptop_info.get('Features', []) + ['IPS Display']
            
            laptop_info['Features'] = ', '.join(laptop_info.get('Features', [])) or 'Standard Features'
            recommendations.append(laptop_info)
        
        return recommendations


class CustomKMeans:
    def __init__(self, n_clusters=5, max_iters=100, random_state=None, init='k-means++'):
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        self.random_state = random_state
        self.init = init
        self.centroids = None
        self.labels_ = None
        self.cluster_names = {
            0: "Budget-Friendly Laptops",
            1: "Mid-Range Performance",
            2: "Premium Workstations",
            3: "Gaming & High-Performance",
            4: "Ultraportable & Business"
        }
        
    def fit(self, X):
        logging.info(f"Training K-Means with {self.n_clusters} clusters...")
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
            
        if self.random_state is not None:
            np.random.seed(self.random_state)
            
        n_samples, n_features = X.shape
        if self.init == 'k-means++':
            self.centroids = self._kmeans_plus_plus(X)
        else:
            idx = np.random.choice(n_samples, self.n_clusters, replace=False)
            self.centroids = X[idx]
        
        for _ in range(self.max_iters):
            old_labels = self.labels_ if self.labels_ is not None else np.zeros(n_samples)
            self.labels_ = self._assign_clusters(X)
            
            if np.all(old_labels == self.labels_):
                break
                
            for k in range(self.n_clusters):
                if np.sum(self.labels_ == k) > 0:
                    self.centroids[k] = np.mean(X[self.labels_ == k], axis=0)
                else:
                    self.centroids[k] = X[np.random.choice(n_samples)]
        
        return self
    
    def _kmeans_plus_plus(self, X):
        n_samples = X.shape[0]
        centroids = [X[np.random.choice(n_samples)]]
        
        for _ in range(1, self.n_clusters):
            distances = np.array([min([np.sum((x - c) ** 2) for c in centroids]) for x in X])
            probs = distances / (distances.sum() + 1e-10)
            cumprobs = probs.cumsum()
            r = np.random.random()
            for j, p in enumerate(cumprobs):
                if r < p:
                    centroids.append(X[j])
                    break
        
        return np.array(centroids)
    
    def _assign_clusters(self, X):
        distances = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            distances[:, k] = np.sqrt(np.sum((X - self.centroids[k]) ** 2, axis=1))
        return np.argmin(distances, axis=1)
    
    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        return self._assign_clusters(X)
    
    def get_cluster_examples(self, cluster_id, df, X_all, top_n=5):
        logging.info(f"Generating examples for cluster {cluster_id}...")
        try:
            cluster_labels = self.predict(X_all)
            cluster_mask = cluster_labels == cluster_id
            cluster_df = df[cluster_mask].copy()
            
            if len(cluster_df) == 0:
                return []
            
            cluster_df['diversity_score'] = (
                cluster_df['Ram'] * 0.5 +
                cluster_df['SSD'] * 0.001 +
                cluster_df['ppi'] * 0.02 +
                (cluster_df['Weight'] * -1) +
                cluster_df['Touchscreen'] * 5 +
                cluster_df['Ips'] * 5
            )
            
            cluster_df_sorted = cluster_df.sort_values(['diversity_score', 'Price'], 
                                                      ascending=[False, True])
            
            examples = []
            for _, laptop in cluster_df_sorted.head(top_n).iterrows():
                company = laptop.get('Company', 'Unknown')
                type_name = laptop.get('TypeName', 'Laptop')
                ram = laptop.get('Ram', 0)
                ssd = laptop.get('SSD', 0)
                hdd = laptop.get('HDD', 0)
                cpu = laptop.get('Cpu brand', 'Unknown')
                gpu = laptop.get('Gpu brand', 'Unknown')
                weight = laptop.get('Weight', 0)
                price = laptop.get('Price', 0)
                
                storage_parts = []
                if ssd > 0:
                    storage_parts.append(f"{ssd}GB SSD")
                if hdd > 0:
                    storage_parts.append(f"{hdd}GB HDD")
                storage = " + ".join(storage_parts) if storage_parts else "No storage info"
                
                features = []
                if laptop.get('Touchscreen', 0):
                    features.append('Touchscreen')
                if laptop.get('Ips', 0):
                    features.append('IPS Display')
                features_text = ', '.join(features) if features else 'Standard Features'
                
                example = {
                    'Company': company,
                    'TypeName': type_name,
                    'Title': f"{company} {type_name}",
                    'Ram': f"{ram}GB",
                    'Storage': storage,
                    'Cpu_brand': cpu,
                    'Gpu_brand': gpu,
                    'Weight': f"{weight:.1f}kg" if weight > 0 else "Weight N/A",
                    'Price': f"RS {price:,.2f}",
                    'Features': features_text,
                    'Touchscreen': 'Yes' if laptop.get('Touchscreen', 0) else 'No',
                    'Ips': 'Yes' if laptop.get('Ips', 0) else 'No',
                    'os': laptop.get('os', 'Unknown OS')
                }
                examples.append(example)
            
            return examples
            
        except Exception as e:
            logging.error(f"Error getting cluster examples: {e}")
            return []

# ====================== DATA PIPELINE ======================

logging.info("Loading and preprocessing data...")
try:
    df = pd.read_csv('laptop_data.csv')
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=["Unnamed: 0"], inplace=True)
except FileNotFoundError:
    logging.error("laptop_data.csv not found!")
    raise

# Data cleaning
df["Ram"] = df["Ram"].str.replace("GB", "", regex=False).astype("int")
df["Weight"] = df["Weight"].str.replace("kg", "", regex=False).str.strip()
df["Weight"] = pd.to_numeric(df["Weight"], errors='coerce').fillna(df["Weight"].mode()[0]).astype("float")

# Handle outliers in Price and Weight
df['Price'] = df['Price'].clip(upper=df['Price'].quantile(0.99))
df['Weight'] = df['Weight'].clip(upper=df['Weight'].quantile(0.99))

# Feature engineering
df["Touchscreen"] = df["ScreenResolution"].apply(lambda x: 1 if "Touchscreen" in str(x) else 0)
df["Ips"] = df["ScreenResolution"].apply(lambda x: 1 if "IPS" in str(x) else 0)

# Process resolution
try:
    temp = df["ScreenResolution"].str.split("x", n=1, expand=True)
    df["X_res"] = temp[0].str.replace(',', '').str.findall(r'(\d+\.?\d+)').apply(lambda x: x[0] if x else '1920').astype(int)
    df["Y_res"] = temp[1].astype(int)
    df['ppi'] = (((df['X_res']**2) + (df['Y_res']**2))**0.5/df['Inches']).astype('float')
    df.drop(columns=["ScreenResolution", "X_res", "Y_res", "Inches"], inplace=True)
except Exception as e:
    logging.error(f"Error processing resolution: {e}")
    raise

# Process CPU
df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]))
def fetch_processor(text):
    if text in ['Intel Core i7', 'Intel Core i5', 'Intel Core i3']:
        return text
    elif text.split()[0] == 'Intel':
        return 'Other Intel Processor'
    else:
        return 'AMD Processor'
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

# Process Memory
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
df["Memory"] = df["Memory"].str.replace('GB', '', regex=False).str.replace('TB', '000', regex=False)
new = df["Memory"].str.split("+", n=1, expand=True)
df["first"] = new[0].str.strip().str.replace(r'\D', '', regex=True).astype(int)
df["second"] = new[1].fillna("0").str.replace(r'\D', '', regex=True).astype(int)
df["HDD"] = (df["first"] * df["first"].apply(lambda x: 1 if "HDD" in str(x) else 0)) + \
            (df["second"] * df["second"].apply(lambda x: 1 if "HDD" in str(x) else 0))
df["SSD"] = (df["first"] * df["first"].apply(lambda x: 1 if "SSD" in str(x) else 0)) + \
            (df["second"] * df["second"].apply(lambda x: 1 if "SSD" in str(x) else 0))
df.drop(columns=['first', 'second', 'Memory'], inplace=True)

# Process GPU
df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])
df = df[df['Gpu brand'] != 'ARM']
df.drop(columns=['Gpu'], inplace=True)

# Process OS
def cat_os(inp):
    if inp in ['Windows 10', 'Windows 7', 'Windows 10 S']:
        return 'Windows'
    elif inp in ['macOS', 'Mac OS X']:
        return 'Mac'
    else:
        return 'Others/No OS/Linux'
df['os'] = df['OpSys'].apply(cat_os)
df.drop(columns=['OpSys'], inplace=True)

# Features and target
X = df.drop(columns=['Price'])
y = np.log(df['Price'])  # Log-transform Price
logging.info(f"Target (y) range: min={y.min():.4f}, max={y.max():.4f}, mean={y.mean():.4f}")

# Preprocessing pipeline for Random Forest and KNN
cat_cols = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']
num_cols = ['Ram', 'Weight', 'Touchscreen', 'Ips', 'ppi', 'HDD', 'SSD']
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

# Preprocessing pipeline for K-Means (numerical features only)
kmeans_preprocessor = StandardScaler()
X_num = df[num_cols]
X_num_transformed = kmeans_preprocessor.fit_transform(X_num)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_train_num = kmeans_preprocessor.transform(X_train[num_cols])
X_test_num = kmeans_preprocessor.transform(X_test[num_cols])

# Apply PCA for K-Means
pca = PCA(n_components=5)  # Fixed number of components
X_train_pca = pca.fit_transform(X_train_num)
X_test_pca = pca.transform(X_test_num)
logging.info(f"PCA explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

# ====================== MODEL TRAINING ======================

# Tune Random Forest
logging.info("Tuning Random Forest...")
best_rf = None
best_r2_rf = -float('inf')
rf_params = [(n, d, s) for n in [100, 200] for d in [10, 20, None] for s in [10, 20]]
for n_estimators, max_depth, min_samples_split in rf_params:
    rf = RandomForest(n_estimators=n_estimators, max_depth=max_depth, max_features='sqrt')
    rf.fit(X_train_transformed, y_train)
    y_pred = rf.predict(X_test_transformed)
    r2 = r2_score(y_test, y_pred)
    logging.info(f"R² for n_estimators={n_estimators}, max_depth={max_depth}, min_samples_split={min_samples_split}: {r2:.4f}")
    if r2 > best_r2_rf:
        best_r2_rf = r2
        best_rf = rf

rf_model = best_rf
y_pred_rf = rf_model.predict(X_test_transformed)
logging.info(f"Random Forest predictions range: min={y_pred_rf.min():.4f}, max={y_pred_rf.max():.4f}, mean={y_pred_rf.mean():.4f}")
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

logging.info("\nRandom Forest Performance:")
logging.info(f"MSE: {mse_rf:.4f}")
logging.info(f"MAE: {mae_rf:.4f}")
logging.info(f"R² Score: {r2_rf:.4f}")

# Fallback: Train sklearn RandomForestRegressor
logging.info("\nTraining sklearn RandomForestRegressor for comparison...")
sklearn_rf = RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_split=10, random_state=42)
sklearn_rf.fit(X_train_transformed, y_train)
y_pred_sklearn_rf = sklearn_rf.predict(X_test_transformed)
mse_sklearn_rf = mean_squared_error(y_test, y_pred_sklearn_rf)
mae_sklearn_rf = mean_absolute_error(y_test, y_pred_sklearn_rf)
r2_sklearn_rf = r2_score(y_test, y_pred_sklearn_rf)

logging.info("\nSklearn RandomForestRegressor Performance:")
logging.info(f"MSE: {mse_sklearn_rf:.4f}")
logging.info(f"MAE: {mae_sklearn_rf:.4f}")
logging.info(f"R² Score: {r2_sklearn_rf:.4f}")

# Train KNN
logging.info("\nTraining KNN...")
knn_model = CustomKNN(k=5, metric='cosine')
knn_model.fit(X_train_transformed, y_train)
y_pred_knn = knn_model.predict(X_test_transformed)
mse_knn = mean_squared_error(y_test, y_pred_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

logging.info("\nKNN Performance:")
logging.info(f"MSE: {mse_knn:.4f}")
logging.info(f"MAE: {mae_knn:.4f}")
logging.info(f"R² Score: {r2_knn:.4f}")

# Tune K-Means with Elbow Method
logging.info("\nTuning K-Means...")
best_kmeans = None
best_silhouette = -float('inf')
inertia = []
for k in range(2, 10):
    kmeans = CustomKMeans(n_clusters=k, max_iters=100, random_state=42, init='k-means++')
    kmeans.fit(X_train_pca)
    score = silhouette_score(X_train_pca, kmeans.labels_)
    inertia.append(np.sum([np.sum((X_train_pca[kmeans.labels_ == i] - kmeans.centroids[i])**2) 
                           for i in range(k)]))
    logging.info(f"Silhouette Score for k={k}: {score:.4f}")
    if score > best_silhouette:
        best_silhouette = score
        best_kmeans = kmeans

kmeans_model = best_kmeans
silhouette = silhouette_score(X_train_pca, kmeans_model.labels_)
logging.info("\nK-Means Performance:")
logging.info(f"Best Number of Clusters: {kmeans_model.n_clusters}")
logging.info(f"Silhouette Score: {silhouette:.4f}")
logging.info(f"Inertia for k={kmeans_model.n_clusters}: {inertia[kmeans_model.n_clusters-2]:.4f}")

# ====================== SAVE MODELS ======================

logging.info("\nSaving models...")
joblib.dump({
    'df': df,
    'preprocessor': preprocessor,
    'kmeans_preprocessor': kmeans_preprocessor,
    'pca': pca,
    'random_forest': rf_model,
    'sklearn_random_forest': sklearn_rf,
    'knn': knn_model,
    'kmeans': kmeans_model
}, 'laptop_models_improved.pkl')

logging.info("Saved successfully to laptop_models_improved.pkl ✅")

2025-09-22 00:38:16,426 - INFO - Loading and preprocessing data...
2025-09-22 00:38:16,476 - INFO - Target (y) range: min=9.1346, max=12.0304, mean=10.8126
2025-09-22 00:38:16,502 - INFO - PCA explained variance ratio: 1.0000
2025-09-22 00:38:16,503 - INFO - Tuning Random Forest...
2025-09-22 00:38:16,517 - INFO - Training Random Forest...
2025-09-22 00:38:47,487 - INFO - Random Forest predictions sample: [-0.47391216 -0.47391216 -0.47391216 -0.47391216 -0.47391216]
2025-09-22 00:38:47,487 - INFO - R² for n_estimators=100, max_depth=10, min_samples_split=10: -336.5811
2025-09-22 00:38:47,487 - INFO - Training Random Forest...
