# In [21]:
import heapq

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample

# ====================== CUSTOM MODELS ======================

class DecisionTree:
    """Regression tree grown by greedily minimising weighted child variance.

    After ``fit`` the tree is stored in ``tree_`` as nested dicts with the keys
    ``'feature'``, ``'value'``, ``'left'`` and ``'right'``. A leaf is the plain
    mean of the training targets that reached it.

    Parameters:
    - max_depth: Maximum depth of the tree; ``None`` means unlimited.
    - min_samples_split: Nodes with fewer samples than this become leaves.
    """

    # Smallest number of samples allowed on either side of a candidate split.
    _MIN_CHILD_SAMPLES = 2

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree_ = None

    @staticmethod
    def _to_dense(X):
        """Return ``X`` (DataFrame, scipy sparse matrix or array-like) as an ndarray."""
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        return np.asarray(X)

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples x n_features) and targets ``y``.

        Returns ``self`` so calls can be chained.
        Raises ``ValueError`` for empty input, non-2-D ``X`` or mismatched lengths.
        """
        # fit() now accepts the same input kinds as predict(), sparse included.
        X = self._to_dense(X)
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.tree_ = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples in ``X``/``y``."""
        num_samples = X.shape[0]

        # Stopping conditions: depth limit, too few samples, or a pure node.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or num_samples < self.min_samples_split or len(np.unique(y)) == 1:
            return np.mean(y)

        best_split = self._find_best_split(X, y)
        if best_split is None:
            return np.mean(y)

        left_indices = X[:, best_split['feature']] <= best_split['value']
        right_indices = ~left_indices

        # Defensive: never create an empty child.
        if not left_indices.any() or not right_indices.any():
            return np.mean(y)

        return {
            'feature': best_split['feature'],
            'value': best_split['value'],
            'left': self._build_tree(X[left_indices], y[left_indices], depth + 1),
            'right': self._build_tree(X[right_indices], y[right_indices], depth + 1),
        }

    def _find_best_split(self, X, y):
        """Return ``{'feature', 'value'}`` of the lowest-variance split, or ``None``."""
        best_split = None
        best_mse = float('inf')
        total = len(y)

        for feature in range(X.shape[1]):
            column = X[:, feature]
            unique_values = np.unique(column)
            # Features with many distinct values are only tried at their
            # quartiles to keep the search cheap; others are tried exhaustively.
            if len(unique_values) > 10:
                split_points = np.percentile(unique_values, [25, 50, 75])
            else:
                split_points = unique_values

            for value in split_points:
                left_indices = column <= value
                n_left = int(np.count_nonzero(left_indices))
                n_right = total - n_left
                if n_left < self._MIN_CHILD_SAMPLES or n_right < self._MIN_CHILD_SAMPLES:
                    continue

                left_y = y[left_indices]
                right_y = y[~left_indices]

                # Size-weighted average of the two child variances.
                mse = (np.var(left_y) * n_left + np.var(right_y) * n_right) / total

                if mse < best_mse:
                    best_split = {'feature': feature, 'value': value}
                    best_mse = mse

        return best_split

    def predict(self, X):
        """Return one prediction per row of ``X`` as a NumPy array.

        Raises ``RuntimeError`` if the tree has not been fitted yet.
        """
        if self.tree_ is None:
            raise RuntimeError("DecisionTree must be fitted before calling predict()")
        X = self._to_dense(X)
        return np.array([self._predict(sample, self.tree_) for sample in X])

    def _predict(self, sample, tree):
        """Walk ``tree`` from the given node down to a leaf for one sample."""
        # Iterative descent, so very deep trees cannot hit the recursion limit.
        node = tree
        while isinstance(node, dict):
            if sample[node['feature']] <= node['value']:
                node = node['left']
            else:
                node = node['right']
        return node


class RandomForest:
    """Bagged ensemble of ``DecisionTree`` regressors.

    Each tree is trained on a bootstrap sample of the rows and on a random
    subset of the columns; predictions are the mean over all fitted trees.

    Parameters:
    - n_estimators: Number of trees to grow.
    - max_depth: Depth limit passed to every tree (``None`` = unlimited).
    - max_features: ``'sqrt'`` (floor of the square root of the feature
      count), an integer count, or ``None`` for all features. Counts are
      clamped to the range 1..n_features.
    - random_state: ``None`` (use NumPy's global random state, as before),
      an int seed, or a ``numpy.random.RandomState`` for reproducible forests.
    """

    def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt',
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []
        self.feature_indices = []

    @staticmethod
    def _to_dense(X):
        """Return ``X`` (DataFrame, scipy sparse matrix or array-like) as an ndarray."""
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        return np.asarray(X)

    def _resolve_max_features(self, n_features):
        """Translate ``max_features`` into a concrete column count."""
        if self.max_features is None:
            return n_features
        if isinstance(self.max_features, str) and self.max_features == 'sqrt':
            count = int(np.sqrt(n_features))
        else:
            count = int(self.max_features)
        return min(max(count, 1), n_features)

    def _random_source(self):
        """Return an object exposing ``randint`` and ``choice``."""
        # getattr keeps instances pickled before random_state existed usable.
        seed = getattr(self, 'random_state', None)
        if seed is None:
            # The np.random module itself draws from the global state, so
            # np.random.seed(...) keeps controlling unseeded forests.
            return np.random
        if isinstance(seed, np.random.RandomState):
            return seed
        return np.random.RandomState(seed)

    def fit(self, X, y):
        """Train ``n_estimators`` trees and return ``self``.

        Any previously fitted trees are discarded, so refitting is safe.
        Raises ``ValueError`` for empty input or mismatched lengths.
        """
        X = self._to_dense(X)
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values
        y = np.asarray(y).ravel()

        n_samples, n_features = X.shape
        if n_samples == 0 or n_features == 0:
            raise ValueError("Cannot fit a RandomForest on an empty dataset")
        if n_samples != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        max_feats = self._resolve_max_features(n_features)
        rng = self._random_source()

        # Build into fresh lists so a second fit() replaces the old forest
        # instead of appending to it (which used to break predict()).
        trees = []
        feature_indices = []
        for _ in range(self.n_estimators):
            # Bootstrap: draw n_samples row indices with replacement.
            rows = rng.randint(0, n_samples, size=n_samples)
            feature_idx = rng.choice(n_features, max_feats, replace=False)

            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X[rows][:, feature_idx], y[rows])

            trees.append(tree)
            feature_indices.append(feature_idx)

        self.trees = trees
        self.feature_indices = feature_indices
        return self

    def predict(self, X):
        """Return the mean prediction of all fitted trees for each row of ``X``.

        Raises ``RuntimeError`` if the forest has not been fitted.
        """
        if not self.trees:
            raise RuntimeError("RandomForest must be fitted before calling predict()")
        X = self._to_dense(X)

        # Size by the trees actually present, not by n_estimators, so the
        # average is never diluted by unused zero rows.
        all_preds = np.zeros((len(self.trees), X.shape[0]))
        for i, (tree, feat_idx) in enumerate(zip(self.trees, self.feature_indices)):
            all_preds[i] = tree.predict(X[:, feat_idx])

        return np.mean(all_preds, axis=0)


class CustomKNN:
    """k-nearest-neighbours regressor using cosine similarity or Euclidean distance.

    Parameters:
    - k: Number of neighbours to average; capped at the training-set size.
    - metric: ``'cosine'`` picks the k most similar rows; any other value
      uses Euclidean distance and picks the k closest rows.
    """

    def __init__(self, k=5, metric='cosine'):
        self.k = k
        self.metric = metric
        self.X_train = None
        self.y_train = None

    def _cosine_similarity(self, a, b):
        """Cosine similarity of two vectors; 0.0 when either has zero norm."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return np.dot(a, b) / denom

    def _euclidean_distance(self, a, b):
        """Euclidean distance between two vectors."""
        return np.sqrt(np.sum((a - b) ** 2))

    @staticmethod
    def _to_dense(X):
        """Return ``X`` (DataFrame, scipy sparse matrix or array-like) as a float ndarray."""
        if isinstance(X, pd.DataFrame):
            X = X.values
        if issparse(X):
            X = X.toarray()
        return np.asarray(X, dtype=float)

    def fit(self, X, y):
        """Memorise the training data and return ``self``.

        Raises ``ValueError`` for empty input or mismatched lengths.
        """
        X = self._to_dense(X)
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values
        # Always store an ndarray so neighbour indices can fancy-index it.
        y = np.asarray(y).ravel()

        if X.shape[0] == 0:
            raise ValueError("Cannot fit CustomKNN on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Return the mean target of the k nearest training rows for each row of ``X``.

        Raises ``RuntimeError`` if called before ``fit`` and ``ValueError`` if
        ``k`` is smaller than 1.
        """
        if self.X_train is None:
            raise RuntimeError("CustomKNN must be fitted before calling predict()")
        X = self._to_dense(X)

        # argpartition needs kth inside the array bounds, so cap k.
        k = min(int(self.k), self.X_train.shape[0])
        if k < 1:
            raise ValueError("k must be at least 1")

        use_cosine = self.metric == 'cosine'
        if use_cosine:
            # Training norms do not depend on the query; compute them once.
            train_norms = np.linalg.norm(self.X_train, axis=1)

        predictions = np.empty(X.shape[0], dtype=float)
        for row, sample in enumerate(X):
            if use_cosine:
                dots = self.X_train @ sample
                denom = train_norms * np.linalg.norm(sample)
                # Zero-norm vectors get similarity 0 instead of NaN; NaN would
                # sort last and be mistaken for the most similar neighbour.
                similarities = np.divide(dots, denom, out=np.zeros_like(dots),
                                         where=denom > 0)
                neighbors = np.argpartition(similarities, -k)[-k:]
            else:
                distances = np.sqrt(((self.X_train - sample) ** 2).sum(axis=1))
                neighbors = np.argpartition(distances, k - 1)[:k]

            predictions[row] = np.mean(self.y_train[neighbors])

        return predictions


# ====================== DATA PIPELINE ======================

print("Loading and preprocessing data...")
# Read the raw listing and discard the exported index column.
df = pd.read_csv('laptop_data.csv').drop(columns=["Unnamed: 0"])

# Numeric clean-up: the price is used as given; RAM and weight lose their
# unit suffixes before being cast to numbers.
df["Price"] = df["Price"].astype(float)
df["Ram"] = df["Ram"].str.replace("GB", "").astype("int")
df["Weight"] = df["Weight"].str.replace("kg", "").astype("float")

# Binary display flags taken from the free-text resolution description.
df["Touchscreen"] = df["ScreenResolution"].map(lambda res: int("Touchscreen" in res))
df["Ips"] = df["ScreenResolution"].map(lambda res: int("IPS" in res))

# Pixel density: split "<description> WIDTHxHEIGHT" at the first "x", take
# the first multi-digit number on the left as the width, then divide the
# pixel diagonal by the screen size in inches.
temp = df["ScreenResolution"].str.split("x", n=1, expand=True)
width_px = temp[0].str.replace(',', '').str.extract(r'(\d+\.?\d+)', expand=False).astype(int)
height_px = temp[1].astype(int)
diagonal_px = ((width_px**2) + (height_px**2))**0.5
df['ppi'] = (diagonal_px / df['Inches']).astype('float')
df = df.drop(columns=["ScreenResolution", "Inches"])

# CPU: keep only the first three words of the processor description.
df['Cpu Name'] = df['Cpu'].map(lambda cpu: " ".join(cpu.split()[:3]))
def fetch_processor(text):
    """Collapse a three-word CPU name into one of five processor categories.

    Intel Core i3/i5/i7 names are returned unchanged, any other name whose
    first word is ``Intel`` becomes ``'Other Intel Processor'``, and every
    remaining name is labelled ``'AMD Processor'``.
    """
    core_models = ('Intel Core i7', 'Intel Core i5', 'Intel Core i3')
    if text in core_models:
        return text

    vendor = text.split()[0]
    return 'Other Intel Processor' if vendor == 'Intel' else 'AMD Processor'
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

# Process Memory (common Indian configurations)
# Normalise the text so every capacity is a bare number of gigabytes,
# e.g. "1.0TB HDD" -> "1TB HDD" -> "1000 HDD" and "256GB SSD" -> "256 SSD".
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
df["Memory"] = df["Memory"].str.replace('GB', '').str.replace('TB', '000')

# A laptop may list two drives separated by "+"; rows with a single drive get
# a second drive of "0". If no row has a second drive the split yields only
# one column, so fall back to an all-"0" series instead of raising KeyError.
new = df["Memory"].str.split("+", n=1, expand=True)
first_drive = new[0].str.strip()
second_drive = new[1].fillna("0") if 1 in new.columns else pd.Series("0", index=df.index)

first_gb = first_drive.str.replace(r'\D', '', regex=True).astype(int)
second_gb = second_drive.str.replace(r'\D', '', regex=True).astype(int)

# The drive type has to be read from the ORIGINAL text. Checking for "HDD" or
# "SSD" after the digits-only conversion can never match, which previously
# left both columns at zero for every laptop. Drives of any other type
# (e.g. "Flash Storage", "Hybrid") count towards neither column.
df["HDD"] = (first_gb * first_drive.str.contains("HDD", regex=False).astype(int)) + \
            (second_gb * second_drive.str.contains("HDD", regex=False).astype(int))
df["SSD"] = (first_gb * first_drive.str.contains("SSD", regex=False).astype(int)) + \
            (second_gb * second_drive.str.contains("SSD", regex=False).astype(int))
df.drop(columns=['Memory'], inplace=True)

# Process GPU - keep only the vendor (first word) and drop ARM GPUs.
df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])
# .copy() makes the filtered frame independent, so the in-place edits that
# follow do not trigger pandas' chained-assignment warning.
df = df[df['Gpu brand'] != 'ARM'].copy()
df.drop(columns=['Gpu'], inplace=True)

# Process OS - adding Indian market specific OS versions
def cat_os(inp):
    """Map a raw operating-system label to ``'Windows'``, ``'Mac'`` or a catch-all.

    Anything that is not one of the listed Windows or macOS names falls into
    ``'Others/No OS/Linux'``.
    """
    windows_labels = ('Windows 10', 'Windows 7', 'Windows 10 S', 'Windows 11')
    mac_labels = ('macOS', 'Mac OS X')

    if inp in windows_labels:
        return 'Windows'
    return 'Mac' if inp in mac_labels else 'Others/No OS/Linux'
# Replace the raw OS label with its coarse category.
df['os'] = df['OpSys'].map(cat_os)
df = df.drop(columns=['OpSys'])

# Target is the natural log of the price; every other column is a feature.
y = np.log(df['Price'])
X = df.drop(columns=['Price'])

# Column groups: numeric columns are standardised, categorical ones one-hot
# encoded (categories unseen during fitting are ignored at transform time).
num_cols = ['Ram', 'Weight', 'Touchscreen', 'Ips', 'ppi', 'HDD', 'SSD']
cat_cols = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessor on the training rows only, then apply it to both sets.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# The custom models below index rows directly, so densify sparse output.
X_train_transformed = (
    X_train_transformed.toarray() if issparse(X_train_transformed) else X_train_transformed
)
X_test_transformed = (
    X_test_transformed.toarray() if issparse(X_test_transformed) else X_test_transformed
)

# ====================== MODEL TRAINING ======================

# --- Random Forest: fit, then score on the held-out rows (log-price scale) ---
print("\nTraining custom Random Forest...")
rf_model = RandomForest(n_estimators=100, max_depth=10, max_features='sqrt')
rf_model.fit(X_train_transformed, y_train)

y_pred_rf = rf_model.predict(X_test_transformed)
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\nRandom Forest Performance:")
print(f"MSE: {mse_rf:.4f}")
print(f"MAE: {mae_rf:.4f}")
print(f"R² Score: {r2_rf:.4f}")

# --- KNN: same procedure with the cosine-similarity neighbour model ---
print("\nTraining custom KNN...")
knn_model = CustomKNN(k=5, metric='cosine')
knn_model.fit(X_train_transformed, y_train)

y_pred_knn = knn_model.predict(X_test_transformed)
mse_knn, mae_knn, r2_knn = (
    metric(y_test, y_pred_knn)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\nKNN Performance:")
print(f"MSE: {mse_knn:.4f}")
print(f"MAE: {mae_knn:.4f}")
print(f"R² Score: {r2_knn:.4f}")

# ====================== SAVE MODELS ======================

# Bundle the cleaned data, the fitted preprocessor and both models in one file.
print("\nSaving models...")
joblib.dump(
    dict(df=df, preprocessor=preprocessor, random_forest=rf_model, knn=knn_model),
    'laptop_models_inr.pkl',
)

print("Saved successfully to laptop_models_inr.pkl ✅")

# ====================== INDIAN MARKET RECOMMENDATION SYSTEM ======================

class IndiaLaptopRecommender:
    """Rank laptops for a budget using predicted prices and weighted feature scores.

    The constructor loads the bundle written by ``joblib.dump`` (keys ``'df'``,
    ``'preprocessor'`` and ``'random_forest'``), predicts a price for every
    laptop and pre-computes normalised feature columns used for scoring.
    """

    def __init__(self, models_path='laptop_models_inr.pkl'):
        # NOTE: joblib.load unpickles arbitrary objects; only load bundles
        # produced by a trusted source.
        saved_data = joblib.load(models_path)
        # Work on a copy so the helper columns added below stay private to
        # this recommender instance.
        self.df = saved_data['df'].copy()
        self.preprocessor = saved_data['preprocessor']
        self.rf_model = saved_data['random_forest']

        # Preprocess the entire dataset
        self.X_all = self.preprocessor.transform(self.df.drop(columns=['Price']))
        if issparse(self.X_all):
            self.X_all = self.X_all.toarray()

        # The model was trained on log(price), so exponentiate its output.
        self.all_pred_prices = np.exp(self.rf_model.predict(self.X_all))
        self.df['PredictedPrice'] = self.all_pred_prices

        # Default feature weights used when the caller supplies none.
        self.indian_feature_weights = {
            'Ram': 0.25,
            'SSD': 0.25,
            'ppi': 0.15,
            'Weight': 0.15,
            'Company': 0.1,
            'Cpu brand': 0.1
        }

        # Normalize features for scoring
        self._normalize_features()

    @staticmethod
    def _min_max(series):
        """Scale ``series`` to 0-1; a constant (or empty) column maps to all zeros."""
        lowest = series.min()
        span = series.max() - lowest
        # "not span > 0" also catches NaN, avoiding a 0/0 division that would
        # poison every downstream score with NaN.
        if not span > 0:
            return pd.Series(0.0, index=series.index)
        return (series - lowest) / span

    def _normalize_features(self):
        """Add the ``*_norm`` scoring columns to ``self.df``."""
        # Hardware specs: larger is better, except weight where lighter wins.
        self.df['Ram_norm'] = self._min_max(self.df['Ram'])
        self.df['SSD_norm'] = self._min_max(self.df['SSD'])
        self.df['ppi_norm'] = self._min_max(self.df['ppi'])
        self.df['Weight_norm'] = 1 - self._min_max(self.df['Weight'])

        # Default brand multipliers: listed brands get a 1.2 boost.
        popular_brands = ['HP', 'Dell', 'Lenovo', 'Asus', 'Acer']
        self.df['Company_norm'] = self.df['Company'].apply(lambda x: 1.2 if x in popular_brands else 1)

        # Default CPU multipliers: i7 gets 1.3, i5 gets 1.1, everything else 1.
        self.df['Cpu_brand_norm'] = self.df['Cpu brand'].apply(
            lambda x: 1.3 if 'Intel Core i7' in x else 1.1 if 'Intel Core i5' in x else 1
        )

    def calculate_score(self, custom_weights=None, brand_boost=None, cpu_boost=None):
        """
        Return a weighted feature score for every laptop in ``self.df``.

        Parameters:
        - custom_weights: Override default weight distribution
        - brand_boost: Dict of brand multipliers (e.g., {'Dell': 1.2});
          brands not listed get a multiplier of 1
        - cpu_boost: Dict of CPU multipliers (e.g., {'Intel Core i7': 1.3});
          CPU brands not listed get a multiplier of 1

        Boosts apply to this call only; the default multipliers stored in
        ``self.df`` are left untouched, so later calls are not affected.
        """
        weights = custom_weights if custom_weights else self.indian_feature_weights

        # Use per-call multipliers without overwriting the stored defaults.
        if brand_boost:
            company_factor = self.df['Company'].map(brand_boost).fillna(1)
        else:
            company_factor = self.df['Company_norm']

        if cpu_boost:
            cpu_factor = self.df['Cpu brand'].map(cpu_boost).fillna(1)
        else:
            cpu_factor = self.df['Cpu_brand_norm']

        # Calculate weighted score
        score = (
            self.df['Ram_norm'] * weights.get('Ram', 0) +
            self.df['SSD_norm'] * weights.get('SSD', 0) +
            self.df['ppi_norm'] * weights.get('ppi', 0) +
            self.df['Weight_norm'] * weights.get('Weight', 0) +
            company_factor * weights.get('Company', 0) +
            cpu_factor * weights.get('Cpu brand', 0)
        )

        return score

    def recommend(self, budget_inr, weights=None, brand_prefs=None, cpu_prefs=None,
                 top_n=5, price_importance=0.3, min_ram=None, min_ssd=None):
        """
        Return the ``top_n`` best-scoring laptops within the budget.

        Parameters:
        - budget_inr: Budget in INR (compared against the predicted price)
        - weights: Custom feature weights
        - brand_prefs: Brand preferences
        - cpu_prefs: CPU preferences
        - top_n: Number of recommendations
        - price_importance: How much to prioritize price (0-1)
        - min_ram: Minimum RAM requirement
        - min_ssd: Minimum SSD requirement

        When nothing matches, a warning is printed and the ``top_n`` laptops
        whose predicted price is closest to the budget are returned instead
        (without score columns).
        """
        # Filter by budget and requirements
        mask = self.df['PredictedPrice'] <= budget_inr
        if min_ram:
            mask &= self.df['Ram'] >= min_ram
        if min_ssd:
            mask &= self.df['SSD'] >= min_ssd

        budget_df = self.df[mask].copy()

        if budget_df.empty:
            # Honour top_n here as well instead of a fixed five rows.
            price_gap = (self.df['PredictedPrice'] - budget_inr).abs().to_numpy()
            closest = self.df.iloc[price_gap.argsort()[:top_n]]
            print(f"⚠️ No laptops within ₹{budget_inr:,} budget. Showing closest options:")
            return closest[['Company', 'TypeName', 'Ram', 'SSD', 'Weight', 'PredictedPrice']]

        # Calculate scores (aligned to budget_df by index on assignment)
        budget_df['FeatureScore'] = self.calculate_score(weights, brand_prefs, cpu_prefs)

        # Price score (higher is better - cheaper laptops score higher)
        prices = budget_df['PredictedPrice']
        cheapest = prices.min()
        price_span = prices.max() - cheapest
        if price_span > 0:
            budget_df['PriceScore'] = 1 - (prices - cheapest) / price_span
        else:
            # One candidate, or all equally priced: every row is the cheapest.
            budget_df['PriceScore'] = 1.0

        # Combined score
        budget_df['CombinedScore'] = (
            (1 - price_importance) * budget_df['FeatureScore'] +
            price_importance * budget_df['PriceScore']
        )

        # Format INR prices
        budget_df['Price_INR'] = budget_df['PredictedPrice'].apply(lambda x: f"₹{x:,.2f}")

        # Sort and return recommendations (ties broken by lower price)
        recommendations = budget_df.sort_values(['CombinedScore', 'PredictedPrice'],
                                              ascending=[False, True])

        return recommendations.head(top_n)[[
            'Company', 'TypeName', 'Ram', 'SSD', 'Weight', 'Price_INR',
            'FeatureScore', 'PriceScore', 'CombinedScore'
        ]]

# ====================== EXAMPLE USAGE FOR INDIAN MARKET ======================

print("\nTesting Indian market recommendation system...")
india_recommender = IndiaLaptopRecommender()

# Example 1: student laptop under ₹50,000 - balanced specs, brand counts,
# and price weighted as heavily as features.
student_weights = {
    'Ram': 0.3,
    'SSD': 0.3,
    'Weight': 0.2,
    'Company': 0.2,
}
print("\nStudent laptop recommendations under ₹50,000:")
student_picks = india_recommender.recommend(
    budget_inr=50000,
    weights=student_weights,
    min_ram=8,
    price_importance=0.5,
)
print(student_picks)

# Example 2: premium laptop under ₹1,00,000 - default weights, explicit
# brand multipliers, and stricter RAM/SSD minimums.
premium_brands = {
    'Apple': 1.5,
    'Dell': 1.3,
    'HP': 1.2,
}
print("\nPremium laptop recommendations under ₹1,00,000:")
premium_picks = india_recommender.recommend(
    budget_inr=100000,
    brand_prefs=premium_brands,
    min_ram=16,
    min_ssd=512,
    price_importance=0.2,
)
print(premium_picks)

# Example 3: budget gaming laptop under ₹70,000 - only RAM, SSD and CPU
# brand are weighted, with custom CPU multipliers.
gaming_weights = {
    'Ram': 0.4,
    'SSD': 0.3,
    'Cpu brand': 0.3,
}
gaming_cpus = {
    'Intel Core i7': 1.5,
    'Intel Core i5': 1.2,
    'AMD Processor': 1.1,
}
print("\nBudget gaming laptop under ₹70,000:")
gaming_picks = india_recommender.recommend(
    budget_inr=70000,
    weights=gaming_weights,
    cpu_prefs=gaming_cpus,
    min_ram=8,
    min_ssd=256,
)
print(gaming_picks)

# Recorded cell output: SyntaxError: invalid syntax (3354635584.py, line 1)