In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import issparse
import joblib

print("Libraries imported successfully.")

# 2. DATA LOADING AND PREPROCESSING
df = pd.read_csv('laptop_data.csv')
df.drop(columns=["Unnamed: 0"], inplace=True)

# Data cleaning
df["Ram"] = df["Ram"].str.replace("GB", "").astype("int")
df["Weight"] = df["Weight"].str.replace("kg", "").astype("float")

# Feature engineering
df["Touchscreen"] = df["ScreenResolution"].apply(lambda x: 1 if "Touchscreen" in x else 0)
df["Ips"] = df["ScreenResolution"].apply(lambda x: 1 if "IPS" in x else 0)

# Process resolution
temp = df["ScreenResolution"].str.split("x", n=1, expand=True)
df["X_res"] = temp[0].str.replace(',', '').str.findall(r'(\d+\.?\d+)').apply(lambda x: x[0]).astype(int)
df["Y_res"] = temp[1].astype(int)
df['ppi'] = (((df['X_res']**2) + (df['Y_res']**2))**0.5 / df['Inches']).astype('float')
df.drop(columns=["ScreenResolution", "X_res", "Y_res", "Inches"], inplace=True)

# Process CPU
df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]))
def fetch_processor(text):
    if text in ['Intel Core i7', 'Intel Core i5', 'Intel Core i3']:
        return text
    elif text.split()[0] == 'Intel':
        return 'Other Intel Processor'
    else:
        return 'AMD Processor'
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

# Process Memory
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
df["Memory"] = df["Memory"].str.replace('GB', '').str.replace('TB', '000')
new = df["Memory"].str.split("+", n=1, expand=True)
df["first"] = new[0].str.strip().str.replace(r'\D', '', regex=True).astype(int)
df["second"] = new[1].fillna("0").str.replace(r'\D', '', regex=True).astype(int)
df["HDD"] = (df["first"] * df["first"].apply(lambda x: 1 if "HDD" else 0)) + \
            (df["second"] * df["second"].apply(lambda x: 1 if "HDD" else 0))
df["SSD"] = (df["first"] * df["first"].apply(lambda x: 1 if "SSD" else 0)) + \
            (df["second"] * df["second"].apply(lambda x: 1 if "SSD" else 0))
df.drop(columns=['first', 'second', 'Memory'], inplace=True)

# Process GPU
df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])
df = df[df['Gpu brand'] != 'ARM']
df.drop(columns=['Gpu'], inplace=True)

# Process OS
def cat_os(inp):
    if inp in ['Windows 10', 'Windows 7', 'Windows 10 S']:
        return 'Windows'
    elif inp in ['macOS', 'Mac OS X']:
        return 'Mac'
    else:
        return 'Others/No OS/Linux'
df['os'] = df['OpSys'].apply(cat_os)
df.drop(columns=['OpSys'], inplace=True)

# Features and target
X = df.drop(columns=['Price'])
y = np.log(df['Price'])

# Preprocessing pipeline
cat_cols = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']
num_cols = ['Ram', 'Weight', 'Touchscreen', 'Ips', 'ppi', 'HDD', 'SSD']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transform
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

if issparse(X_train_transformed):
    X_train_transformed = X_train_transformed.toarray()
if issparse(X_test_transformed):
    X_test_transformed = X_test_transformed.toarray()

# 3. TRAIN MODELS USING SCIKIT-LEARN
print("\nTraining Random Forest model...")
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, max_features='sqrt', random_state=42)
rf_model.fit(X_train_transformed, y_train)

# Predictions and evaluation
y_pred = rf_model.predict(X_test_transformed)
print("\nModel Evaluation:")
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

print("\nSetting up KNN Recommender...")
knn_recommender = NearestNeighbors(n_neighbors=11, metric='cosine')  # 10 neighbors + self
knn_recommender.fit(X_train_transformed)

# 4. SAVE EVERYTHING USING JOBLIB
print("\nSaving with joblib...")
joblib.dump({
    'df': df,
    'preprocessor': preprocessor,
    'model': rf_model,
    'knn_recommender': knn_recommender
}, 'laptop_models.pkl')

print("Saved successfully to laptop_models.pkl ✅")

Libraries imported successfully.

Training Random Forest model...

Model Evaluation:
R2 Score: 0.8577
Mean Absolute Error: 0.1848
Root Mean Squared Error: 0.2342

Setting up KNN Recommender...

Saving with joblib...
Saved successfully to laptop_models.pkl ✅


In [3]:
import joblib

# Your actual variables
joblib.dump((df, preprocessor, rf_model), 'laptop_models.pkl')

['laptop_models.pkl']