### Step 0: Import libraries

In [None]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor


### Step 1: Load Dataset & Preview

In [None]:
# 1a. Location of the raw CSV, relative to the notebook's working directory.
data_path = '../data/raw/laptop_prices.csv'
# The file is decoded as ISO-8859-1 rather than pandas' default UTF-8.
data = pd.read_csv(
    filepath_or_buffer=data_path,
    encoding="ISO-8859-1",
)

In [None]:
# 1b. Show the first five rows as a quick sanity check of the load.
print("Dataset Preview:")
display(data.head(n=5))

In [None]:
# Normalise column labels in place: trim surrounding whitespace and upper-case
# the 'Cpu' / 'Gpu' / 'Ram' fragments to 'CPU' / 'GPU' / 'RAM'.
data.rename(
    columns=lambda label: label.strip()
    .replace('Cpu', 'CPU')
    .replace('Gpu', 'GPU')
    .replace('Ram', 'RAM'),
    inplace=True,
)
# df = data.rename(columns=lambda x: x.strip().replace(' ', '_').replace('-', '_').capitalize(), inplace=True)
print("\nColumns after renaming:", data.columns.tolist(), sep="\n")

In [None]:
# 1c. dataset info
print("\nDataset Info:")
# `df` is an alias of `data` (not a copy); later in-place edits affect both names.
df = data
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly -- wrapping it in print() would emit a stray "None" line.
df.info()

In [None]:
# 1d. Count the missing entries in every column.
print("\nMissing Values in Each Column:", df.isna().sum(), sep="\n")

### Step 2: Exploratory Data Analysis (EDA)

In [None]:
# 2a. Descriptive statistics as reported by DataFrame.describe().
print("\nStatistical Summary:")
display(
    df.describe()
)

In [None]:
# 2b. Histogram (30 bins, with a KDE overlay) of the prediction target.
target_column = 'Price_euros'
plt.figure(figsize=(8, 5))
# histplot returns the Axes it drew on; title and axis labels are set on it directly.
sns.histplot(
    df[target_column],
    bins=30,
    kde=True,
    color='skyblue',
).set(
    title=f"{target_column} Distribution",
    xlabel=target_column,
    ylabel="Frequency",
)
plt.show()

In [None]:
# 2c. Correlation heatmap.
# Only numeric columns are kept, so the correlation matrix is computed on them alone.
numeric_df = df.select_dtypes(include="number")
plt.figure(figsize=(10, 8))
sns.heatmap(
    numeric_df.corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title("Correlation Matrix")
plt.show()

### Step 3: Data Preprocessing

In [None]:
# Force 'Weight' and 'Inches' to a numeric dtype. With errors='coerce', any value
# that cannot be parsed as a number becomes NaN instead of raising.
for _column in ('Weight', 'Inches'):
    df[_column] = pd.to_numeric(df[_column], errors='coerce')
# NOTE(review): the dropna below is disabled, so any NaN produced by the coercion
# stays in df -- confirm the downstream estimators can handle that.
# df.dropna(subset=['Weight', 'Inches', target_column], inplace=True)

In [None]:
# 3a. Select feature columns
# Every numeric column is a candidate feature ...
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# ... except the target itself. The target is numeric too, so leaving it in
# would leak the answer into X: the models would simply read the price back
# and every metric and feature importance would be meaningless.
feature_columns = [col for col in numeric_cols if col != target_column]
X = df[feature_columns]
y = df[target_column]

In [None]:
# 3b. Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Step 4: Model Training

In [None]:
# 4a: Linear Regression -- fit on the training split (fit() returns the
# estimator itself), then predict on the held-out test split.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [None]:
# 4b: Random Forest Regressor -- 100 trees with a fixed seed, fitted on the
# training split (fit() returns the estimator itself) and scored on the test split.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

### Step 5: Model Evaluation

In [None]:
# 5a. Model Evaluation Function
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, RMSE and R-squared for one model's predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label shown in the report header.

    Returns:
        None. The report is written to stdout, with each metric rounded to
        two decimal places.
    """
    # Compute each metric exactly once and reuse it in the report below.
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"\n======== {model_name} Performance ======")
    print("Mean Absolute Error (MAE):", round(mae, 2))
    print("Root Mean Squared Error (RMSE):", round(rmse, 2))
    print("R-squared (R2 ): ", round(r2, 2))

In [None]:
# 5b. Report test-set metrics for the linear regression predictions.
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_lr,
    model_name="Linear Regression",
)

In [None]:
# 5c. Report test-set metrics for the random forest predictions.
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_rf,
    model_name="Random Forest Regressor",
)

### Step 6: Feature Importance (Random Forest)

In [None]:
# 6a: Pair each feature name with the importance the fitted random forest
# assigned to it, ordered from most to least important.
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importances (Random Forest):")
display(feature_importance_df)

In [None]:
# 6b: Horizontal bar chart of the sorted feature importances.
plt.figure(figsize=(8, 5))
# barplot returns the Axes it drew on; title and axis labels are set on it directly.
sns.barplot(
    x='Importance',
    y='Feature',
    data=feature_importance_df,
    palette='viridis',
).set(
    title="Feature Importance from Random Forest",
    xlabel="Importance",
    ylabel="Feature",
)
plt.show()

### Step 7: Actionable Recommendations

In [None]:
# Example recommendation: list every row whose target value is at or below a fixed budget.
budget = 1000  # Example budget
recommended_items = df.loc[df[target_column] <= budget]
print(f"\nItems recommended under budget of {budget}:")
# NOTE(review): assumes the dataset has 'Brand' and 'Model' columns; a missing
# column raises KeyError here -- confirm against the actual CSV header.
display(recommended_items.loc[:, ['Brand', 'Model', target_column]])

### Step 8: Save the Best Model

In [None]:
# Choose the best performing model so far: the random forest wins only if its
# test-set R-squared is strictly higher; otherwise the linear model is kept.
from pathlib import Path

best_model = rf_model if r2_score(y_test, y_pred_rf) > r2_score(y_test, y_pred_lr) else lr_model
model_filename = f'../models/best_model_{target_column}.pkl'
# joblib.dump does not create missing directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_filename)
print(f"\nBest model saved as {model_filename}.")