In [175]:
import pandas as pd
import joblib
import sys
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

sys.path.append(os.path.abspath("../../"))
from utils.helper import clean_data

# Loading Data

In [176]:
# Read the pre-cleaned laptop dataset into a DataFrame.
cleaned_csv_path = '../../data/processed/cleaned_laptop_data.csv'
data = pd.read_csv(cleaned_csv_path)

In [177]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
data.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,Width,Height,CpuBrand,GpuBrand,SSD,HDD
0,Apple,Ultrabook,13.3,8,macOS,1.37,71378.6832,2560,1600,Intel,Intel,128,0
1,Apple,Ultrabook,13.3,8,macOS,1.34,47895.5232,1440,900,Intel,Intel,0,0
2,HP,Notebook,15.6,8,No OS,1.86,30636.0,1920,1080,Intel,Intel,256,0
3,Apple,Ultrabook,15.4,16,macOS,1.83,135195.336,2880,1800,Intel,AMD,512,0
4,Apple,Ultrabook,13.3,8,macOS,1.37,96095.808,2560,1600,Intel,Intel,256,0


In [178]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(1303, 13)

# ML

In [179]:
# Separate the regression target (Price) from the predictor columns.
# `columns=` already names the axis, so no extra `axis` argument is needed.
y = data['Price']
X = data.drop(columns='Price')

In [180]:
# Show the shapes of the feature frame and the target side by side.
X.shape, y.shape

((1303, 12), (1303,))

In [181]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Columns grouped by the preprocessing each one receives.
categorical_features = ['Company', 'TypeName', 'OpSys', 'CpuBrand', 'GpuBrand']
numeric_features = ['Inches', 'Ram', 'Weight', 'Width', 'Height', 'SSD', 'HDD']

# Combine encoder and scaler into a single ColumnTransformer.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category it did not see during fit instead of raising in transform().
# With the default ('error'), the saved pipeline would fail on any input row
# whose Company/TypeName/OpSys/CpuBrand/GpuBrand value is absent from X_train.
preprocessor = ColumnTransformer([
    ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('scaler', MinMaxScaler(), numeric_features),
], remainder='passthrough')

# Bundle preprocessing and the regressor in one Pipeline so the transformers
# are fitted on the training rows only and travel with the model when saved.
pipeline_fixed = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=100,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,  # fixed seed so the forest is reproducible
    )),
])

pipeline_fixed.fit(X_train, y_train)

# Predict on the held-out rows.
predictions = pipeline_fixed.predict(X_test)

# Evaluate on the held-out rows. MSE is in squared Price units.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')



Mean Squared Error: 299518960.99290603
R2 Score: 0.7922720213076211


In [190]:
# Inspect the column names produced by the fitted preprocessing step
# (one-hot columns are prefixed 'encoder__', scaled ones 'scaler__').
fitted_preprocessor = pipeline_fixed.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()
print("Feature names after preprocessing:")
print(feature_names)

Feature names after preprocessing:
['encoder__Company_Acer' 'encoder__Company_Apple' 'encoder__Company_Asus'
 'encoder__Company_Chuwi' 'encoder__Company_Dell'
 'encoder__Company_Fujitsu' 'encoder__Company_Google'
 'encoder__Company_HP' 'encoder__Company_Huawei' 'encoder__Company_LG'
 'encoder__Company_Lenovo' 'encoder__Company_MSI'
 'encoder__Company_Mediacom' 'encoder__Company_Microsoft'
 'encoder__Company_Razer' 'encoder__Company_Samsung'
 'encoder__Company_Toshiba' 'encoder__Company_Vero'
 'encoder__Company_Xiaomi' 'encoder__TypeName_2 in 1 Convertible'
 'encoder__TypeName_Gaming' 'encoder__TypeName_Netbook'
 'encoder__TypeName_Notebook' 'encoder__TypeName_Ultrabook'
 'encoder__TypeName_Workstation' 'encoder__OpSys_Android'
 'encoder__OpSys_Chrome OS' 'encoder__OpSys_Linux'
 'encoder__OpSys_Mac OS X' 'encoder__OpSys_No OS'
 'encoder__OpSys_Windows 10' 'encoder__OpSys_Windows 10 S'
 'encoder__OpSys_Windows 7' 'encoder__OpSys_macOS' 'encoder__CpuBrand_AMD'
 'encoder__CpuBrand_Intel' '

In [195]:
# Map each X_test column name to its dtype.
column_info = dict(zip(X_test.columns, X_test.dtypes))
# List the raw input schema, one "name: dtype" pair per line.
print("Column names and their datatypes:")
for name, kind in column_info.items():
    print(f"{name}: {kind}")

Column names and their datatypes:
Company: object
TypeName: object
Inches: float64
Ram: int64
OpSys: object
Weight: float64
Width: int64
Height: int64
CpuBrand: object
GpuBrand: object
SSD: int64
HDD: int64


In [192]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
model_path = '../../models/Laptop_price_predictor.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE: the file is pickle-based; only load it back from trusted locations.
joblib.dump(pipeline_fixed, model_path)

['../../models/Laptop_price_predictor.pkl']