In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv('laptop_pricing_dataset2.csv')

In [4]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [5]:
df.sample(5)

Unnamed: 0,Manufacturer,Category,Screen,GPU,OS,CPU_core,Screen_Size_cm,CPU_frequency,RAM_GB,Storage_GB_SSD,Weight_kg,Price
212,Xiaomi,4,IPS Panel,3,1,5,33.02,2.3,8,256,1.28,1188
227,Toshiba,3,IPS Panel,2,1,5,35.56,2.3,4,128,1.47,1404
161,Lenovo,4,IPS Panel,2,1,7,30.48,2.7,8,256,1.36,2012
198,MSI,1,Full HD,3,1,5,39.624,2.5,8,256,2.4,1523
184,HP,3,Full HD,2,1,5,39.624,2.4,8,256,1.84,2414


In [6]:
df.shape

(238, 12)

In [9]:
X = df.drop(columns = ['Price'], inplace = False)
categorical = [column_Name for column_Name in X.columns if X[column_Name].dtype == 'object'] #finds which columns are categorical
numerical = [column_Name for column_Name in X.columns if column_Name not in categorical] #finds which columns are numerical

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer

In [11]:
numerical_Transformer = SimpleImputer(strategy='median') #Use a simple imputer that uses median because some columns 
#                                                         are numerical categories meaning the values must be integers, 
#                                                         so mean would produce a float which is wrong
categorical_Transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')), #must use most_frequent since categories don't have mean or median
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output= False)) #Use OneHotEncoding so that model can understand data
])
#Putting everything together into a preprocessor
pipe = ColumnTransformer(
    transformers=[
    ('num', numerical_Transformer, numerical),
    ('cat', categorical_Transformer, categorical)
])

In [12]:
X_procced = pipe.fit_transform(X)

In [16]:
y=df['Price']

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X_procced,y,test_size=0.2,random_state=42)

In [18]:
newlr=LinearRegression()
newlr.fit(X_train,y_train)

In [19]:
y_pred=newlr.predict(X_test)

In [25]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [27]:
print('Mean Squared error in Linear Regression:',mean_squared_error(y_pred,y_test))
print('R2 score:',r2_score(y_pred,y_test))
print('Mean absolute error in Linear Regression:',mean_absolute_error(y_pred,y_test))

Mean Squared error in Linear Regression: 125997.48295277066
R2 score: 0.41764025652375014
Mean absolute error in Linear Regression: 280.4643844024384


In [22]:
import pickle

In [23]:

pickle.dump(newlr,open('newlr.sav','wb'))