In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df=pd.read_csv('/kaggle/input/laptop-data/laptop_data.csv')
df.head()



In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

Next, we remove the columns that are not needed and do some feature engineering, using string operations and regular expressions to extract the relevant data from the columns.

In [None]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [None]:
df.head()

In [None]:
df['Ram']=df['Ram'].str.replace('GB','')
df['Weight']=df['Weight'].str.replace('kg','')


In [None]:
df['Ram']=df['Ram'].astype('int32')
df['Weight']=df['Weight'].astype('float32')
df.head()
df.info()

We do some Data Analysis now

In [None]:
# Distribution of laptop prices. sns.distplot is deprecated and scheduled for
# removal from seaborn; histplot with a KDE overlay on a density scale draws
# the equivalent histogram-plus-curve figure.
sns.histplot(df['Price'], kde=True, stat='density')

We see that most laptops are priced at the lower end, and only a few laptops have high prices.

In [None]:
df['Company'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Company'],y=df['Price'])
plt.xticks(rotation='vertical')

In [None]:
df['TypeName'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['TypeName'],y=df['Price'])
plt.xticks(rotation='vertical')

In [None]:
# Distribution of screen sizes. sns.distplot is deprecated and scheduled for
# removal from seaborn; histplot with a KDE overlay on a density scale draws
# the equivalent histogram-plus-curve figure.
sns.histplot(df['Inches'], kde=True, stat='density')


We now focus on the screen resolution. This column packs several pieces of information into one string, so we need to break it down and do some analysis on each part.

In [None]:
df['ScreenResolution'].value_counts()

In [None]:
# Binary flag: 1 when the resolution text mentions "Touchscreen", otherwise 0.
df['TouchScreen'] = [int('Touchscreen' in resolution) for resolution in df['ScreenResolution']]

In [None]:
df['TouchScreen'].value_counts().plot(kind='bar')

Most laptops do not have a touchscreen, now we see the trend between Price of the laptop and Touchscreen feature

In [None]:
sns.barplot(x=df['TouchScreen'],y=df['Price'])

From the above barplot, we see that laptops with a touchscreen have a higher price on average.

We create a new column which will indicate whether the laptop has an IPS panel or not.

In [None]:
# Binary flag: 1 when the resolution text mentions "IPS", otherwise 0.
df['Ips'] = [int('IPS' in resolution) for resolution in df['ScreenResolution']]

In [None]:
sns.barplot(x=df['Ips'],y=df['Price'])

From the above barplot, we see that the presence of an IPS panel means the price of laptop is higher

We need to extract resolution details from the ScreenResolution column

In [None]:
new=df['ScreenResolution'].str.split('x',n=1,expand=True)


In [None]:
# Pull the "<width>x<height>" pair straight out of the ScreenResolution text.
# Anchoring on digits on both sides of the "x" keeps the parse independent of
# any other "x" in the descriptive text, and a row without a resolution yields
# NaN here instead of raising IndexError.
resolution = df['ScreenResolution'].str.extract(r'(\d+)\s*x\s*(\d+)')
# Both columns are left as digit strings; they are cast to integers afterwards.
df['Xres'] = resolution[0]
df['Yres'] = resolution[1]


In [None]:
df.head()

In [None]:
df['Xres']=df['Xres'].astype('int32')
df['Yres']=df['Yres'].astype('int32')
df.info()

In [None]:
# Correlation of every numeric feature with Price. The frame still holds text
# columns, and DataFrame.corr raises on those in pandas >= 2.0, so restrict the
# computation to numeric columns explicitly.
df.select_dtypes(include='number').corr()['Price']

Inches do not have much relation with price, it has decent correlation with resolution. As resolution increases, so does the price.

In [None]:
# Combine Xres, Yres and Inches into a single pixels-per-inch feature so the
# two resolution columns can be removed afterwards.
diagonal_pixels = (df['Xres']**2 + df['Yres']**2)**0.5
df['ppi'] = (diagonal_pixels / df['Inches']).astype('float')

In [None]:
# Capture the correlations (now including ppi) before the raw resolution
# columns are dropped. Only numeric columns are used because DataFrame.corr
# raises on text columns in pandas >= 2.0.
price_corr = df.select_dtypes(include='number').corr()['Price']
df.drop(columns=['ScreenResolution','Xres','Yres'],inplace=True)
# Last expression of the cell, so the correlations are actually displayed.
price_corr


In [None]:
df.drop(columns=['Inches'],inplace=True)
df.head()

In [None]:
df.head()

In [None]:
# Keep only the first three whitespace-separated words of each CPU description.
df['CpuName'] = [" ".join(cpu.split()[:3]) for cpu in df['Cpu']]

In [None]:
df.head()

We now group the CPU names into Intel Core i3/i5/i7, other Intel processors, and AMD processors, since AMD is the only non-Intel processor brand present in the dataset.

We use a function to check what processor the laptop has

In [None]:
def processor(t):
    """Map a shortened CPU name to a coarse processor category.

    Parameters
    ----------
    t : str
        CPU name, e.g. the first three words of the CPU description.

    Returns
    -------
    str
        `t` itself for 'Intel Core i7', 'Intel Core i5' and 'Intel Core i3';
        'Other Intel processor' when the first word is 'Intel';
        'AMD processor' for everything else.
    """
    mainstream_intel = ('Intel Core i7', 'Intel Core i5', 'Intel Core i3')
    if t in mainstream_intel:
        return t
    # Any other name whose first word is Intel is lumped into one bucket.
    if t.split()[0] == 'Intel':
        return 'Other Intel processor'
    return 'AMD processor'
        

In [None]:
df['CPU_Brand']=df['CpuName'].apply(processor)

In [None]:
df.head()

In [None]:
df['CPU_Brand'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['CPU_Brand'],y=df['Price'])
plt.xticks(rotation='vertical')

From the plot, we see that i7 processor is most expensive 

In [None]:
df.drop(columns=['Cpu','CpuName'],inplace=True)


In [None]:
df.head()

In [None]:
df['Ram'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Ram'],y=df['Price'])


**Now we focus on Memory Column, and make it appropriate for analyzing it**

In [None]:
df['Memory'].value_counts()

In [None]:
df.head()

The below code is for basically making 4 columns for each type of memory and the data in it is the value given accordingly in Memory Column

Online Resources have helped me out for this.

In [None]:

# Turn the free-text Memory column into one capacity column per storage type
# (HDD, SSD, Hybrid, Flash_Storage). A value may describe up to two drives
# separated by "+"; each part is expected to hold a size and a type keyword.

# Normalise the text so that digits alone carry the capacity: remove ".0"
# fragments, strip the GB unit and rewrite TB as three zeros.
df['Memory'] = df['Memory'].astype(str).replace(r'\.0', '', regex=True)
df['Memory'] = df['Memory'].str.replace('GB', '', regex=False)
df['Memory'] = df['Memory'].str.replace('TB', '000', regex=False)

# Split into at most two drive descriptions. reindex guarantees that column 1
# exists even when no row contains a "+".
new = df['Memory'].str.split('+', n=1, expand=True).reindex(columns=[0, 1])
first = new[0].str.strip()
second = new[1].fillna('0')  # rows with a single drive get a zero-sized second drive

# Keep only the digits of each part. regex=True is required: since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default, which
# would leave the text untouched and make the integer cast fail.
first_size = first.str.replace(r'\D', '', regex=True).astype(int)
second_size = second.str.replace(r'\D', '', regex=True).astype(int)

# For every storage type, add up the sizes of the parts that mention it.
storage_keywords = [
    ('HDD', 'HDD'),
    ('SSD', 'SSD'),
    ('Hybrid', 'Hybrid'),
    ('Flash_Storage', 'Flash Storage'),
]
for column, keyword in storage_keywords:
    in_first = first.str.contains(keyword, regex=False).astype(int)
    in_second = second.str.contains(keyword, regex=False).astype(int)
    df[column] = first_size * in_first + second_size * in_second




In [None]:
df.drop(columns=['Memory'],inplace=True)
df.head()

In [None]:
# Correlation of every numeric feature (including the new storage columns)
# with Price. DataFrame.corr raises on text columns in pandas >= 2.0, so
# restrict the computation to numeric columns explicitly.
df.select_dtypes(include='number').corr()['Price']


We see that Price has
* Strong correlation with SSD
* Negative correlation with HDD as in if more HDD, cheaper the laptop
* Hybrid and Flash_Storage don't have much correlation with Price itself so they can actually be removed

In [None]:
df.drop(columns=['Hybrid','Flash_Storage'],inplace=True)


In [None]:
df.head()

We now focus on GPU column and OpSys columns and then move on to build the model.

In [None]:
#GPU
df['Gpu'].value_counts()

We just need brand name of GPU i.e Intel, Nvidia and AMD

In [None]:
df['Gpu_brand']=df['Gpu'].apply(lambda x:x.split()[0])

In [None]:
df['Gpu_brand'].value_counts()

Since there is only one ARM-branded GPU, removing it won't affect our results.


In [None]:

# Drop the ARM GPU rows. The explicit copy makes the result an independent
# frame, so the column assignments and in-place drops performed on `df`
# afterwards do not trigger SettingWithCopyWarning.
df = df[df['Gpu_brand'] != 'ARM'].copy()

In [None]:
df['Gpu_brand'].value_counts()
#To check if ARM has been removed

In [None]:
sns.barplot(x=df['Gpu_brand'],y=df['Price'],estimator=np.median)
plt.xticks(rotation='vertical')

* Prices are in the order Nvidia>Intel>AMD

In [None]:
df.drop(columns=['Gpu'],inplace=True)
df.head()


Finally, we analyze the Operating System.

In [None]:
df['OpSys'].value_counts()

In [None]:
def os(text):
    """Collapse an operating-system label into one of three families.

    Parameters
    ----------
    text : str
        Operating-system label.

    Returns
    -------
    str
        'Windows' for 'Windows 10', 'Windows 7' and 'Windows 10 S';
        'Mac' for 'macOS' and 'Mac OS X';
        'Others/No OS/Linux' for any other value.
    """
    if text in ('Windows 10', 'Windows 7', 'Windows 10 S'):
        return 'Windows'
    if text in ('macOS', 'Mac OS X'):
        return 'Mac'
    return 'Others/No OS/Linux'

In [None]:
df['OS']=df['OpSys'].apply(os)

In [None]:
df.drop(columns=['OpSys'],inplace=True)

In [None]:
df.head()

In [None]:
sns.barplot(x=df['OS'],y=df['Price'],estimator=np.median)
plt.xticks(rotation='vertical')

Mac is most expensive out of the 3 types we have made.

In [None]:
# Distribution of laptop weights. sns.distplot is deprecated and scheduled for
# removal from seaborn; histplot with a KDE overlay on a density scale draws
# the equivalent histogram-plus-curve figure.
sns.histplot(df['Weight'], kde=True, stat='density')

Most laptops are in the 2kgish region.

In [None]:
# Heatmap of pairwise correlations between the numeric features. The frame
# still holds text columns, and DataFrame.corr raises on those in
# pandas >= 2.0, so only numeric columns are passed in.
sns.heatmap(df.select_dtypes(include='number').corr())

In [None]:
# Distribution of the log-transformed price. sns.distplot is deprecated and
# scheduled for removal from seaborn; histplot with a KDE overlay on a density
# scale draws the equivalent histogram-plus-curve figure.
sns.histplot(np.log(df['Price']), kde=True, stat='density')

We take the log of the price so that its distribution becomes less skewed; when showing predictions, we take the exponential to reverse this transformation.

**Splitting Data and Training the model**

In [None]:
X=df.drop(columns=['Price'])
Y=np.log(df['Price'])

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.15,random_state=2)

We have to make all data in numerics, so we do One Hot Encoding and convert categorical data to numeric.

In [None]:
#For making data suitable for training
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error

Importing many algorithms to get best results

In [None]:
#Importing variuous models and comparing their results and picking the most accurate one
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
#step1=ColumnTransformer(transformers=[('col_tnf',OneHotEncoder(sparse=False,drop='first'),[0,1,7,10,11])],remainder='passthrough')

In [None]:
# One-hot encode the categorical columns and pass every other column through
# unchanged. handle_unknown='ignore' encodes a category that was not present
# when the encoder was fitted as all zeros instead of raising at predict time.
pipe1=make_column_transformer((OneHotEncoder(handle_unknown='ignore'),['Company','TypeName','CPU_Brand','Gpu_brand','OS']),remainder='passthrough')

In [None]:
X_train.head()

We tried all of the algorithms, and the best R2 score we got was with the Random Forest regressor:

In [None]:
# Random forest regressor used as the final estimator of the pipeline.
step2=RandomForestRegressor(n_estimators=600,random_state=4,max_samples=0.65,max_features=0.85,max_depth=80)
# The column transformer defined earlier in the notebook is named `pipe1`;
# `step1` only exists in a commented-out line, so referencing it raised
# NameError on a fresh kernel.
pipe=Pipeline([('step1',pipe1),('step2',step2)])
pipe.fit(X_train,Y_train)

# Y holds log(Price), so both metrics below are computed on the log scale.
Y_pred=pipe.predict(X_test)
print('R2 Score is ',r2_score(Y_test,Y_pred))
print('MeanAbsErr is ',mean_absolute_error(Y_test,Y_pred))

> Exporting the Model


In [None]:
import pickle

# Persist the processed frame and the fitted pipeline. The context managers
# make sure each file is flushed and closed once the dump has finished.
with open('df.pkl','wb') as df_file:
    pickle.dump(df,df_file)
with open('pipe.pkl','wb') as pipe_file:
    pickle.dump(pipe,pipe_file)