# Laptop Price Prediction Model

Importing necessary modules and packages.

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('laptop_price.csv', encoding="ISO-8859-1")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

## 1. Data Cleaning & Preprocessing

In [None]:
df = df.rename(columns=str.lower)

In [None]:
df.columns

In [None]:
df = df.rename(columns={'price_euros':'price'})

#### Checking Duplicates

In [None]:
df[df.duplicated()]

### Feature Engineering + Regular Expression

In [None]:
import regex as re

#### Laptop ID

In [None]:
df['laptop_id']

In [None]:
df=df.drop('laptop_id', axis=1)

We dropped the laptop_id column since we don't need it.

### Company

In [None]:
df['company'].value_counts()

In [None]:
# Collapse the rare manufacturers into a single 'Other' bucket.
# A boolean mask with .loc replaces the original row-by-row chained assignment
# (df['company'][count] = ...), which raises SettingWithCopyWarning, silently
# leaves df untouched when pandas Copy-on-Write is active, and only addressed
# the right rows while the index was the default 0..n-1 range.
rare_companies = ['Xiaomi','Vero','Chuwi','Google','Fujitsu','LG','Huawei']
df.loc[df['company'].isin(rare_companies), 'company'] = 'Other'

All companies with fewer than 5 laptops in the dataset are renamed to 'Other'.

In [None]:
df['company'].value_counts()

### Product

In [None]:
df['product'].value_counts()

There are 617 categories. Therefore we can ignore the effect of the product variable on the laptop price.

### Type name

In [None]:
df['typename'].value_counts()

### Inches

In [None]:
df['inches'].value_counts()

The inches variable is numerical, but we treat it as categorical since it has only a few distinct values.

In [None]:
# Bucket the screen diagonal into three size classes.
conditions = [
    (df["inches"].lt(14)),
    (df["inches"].ge(14) & df["inches"].lt(16)),
    (df["inches"].ge(16)),
]
choices = ["< 14", "14-16", "> 16"]

# np.select fills rows that match no condition with `default`, which is the
# integer 0 unless stated. That cannot share a dtype with the string choices
# (TypeError on NumPy 2; a silent '0' label on older versions), so give
# unmatched rows, e.g. missing sizes, an explicit string label.
df["inches"] = np.select(conditions, choices, default="Unknown")

df["inches"].value_counts()

We recode this variable into 3 categories (up to 14 inches, between 14 and 16 inches and over 16 inches)

#### Screen Resolution

In [None]:
df['screenresolution']

The screen resolution column bundles several useful pieces of information that cannot be used directly yet.

So we will separate these 3 pieces of information **(Screen type, Resolution, Touchscreen)** into 3 different columns using regular expressions.

In [None]:
df['resolution'] = df['screenresolution'].str.extract(r'(\d+x\d+)')

In [None]:
df['screentype'] = df['screenresolution'].replace(r'(\d+x\d+)','',regex=True)

In [None]:
df['screentype'] = df['screentype'].replace(r'(Full HD|Quad HD|Quad HD|\+|/|4K Ultra HD)','',regex=True)

In [None]:
df['screentype']

In [None]:
df['touchscreen'] = df['screentype'].str.extract(r'(Touchscreen)')
df['screentype'] = df['screentype'].replace(r'(Touchscreen)','',regex=True)

In [None]:
df['touchscreen'].value_counts()

*Binary (0/1) encoding for the presence or absence of the Touchscreen feature.*

In [None]:
# The extract step above left 'Touchscreen' where the feature is present and NaN
# everywhere else, so a not-null test yields the 0/1 flag directly. This avoids
# Series.replace's implicit object -> numeric downcasting, which pandas deprecates.
df['touchscreen'] = df['touchscreen'].notna().astype(int)

In [None]:
df['touchscreen'].value_counts()

In [None]:
df['screentype'].value_counts()

In [None]:
df['screentype']=df['screentype'].replace(r' ','',regex=True)
df['screentype'].value_counts()

Replacing blank spaces with NaN.

In [None]:
df['screentype'] = df['screentype'].replace(r'^\s*$', np.nan, regex=True)
df['screentype'].value_counts()

In [None]:
df.head()

*Removing the Screen Resolution column as we extracted all the necessary information from it.*

In [None]:
df = df.drop('screenresolution', axis=1)

In [None]:
df.head()

In [None]:
df['resolution'].value_counts()

Some resolution values have a count of less than 10. We can recode them as 'Other'.

In [None]:
# Resolutions that occur rarely are merged into a single 'Other' category.
resolution_values = ["2880x1800", "2736x1824", "2560x1600", "2400x1600", "2304x1440", "2256x1504", "2160x1440",
                  "1920x1200", "1440x900"]
# Mask-based assignment instead of the per-row chained assignment
# (df['resolution'][count] = ...), which warns, is a no-op under pandas
# Copy-on-Write and depended on a default 0..n-1 index.
df.loc[df['resolution'].isin(resolution_values), 'resolution'] = 'Other'

In [None]:
df['resolution'].value_counts()

In [None]:
df['screentype'].value_counts()

In [None]:
df['touchscreen'].value_counts()

In [None]:
df['cpu']

*We are separating CPU info with CPU frequency using regex again.*

In [None]:
df['cpu_freq'] = df['cpu'].str.extract(r'(\d+(?:\.\d+)?GHz)') 

In [None]:
df['cpu_freq'].value_counts()

In [None]:
df['cpu_freq'] = df['cpu_freq'].str.replace('GHz', '')
df.rename(columns={'cpu_freq': 'cpu_freq(GHz)'}, inplace=True)

In [None]:
df['cpu_freq(GHz)'] = df['cpu_freq(GHz)'].astype(float)

In [None]:
df['cpu']= df['cpu'].str.replace(r'(\d+(?:\.\d+)?GHz)', '', regex=True)

In [None]:
df['cpu'].value_counts()

In [None]:
df.head()

*We removed "GB" from each row in RAM column and added it into the column name and turned the column into Integer. This way we can perform numeric calculations on the row.*

In [None]:
df['ram'] = df['ram'].str.replace('GB', '')

In [None]:
df.rename(columns={'ram': 'ram(GB)'}, inplace=True)

In [None]:
df['ram(GB)'] = df['ram(GB)'].astype(int)

In [None]:
df['ram(GB)'].value_counts()

In [None]:
df.head()

In [None]:
df['memory']

In [None]:
df['memory'].value_counts()

We can see that some laptops have 2 drives, and there are 4 different types of disks: 'SSD', 'HDD', 'Hybrid' and 'Flash Storage'. We will create new columns to include all this information.

In [None]:
df['memory_1']=df['memory']

Converting all sizes into GB for uniformity.

In [None]:
# Express every capacity as a bare number of GB.
# All replacements are literal (regex=False): in the original regex pattern
# '1.0TB' the unescaped '.' matched any character, and the remaining calls relied
# on the pandas-version-dependent default of the `regex` argument.
df['memory_1'] = (
    df['memory_1']
    .str.replace('1.0TB', '1TB', regex=False)
    .str.replace('1TB', '1000GB', regex=False)
    .str.replace('2TB', '2000GB', regex=False)
    .str.replace('GB', '', regex=False)
)
df['memory_1'].value_counts()

In [None]:
df['memory2']=df['memory_1'].str.replace(r' ','')
df['memory2'].value_counts()

In [None]:
# Split every storage description into its first and (optional) second drive.
# Entries without exactly one '+' are treated as single-drive and get the
# string placeholder 'NaN' for the second drive.
memory_1 = []
memory_2 = []
for entry in df['memory2']:
    drives = re.findall(r'(\w+)', entry)
    has_second_drive = len(re.findall(r'\+', entry)) == 1
    memory_1.append(drives[0])
    memory_2.append(drives[1] if has_second_drive else 'NaN')

Now we have separated the drives into two lists. We will now create two more lists that will contain the memory types.

In [None]:
# For each first-drive token take the first run of digits as the capacity and the
# first match of a non-digit followed by word characters as the drive type.
memory_1_type = [re.findall(r'(\D\w+)', drive)[0] for drive in memory_1]
memory_1_gb = [re.findall(r'(\d+)', drive)[0] for drive in memory_1]

We successfully created two lists for memory_1, holding the capacity and the type of each first drive.

In [None]:
# Same capacity/type split for the second drive; laptops without one keep the
# 'NaN' placeholder as type and get a capacity of 0.
memory_2_gb = []
memory_2_type = []
for drive in memory_2:
    if drive == 'NaN':
        memory_2_type.append('NaN')
        memory_2_gb.append(0)
        continue
    memory_2_type.append(re.findall(r'(\D\w+)', drive)[0])
    memory_2_gb.append(re.findall(r'(\d+)', drive)[0])

We created the same two lists that will contain type and capacity also for the second drive

In [None]:
df['memory_1_sto(GB)'] = memory_1_gb
df['memory_1_type'] = memory_1_type
df['memory_2_sto(GB)'] = memory_2_gb
df['memory_2_type'] = memory_2_type

Then we convert the capacity related columns to floats.

In [None]:
df['memory_1_sto(GB)'] = df['memory_1_sto(GB)'].astype(float)
df['memory_2_sto(GB)'] = df['memory_2_sto(GB)'].astype(float)

In [None]:
df.head()

In [None]:
df=df.drop(['memory_1','memory2','memory'], axis=1)

In [None]:
df = df.replace({'NaN': np.nan})

In [None]:
df.head()

In [None]:
df['weight'].value_counts()

Checking if all rows have a weight specified in kg.

In [None]:
df['weight'].str.contains('kg').sum()

All rows returned true. That means all rows are in kg.

In [None]:
df['weight'] = df['weight'].str.replace('kg','').astype(float)

In [None]:
df.rename(columns={'weight': 'weight(kg)'}, inplace=True)

In [None]:
df.head()

In [None]:
df['cpu_brand'] = df['cpu'].str.extract(r'^(\w+)')
df['cpu_brand'].value_counts()

In [None]:
df[df['cpu_brand']=='Samsung']

Since this is the only entry containing a Samsung CPU and ARM GPU, we are dropping this row as to simplify the comparison among others.

In [None]:
# Remove the Samsung-CPU laptop(s) by condition rather than by the hard-coded
# index label 1191, so the cell keeps working if the row order or index changes
# and can be re-run without a KeyError.
df = df.drop(index=df.index[df['cpu_brand'] == 'Samsung'])

In [None]:
df['gpu_brand'] = df['gpu'].str.extract(r'^(\w+)')
df['gpu_brand'].value_counts()

In [None]:
df[df['gpu_brand']=='ARM']

In [None]:
df.head()

In [None]:
## my work
df["opsys"].value_counts()

In [None]:
df.info()

In [None]:
df.to_csv('laptop-clean.csv', index=False)
print('Cleaned dataset exported to CSV.')

## 2. Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
df1=pd.read_csv('laptop-clean.csv')
df1.head()

We will further clean the data frame to remove outliers.

In [None]:
df_clean=df1.copy()

In [None]:
df1.info()

### Most frequent brands in the dataframe

In [None]:
fig, ax  = plt.subplots(figsize=(10,5))
ax=sns.countplot(x='company', data=df1, palette='mako_r', order = df1['company'].value_counts().index)
ax.set_xticklabels(ax.get_xticklabels(), rotation=80)
ax.bar_label(ax.containers[0])
plt.title('Number of laptops by brands')
plt.show()

In [None]:
# Brands ordered by number of laptops, taken from the same frame that is plotted
# (the original mixed `df` for the order with `df1` for the data).
company_list = df1['company'].value_counts().index.tolist()

plt.figure(figsize=(9,5))
ax=sns.barplot(x='company', y='price', data=df1,
                order=company_list,
                palette='Spectral',
                errorbar=None,  # plain means; ('ci', False) bootstrapped a zero-width interval
                edgecolor="black")
plt.xticks(rotation=80)
ax.bar_label(ax.containers[0], fmt='%.0f')  # whole-number labels stay readable on narrow bars
plt.title('Average price of laptops by company')
plt.show()

In [None]:
# NOTE(review): this cell repeats the 'Number of laptops by brands' count plot drawn
# earlier in this section, reading from `df` (the in-memory cleaned frame) instead
# of `df1` (the frame re-read from 'laptop-clean.csv') — confirm whether both are needed.
fig, ax  = plt.subplots(figsize=(10,5))
# Bars are ordered from the most to the least frequent brand.
ax=sns.countplot(x='company', data=df, palette='mako_r', order = df['company'].value_counts().index)
ax.set_xticklabels(ax.get_xticklabels(), rotation=80)
# Write the count on top of each bar.
ax.bar_label(ax.containers[0])
plt.title('Number of laptops by brands')
plt.show()

### Most frequent laptop types in the dataframe

In [None]:
fig, ax  = plt.subplots(figsize=(6,5))
ax=sns.countplot(x='typename', data=df,palette='mako_r', order = df['typename'].value_counts().index)
ax.set_xticklabels(ax.get_xticklabels(), rotation=80);
ax.bar_label(ax.containers[0])
plt.title('Laptop types')
plt.show()

In [None]:
# Laptop types ordered from most to least frequent.
typename_list = df['typename'].value_counts().index.tolist()

plt.figure(figsize=(9,5))
# The list holds every type present in df, so the original isin() filter on the
# data selected all rows and is dropped.
ax=sns.barplot(x='typename', y='price', data=df,
                order=typename_list,
                palette='Spectral',
                errorbar=None,  # plain means; ('ci', False) bootstrapped a zero-width interval
                edgecolor="black")
plt.xticks(rotation=80)
ax.bar_label(ax.containers[0], fmt='%.0f')
plt.title('Average price of laptops by typename')
plt.show()

In [None]:
# Share of rows whose type is 'Notebook', formatted with two decimals.
notebook_rows = (df['typename'] == 'Notebook').sum()
notebook_percentage = '{:.2f}'.format(notebook_rows*100/len(df))
print('Most laptops are notebooks, which make ' +notebook_percentage+ '% of the total laptops.')

### Most popular screen size

In [None]:
fig, ax  = plt.subplots(figsize=(10,5))
ax=sns.countplot(x='inches', data=df, palette='viridis_r')
ax.set_xticklabels(ax.get_xticklabels(), rotation=80);
ax.bar_label(ax.containers[0])
plt.title('Laptop screen size (inches)')
plt.show()

In [None]:
plt.figure(figsize=(9,5))
# `choices` is the list of size-class labels defined in the cleaning section;
# it fixes the bar order from smallest to largest screen.
ax=sns.barplot(x='inches', y='price', data=df,
                order=choices,
                palette='Spectral',
                errorbar=None,  # plain means; ('ci', False) bootstrapped a zero-width interval
                edgecolor="black")
plt.xticks(rotation=80)
ax.bar_label(ax.containers[0], fmt='%.0f')
plt.title('Average price of laptops by inches')
plt.show()

### Screen resolution

In [None]:
fig, ax  = plt.subplots(figsize=(6,5))
ax=sns.countplot(x='resolution', data=df1, palette='viridis_r')
ax.set_xticklabels(ax.get_xticklabels(), rotation=80);
ax.bar_label(ax.containers[0])
plt.title('Laptop screen resolution')
plt.show()

In [None]:
resolution_list = df['resolution'].value_counts().index[:].tolist()
resolution_list

In [None]:
# Preferred bar order, from the highest to the lowest resolution.
preferred_order = ["3840x2160", "3200x1800", "2880x1800", "2736x1824", "2560x1600", "2560x1440", "2400x1600", "2304x1440", "2256x1504", "2160x1440",
                  "1920x1200", "1920x1080","1600x900", "1440x900", "1366x768"]
# Rare resolutions were recoded to 'Other' during cleaning, so plotting the raw
# list would leave empty slots and drop the 'Other' bar. Keep only labels that
# exist in df1 and append any remaining ones (such as 'Other') at the end.
present = df1['resolution'].dropna().unique().tolist()
order = [res for res in preferred_order if res in present]
order += [res for res in present if res not in preferred_order]

plt.figure(figsize=(9,5))
ax=sns.barplot(x='resolution', y='price', data=df1,
                order = order,
                palette='Spectral',
                errorbar=None,  # plain means; ('ci', False) bootstrapped a zero-width interval
                edgecolor="black")
plt.xticks(rotation=80)
ax.bar_label(ax.containers[0], fmt='%.0f')
plt.title('Average price of laptops by resolution')
plt.show()

### Weight distribution among laptops

In [None]:
# Writing a custom function to improve plot readability
def num_plot(df, col, title, symb):
    """Show a box plot stacked above a histogram of ``df[col]``.

    Vertical lines mark the mean, the median and the first mode of the column;
    their values appear in the legend followed by the unit suffix ``symb``.

    Parameters
    ----------
    df : DataFrame containing the column to plot.
    col : name of the column to plot (also used as the x-axis label).
    title : text shown above the box plot.
    symb : unit string appended to each legend value (e.g. 'kg').
    """
    fig, (ax_box, ax_hist) = plt.subplots(
        2, 1, sharex=True, figsize=(8,5), gridspec_kw={"height_ratios": (.2, .8)}
    )

    # Narrow top panel: box plot without y ticks.
    ax_box.set_title(title,fontsize=18)
    sns.boxplot(x=col, data=df, ax=ax_box)
    ax_box.set(yticks=[])

    # Tall bottom panel: histogram sharing the x axis with the box plot.
    sns.histplot(x=col, data=df, ax=ax_hist)
    ax_hist.set_xlabel(col, fontsize=16)

    values = df[col]
    mean_value = values.mean()
    median_value = values.median()
    mode_value = values.mode()[0]  # first mode when several values tie
    reference_lines = (
        (mean_value, 'darkgreen', 'mean=' + str(np.round(mean_value,1)) + symb),
        (median_value, 'red', 'median='+ str(np.round(median_value,1)) + symb),
        (mode_value, 'purple', 'mode='+ str(mode_value) + symb),
    )
    # Drawn through pyplot, i.e. on whichever axes is current.
    for position, colour, text in reference_lines:
        plt.axvline(position, color=colour, linewidth=2.2, label=text)

    plt.legend(bbox_to_anchor=(1, 1.03), ncol=1, fontsize=17, fancybox=True, shadow=True, frameon=True)
    plt.tight_layout()
    plt.show()

In [None]:
num_plot(df_clean, 'weight(kg)', 'Weight Distribution','kg')

In [None]:
plt.scatter(df['weight(kg)'],df['price'])
plt.title('Laptop price by weight')
plt.xlabel('weight(kg)')
plt.ylabel('price')

### Price distribution among the laptops

In [None]:
# The price column was renamed from 'price_euros', so label the statistics in euros.
num_plot(df_clean, 'price', 'Price Distribution','€')

There seems to be some laptops with a price over 3000 euros.

In [None]:
fig, ax  = plt.subplots(figsize=(5,3))
ax=sns.boxplot(x='price', data=df)


In [None]:
sns.boxplot(x='typename', y='price', data=df[df['price']>3000], hue='cpu_brand')

In [None]:
df[df['price']>4500]

In [None]:
df['gpu'].value_counts()

### RAM distribution among laptops

In [None]:
# One array of prices per RAM size, in the order groupby yields the groups.
grouped_data = df.groupby('ram(GB)')['price']
category_data = [prices.values for _, prices in grouped_data]

# One box per RAM size, labelled with the group keys.
plt.boxplot(category_data)
plt.xticks(range(1, len(category_data) + 1), grouped_data.groups.keys())
plt.ylabel('price')
plt.title('Box Plot of Price by ram')
plt.show()


In [None]:
num_plot(df_clean, 'ram(GB)','RAM distribution','GB')

### Opsys

In [None]:
df['opsys'].value_counts()

In [None]:
# One array of prices per operating system, in the order groupby yields the groups.
grouped_data = df.groupby('opsys')['price']
category_data = [prices.values for _, prices in grouped_data]

# One box per operating system; labels are tilted so they do not overlap.
plt.boxplot(category_data)
plt.xticks(range(1, len(category_data) + 1), grouped_data.groups.keys(), rotation=45)
plt.ylabel('price')
plt.title('Box Plot of Price by opsys')
plt.show()


In [None]:
fig, ax  = plt.subplots(figsize=(6,5))
ax=sns.countplot(x='opsys', data=df1, palette='viridis_r')
ax.set_xticklabels(ax.get_xticklabels(), rotation=80);
ax.bar_label(ax.containers[0])
plt.title('Bar plot for opsys')
plt.show()

### CPU Frequency distribution among laptops

In [None]:
df_clean['cpu_freq(GHz)'].value_counts()

In [None]:
num_plot(df_clean, 'cpu_freq(GHz)','CPU freq distribution','GHz')

### Hard Drive capacity distribution among laptops

In [None]:
fig, ax  = plt.subplots(figsize=(8,5))
ax=sns.countplot(x='memory_1_sto(GB)', data=df, hue='memory_1_type')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_xlabel('First Hard Drive memory storage (GB)')
plt.title('First Hard Drive Capacity Distribution')
plt.legend(loc='upper left', ncol=1, fontsize=15, fancybox=True, shadow=True, frameon=True)
plt.show()

Most PCs have 256 GB of storage, which is for the most part SSD. Moreover, for storage of 1 TB or higher, most of them are HDD.

#### Second Hard Drive capacity distribution among laptops

In [None]:
fig, ax  = plt.subplots(figsize=(8,5))
ax=sns.countplot(x='memory_2_sto(GB)', data=df, hue='memory_2_type')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_xlabel('Second Hard Drive memory storage (GB)')
plt.title('Second Hard Drive Capacity Distribution')
plt.legend(loc='upper left', ncol=1, fontsize=15, fancybox=True, shadow=True, frameon=True)
plt.show()

Most of second hard drive storages are 1 TB HDD disks



## 3. Outliers detection

In [None]:
# Writing a custom plot function for linear regression line.
def lr_plot(df, col_x, col_y, leg):
    """Plot ``col_y`` against ``col_x`` with a regression line and return the fit.

    The line's legend entry shows the equation from ``scipy.stats.linregress``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    col_x : name of the predictor column (x axis).
    col_y : name of the response column (y axis).
    leg : legend location string passed to ``plt.legend``.

    Returns
    -------
    (slope, intercept) of the least-squares line.
    """
    # Only the first two fields of the fit result are needed here.
    slope, intercept = stats.linregress(df[col_x], df[col_y])[:2]
    equation = "y={0:.1f}x+{1:.1f}".format(slope,intercept)

    sns.regplot(x=col_x, y = col_y, data=df, color='#0d98ba', line_kws={'label': equation})
    plt.legend(loc=leg, ncol=1, fontsize=15, fancybox=True, shadow=True, frameon=True)
    plt.title(col_y + ' VS ' + col_x)
    plt.show()

    return slope, intercept

### Price vs RAM

In [None]:
slope, intercept = lr_plot(df_clean,'ram(GB)','price', 'lower right')

We can see an outlier value.

In [None]:
df_clean[df_clean['ram(GB)']>60]

In [None]:
df_clean = df_clean[df_clean['ram(GB)']<60]

In [None]:
slope, intercept = lr_plot(df_clean,'ram(GB)','price', 'upper left')

### Price vs CPU Frequency

In [None]:
slope, intercept = lr_plot(df_clean,'cpu_freq(GHz)','price', 'upper left')

We can see a positive trend between price and the CPU frequency, with no obvious outliers.

### Price vs CPU Brand (Grouped by GPU Brand)

In [None]:
# Colours are keyed by GPU brand, because gpu_brand is the hue variable below.
cpu_palette = {'Intel':'#0d98ba', 'AMD':'#FF0000', 'Nvidia':'#46C646'}
fig, ax = plt.subplots(figsize=(6,5))
# Draw on the axes created above instead of relying on pyplot's current axes.
sns.boxplot(x='cpu_brand', y='price', data=df, hue='gpu_brand', palette=cpu_palette, ax=ax)
# The title was previously set twice with different capitalisation; the second
# call won, so only that text is kept.
ax.set_title('Price VS CPU brand by GPU brand')
# Prices come from the 'price_euros' column, so the unit is euros, not dollars.
ax.set_ylabel('price (€)')
ax.legend(loc='upper right', ncol=1, fontsize=15, fancybox=True, shadow=True, frameon=True)
plt.show()

- Laptops with Intel CPUs are more expensive.
- Laptops with AMD CPUs also have AMD GPUs.
- Laptops with Nvidia GPUs are more expensive.

### Most common GPUs

In [None]:
gpu_list = df_clean['gpu'].value_counts()[:15].index.tolist()

In [None]:
plt.figure(figsize=(8,6))
ax=sns.countplot(x='gpu', data=df_clean[df_clean['gpu'].isin(gpu_list)], order = gpu_list, palette='viridis_r')
plt.xticks(rotation=80);
ax.bar_label(ax.containers[0])
plt.title('TOP 15 common GPUs')
plt.xlabel('')
plt.show()

The 2 most common GPUs are integrated Intel GPUs HD Graphics 620 and 520, while the third one is the Nvidia GTX1050.

### Most common CPUS

In [None]:
cpu_list = df_clean['cpu'].value_counts()[:15].index.tolist()

In [None]:
plt.figure(figsize=(8,6))
ax=sns.countplot(x='cpu', data=df_clean[df_clean['cpu'].isin(cpu_list)], order = cpu_list, palette='viridis')
plt.xticks(rotation=80);
ax.bar_label(ax.containers[0])
plt.title('TOP 15 common CPUs')
plt.xlabel('')
plt.show()

All the TOP 15 most common CPUs are from Intel.

## Correlation Matrix

In [None]:
plt.figure(figsize=(6, 5))
sns.heatmap(df_clean.corr(numeric_only=True), cmap='RdBu', annot=True, vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()


- RAM has a high positive correlation with price (+0.75): laptops with more RAM tend to have a higher price.
- CPU Freq has a quite moderate positive correlation with the price (+0.43).
- Inches and Weight have a high positive correlation (+0.82) since laptops with bigger screens tend to be heavier.

## 4. Data Preparation for Prediction Model

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [None]:
# The price column was renamed from 'price_euros', so label the statistics in euros.
num_plot(df_clean, 'price', 'Price Distribution','€')

In [None]:
df1=df_clean.copy()

Moreover, since the target variable 'price' is right skewed, we will log transform it to improve the prediction performance of the ML algorithm.

In [None]:
df1['price']=np.log(df1['price'])

In [None]:
sns.displot(df1['price'])

In [None]:
df1=df1.fillna('NaN')

### Feature Encoding

Defining the categorical columns.

In [None]:
catCols =  ['company','product','typename','cpu','gpu','opsys','resolution',
                    'screentype','memory_1_type','memory_2_type',
                    'gpu_brand','cpu_brand','inches']

In [None]:
print('For One Hot Encoding')
print('Dataframe Encoded Dimension: '
      , pd.get_dummies(df1, columns=catCols, drop_first=True).shape)

In [None]:
# Replace every categorical column by integer codes. The same encoder object is
# refitted for each column, so the per-column mappings are not retained.
en = LabelEncoder()
for column_name in catCols:
    encoded_values = en.fit_transform(df1[column_name])
    df1[column_name] = encoded_values

print('For Label Encoding')
print('Dataframe Encoded Dimension: ', df1.shape)

In [None]:
df1.head()

In [None]:
## my work
df1["company"].value_counts()

### Variable Preparation

In [None]:
X=df1.drop(['price','product'], axis = 1).values

In [None]:
y=df1['price'].values

### Train - Test split

In [None]:
seed = 100

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = seed)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Seed the forest with the same `seed` used for the train/test split so that the
# fitted model, its metrics and the feature importances are reproducible.
rf = RandomForestRegressor(n_estimators=100, max_depth=100, max_features=15, random_state=seed)

In [None]:
rf.fit(X_train,y_train)

In [None]:
y_pred_rf = rf.predict(X_test)

In [None]:
df1.head(2)

In [None]:
# Ad-hoc prediction for a single hand-written row of 18 feature values.
# NOTE(review): the values must follow the column order of X (df1 without 'price'
# and 'product') and use the label-encoded codes — confirm each entry.
a=rf.predict([[2,5,12,50,4,50,4,1.22,5,2,0,1.5,128,2,0,0,1,1]])
a

In [None]:
import math
# exp() undoes the log transform applied to the target; the result is then scaled by 334.
# NOTE(review): 334 looks like a currency conversion rate, but nothing in the
# notebook states its meaning — confirm and name it as a constant.
math.exp(a[0])*334

### Model Evaluation & Metrics

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# RMSE on the original price scale: exponentiate to undo the log transform of the target.
mse_rf = mean_squared_error(np.exp(y_test), np.exp(y_pred_rf))
# The original closed np.round() too early, so the 4 was passed to format() as an
# unused argument and the value was rounded to 0 decimals. Prices are in euros.
print("RMSE using RF: {} € ".format(np.round(np.sqrt(mse_rf), 4)))

In [None]:
print("R2 using Random Forest: {:.2f} %".format(np.round(r2_score(y_test, y_pred_rf),4)*100))

In [None]:
# Metrics on the log-price scale (y_test and the predictions are log prices).
mse = mean_squared_error(y_test,y_pred_rf)
mae = mean_absolute_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

print("Random Forest Regressor")
print("Evaluation Metrics:")
for template, value in (
    ("Mean Squared Error (MSE): {:.4f}", mse),
    ("Mean Absolute Error (MAE): {:.4f}", mae),
    ("R-squared (R2) Score: {:.4f}", r2),
):
    print(template.format(value))


### Feature Importance for RandomForest

As we used random forest regressor, we can plot the feature importance determined by the algorithm to perform the regression to better understand what contributes the most to a laptop price.

In [None]:
feature_name_list=df1.drop(['price','product'], axis = 1).columns

In [None]:
rf.feature_names = feature_name_list

In [None]:
rf_feat_df= pd.DataFrame({'feature': rf.feature_names,'importance':rf.feature_importances_})

In [None]:
rf_sorted_df=rf_feat_df.sort_values('importance', ascending=False)
rf_sorted_df

In [None]:
# scikit-learn reports importances as fractions, while the axis is labelled in
# percent: scale a copy of the table so the values match the label.
importance_pct = rf_sorted_df.assign(importance=rf_sorted_df['importance'] * 100)

plt.figure(figsize=(9,5))
sns.barplot(x='importance', y='feature', data=importance_pct)
plt.title('Feature Importance By Random Forest')
plt.xlabel('Feature Importance (%)')
plt.ylabel('')
plt.show()

## 5. Alternative Models & Performance Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [None]:
# Candidate regressors. The stochastic ones share the notebook-wide `seed` so
# their scores are reproducible between runs.
regression_models = {
    "lr": LinearRegression(),
    "dt": DecisionTreeRegressor(random_state=seed),
    "gb": GradientBoostingRegressor(random_state=seed),
    "xgb": XGBRegressor(random_state=seed)
}

# model key -> (class name, predictions on the test set)
models = {}
for model_name, model in regression_models.items():
    model.fit(X_train, y_train)
    models[model_name] = (model.__class__.__name__, model.predict(X_test))

# Predictions reused by the model-comparison plot.
y_pred_gb = models['gb'][1]
y_pred_xgb = models['xgb'][1]

# Metrics are computed on the log-price scale the models were trained on.
for model_name, (model_fullname, predictions) in models.items():
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"{model_fullname}")
    print("Evaluation Metrics:")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R2) Score: {r2:.4f}")
    print('------------------------------------------------------------------')
    print()


## 6. Model Comparison

In [None]:
plt.figure(figsize = (5,5))
# One scatter layer per model: actual vs. predicted price, back on the original
# scale (exp undoes the log transform); R2 is computed on the log scale.
for short_name, colour, predictions in (
    ('RF', 'red', y_pred_rf),
    ('GB', 'green', y_pred_gb),
    ('XGB', 'blue', y_pred_xgb),
):
    plt.scatter(np.exp(y_test), np.exp(predictions), alpha=0.5, color=colour,
                label='{}, R2 {:.2f} %'.format(short_name, r2_score(y_test, predictions)*100))
# Dashed diagonal = perfect prediction.
plt.plot([0, 7000], [0, 7000], linestyle='--')
plt.axis([0, 7000, 0, 7000])
plt.xlabel('Actual price (€)')
plt.ylabel('Predicted price (€)')
# Prices are in euros (the column was 'price_euros'), not dollars.
plt.title('RF vs GB vs XGBoost Price Regression(€)')
plt.legend(loc='upper left', ncol=1, fontsize=13, fancybox=True, shadow=True, frameon=True)
plt.show()