**Laptop Price Prediction:**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Location of the laptop dataset. The default is the Colab path this notebook
# was written against; change this one constant when running elsewhere.
DATA_PATH = "/content/sample_data/laptops.csv"
df = pd.read_csv(DATA_PATH)

# **1. Data Cleaning and Preprocessing:**
1.1 Missing Values:

- Check for missing values in each column.
- Decide on a strategy for handling missing data (e.g., imputation or removal)

In [None]:
# Preview the first ten rows to see the raw format of every column.
df.head(10)

Unnamed: 0,CompanyName,TypeOfLaptop,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,MSI,Business Laptop,17.04068,IPS Panel Retina Display 2560x1600,Intel Core i7,12GB,512GB SSD,Intel Iris Xe Graphics,Linux,2.064834,35844.099371
1,Chuwi,2 in 1 Convertible,16.542395,Full HD,Intel Core i5,12GB,128GB PCIe SSD,Intel Iris Xe Graphics,No OS,4.060656,37019.059051
2,hp,WorkStation,17.295294,Full HD,Intel Xeon E3-1505M,8GB,1TB HDD,Intel Iris Xe Graphics,Linux,2.901689,33329.360341
3,MSI,2 in 1 Convertible,11.526203,2K,Intel Core i7,16GB,512GB NVMe SSD,Intel Iris Xe Graphics,Windows 10,2.914843,68631.102486
4,Microsoft,Gaming,12.649634,Full HD,Intel Core i5,8GB,512GB SSD,AMD Radeon RX 5600M,Windows 10,4.341995,33842.479566
5,Apple,WorkStation,15.543249,HD 1920x1080,Intel Atom x5-Z8550,16GB,1TB NVMe SSD,NVIDIA GeForce GTX 1650,macOS,3.580368,83937.484249
6,lenevo,NoteBook,15.559451,2K,Intel Xeon E3-1505M,8GB,512GB SSD,Intel Iris Xe Graphics,Windows 11,4.050894,62896.017385
7,Asus,UltraBook,17.806917,IPS Panel Full HD / Touchscreen 1920x1080,Intel Celeron Dual Core 3855U,12GB,256GB PCIe SSD,NVIDIA GeForce GTX 1650,macOS,4.402627,35919.072831
8,Microsoft,Business Laptop,12.846039,IPS Panel Retina Display 2560x1600,Intel Core i9,12GB,128GB SSD,NVIDIA GeForce GTX 1650,Linux,3.700557,45011.851908
9,Microsoft,UltraBook,15.20418,Full HD,AMD A9-Series 9420,8GB,128GB PCIe SSD,NVIDIA GeForce GTX 1650,No OS,3.656769,52669.31083


In [None]:
# Distinct manufacturer labels.
# NOTE(review): the output shows inconsistent casing/spelling ('hp', 'lenevo');
# consider normalising these before modelling.
df['CompanyName'].unique()

array(['MSI', 'Chuwi', 'hp', 'Microsoft', 'Apple', 'lenevo', 'Asus',
       'Acer', 'Dell'], dtype=object)

In [None]:
# Distinct laptop categories.
df['TypeOfLaptop'].unique()


array(['Business Laptop', '2 in 1 Convertible', 'WorkStation', 'Gaming',
       'NoteBook', 'UltraBook'], dtype=object)

In [None]:
# Distinct screen-resolution labels.
# NOTE(review): at least one label carries trailing whitespace ('HD 1920x1080 ').
df['ScreenResolution'].unique()


array(['IPS Panel Retina Display 2560x1600', 'Full HD', '2K',
       'HD 1920x1080 ', 'IPS Panel Full HD / Touchscreen 1920x1080', '4K'],
      dtype=object)

In [None]:
# Distinct CPU labels.
# NOTE(review): some labels carry trailing whitespace (e.g. 'Intel Xeon E3-1505M ').
df['Cpu'].unique()


array(['Intel Core i7', 'Intel Core i5', 'Intel Xeon E3-1505M ',
       'Intel Atom x5-Z8550', 'Intel Celeron Dual Core 3855U ',
       'Intel Core i9', 'AMD A9-Series 9420', 'AMD Ryzen 5',
       'AMD Ryzen 7', 'Intel Pentium Quad Core N4200'], dtype=object)

In [None]:
# Distinct RAM sizes. They are stored as text with a 'GB' suffix (e.g. '8GB'),
# so this column is not numeric yet.
df['Ram'].unique()


array(['12GB', '8GB', '16GB', '4GB'], dtype=object)

In [None]:
# Distinct storage configurations. Each value combines a capacity and a drive
# type in a single string (e.g. '512GB SSD', '1TB HDD').
df['Memory'].unique()


array(['512GB SSD', '128GB PCIe SSD', '1TB HDD', '512GB NVMe SSD',
       '1TB NVMe SSD', '256GB PCIe SSD', '128GB SSD', '1TB Fusion Drive',
       '4TB HDD', '2TB NVMe SSD', '256GB Flash Storage', '6TB HDD',
       '512GB eMMC', '256GB eMMC', '2TB SATA SSD', '1TB SSHD',
       '256GB SSD', '2TB HDD'], dtype=object)

In [None]:
# Distinct GPU labels.
df['Gpu'].unique()


array(['Intel Iris Xe Graphics', 'AMD Radeon RX 5600M',
       'NVIDIA GeForce GTX 1650'], dtype=object)

In [None]:
# Distinct operating-system labels.
df['OpSys'].unique()

array(['Linux', 'No OS', 'Windows 10', 'macOS', 'Windows 11'],
      dtype=object)

In [None]:
# Column dtypes and non-null counts: a quick check for missing data and for
# columns that are still stored as text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CompanyName       1000 non-null   object 
 1   TypeOfLaptop      1000 non-null   object 
 2   Inches            1000 non-null   float64
 3   ScreenResolution  1000 non-null   object 
 4   Cpu               1000 non-null   object 
 5   Ram               1000 non-null   object 
 6   Memory            1000 non-null   object 
 7   Gpu               1000 non-null   object 
 8   OpSys             1000 non-null   object 
 9   Weight            1000 non-null   float64
 10  Price             1000 non-null   float64
dtypes: float64(3), object(8)
memory usage: 86.1+ KB


In [None]:
# (number of rows, number of columns)
df.shape

(1000, 11)

In [None]:
# Number of missing values in each column.
df.isna().sum()

CompanyName         0
TypeOfLaptop        0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64

# **2. Exploratory Data Analysis (EDA):**
2.1 Descriptive Statistics:

- Calculate basic statistics (mean, median, standard deviation) for numerical columns.
- Explore the distribution of categorical variables.

In [None]:
# Summary statistics for the numeric columns (Inches, Weight, Price).
df.describe()

Unnamed: 0,Inches,Weight,Price
count,1000.0,1000.0,1000.0
mean,14.496646,3.4698,51602.255339
std,2.066624,0.857112,13802.833231
min,11.005842,2.000819,30060.2751
25%,12.677791,2.720228,40376.61767
50%,14.509298,3.477824,50683.971717
75%,16.313026,4.189891,61897.280126
max,17.998786,4.994556,115137.368077


In [None]:

def plot_category_counts(data, column, title, axis_label):
    """Build a bar chart of how many laptops fall into each value of ``column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to count.
    title : str
        Chart title.
    axis_label : str
        Human-readable x-axis label shown instead of the raw column name.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar per distinct value, with x tick labels rotated by 45 degrees.
    """
    counts = data[column].value_counts().reset_index()
    # Name the two columns explicitly instead of relying on reset_index() defaults.
    counts.columns = [column, 'Count']

    fig = px.bar(
        counts,
        x=column,
        y='Count',
        title=title,
        labels={'Count': 'Number of Laptops', column: axis_label},
    )
    fig.update_layout(xaxis=dict(tickangle=45))
    return fig


# (column, chart title, x-axis label) for each categorical feature, in display order.
CATEGORY_CHARTS = [
    ('TypeOfLaptop', 'Distribution of Laptop Types', 'Type of Laptop'),
    ('ScreenResolution', 'Distribution of Screen Resolutions', 'Screen Resolution'),
    ('Cpu', 'Distribution of CPU Types', 'CPU Type'),
    ('Gpu', 'Distribution of GPU Types', 'GPU Type'),
    ('OpSys', 'Distribution of Operating Systems', 'Operating System'),
]

for column, title, axis_label in CATEGORY_CHARTS:
    fig = plot_category_counts(df, column, title, axis_label)
    fig.show()

# **2.2 Data Visualization:**

- Create visualizations to understand the distribution of laptops across different categories (e.g., company, type, operating system).
- Plot correlations between numerical variables.
- Visualize the distribution of price.

In [None]:
# Company Distribution
company_distribution = df['CompanyName'].value_counts().reset_index()
company_distribution.columns = ['CompanyName', 'Count']

fig = px.bar(company_distribution, x='CompanyName', y='Count', title='Distribution of Laptops by Company', labels={'Count': 'Number of Laptops', 'CompanyName': 'Company'})
fig.update_layout(xaxis=dict(tickangle=45))
fig.show()


# Correlations Between Numerical Variables
# 'Ram' is stored as text such as '8GB', so plotting it directly would treat it
# as a category. Extract the leading digits and plot them as a real number.
# The conversion is applied to a copy, so `df` itself is left unchanged.
# This cell is meant to run before the label-encoding cell: after encoding, the
# column holds category codes rather than gigabytes.
ram_numeric = pd.to_numeric(df['Ram'].astype(str).str.extract(r'(\d+)', expand=False))
numeric_view = df.assign(Ram=ram_numeric)

# Scatter Plot Matrix
fig = px.scatter_matrix(numeric_view, dimensions=['Inches', 'Ram', 'Weight', 'Price'], title='Scatter Plot Matrix of Numerical Variables')
fig.show()

# Visualize the Distribution of Prices

# Price Distribution
fig = px.histogram(df, x='Price', title='Distribution of Laptop Prices', labels={'Price': 'Laptop Price'})
fig.show()

# Scatter Plot: Weight vs. Price
fig = px.scatter(df, x='Weight', y='Price', title='Scatter Plot: Weight vs. Price', labels={'Weight': 'Laptop Weight', 'Price': 'Laptop Price'})
fig.show()

# Scatter Plot: Inches vs. Price
fig = px.scatter(df, x='Inches', y='Price', title='Scatter Plot: Inches vs. Price', labels={'Inches': 'Screen Size (Inches)', 'Price': 'Laptop Price'})
fig.show()


# **Model Building**
*   Use a multiple linear regression model to predict the laptop price.






In [None]:
# Text columns that must be converted to integer codes before fitting the model.
categorical_columns = [
    'CompanyName',
    'TypeOfLaptop',
    'ScreenResolution',
    'Cpu',
    'Ram',
    'Memory',
    'Gpu',
    'OpSys',
]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last, making it impossible to decode the
# codes or to encode new laptops consistently later on.
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# NOTE(review): LabelEncoder numbers the labels in sorted order, so the codes
# carry no real-world ordering (for 'Ram': '12GB' -> 0, '16GB' -> 1,
# '4GB' -> 2, '8GB' -> 3). A linear model will still treat them as magnitudes.

In [None]:
# Preview the frame again: the categorical columns now hold integer codes.
df.head(10)

Unnamed: 0,CompanyName,TypeOfLaptop,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,5,1,17.04068,5,6,0,15,1,0,2.064834,35844.099371
1,3,0,16.542395,2,5,0,0,1,1,4.060656,37019.059051
2,7,5,17.295294,2,9,3,3,1,0,2.901689,33329.360341
3,5,0,11.526203,0,6,1,14,1,2,2.914843,68631.102486
4,6,2,12.649634,2,5,3,15,0,2,4.341995,33842.479566
5,1,5,15.543249,3,3,1,4,2,4,3.580368,83937.484249
6,8,3,15.559451,0,9,3,15,1,3,4.050894,62896.017385
7,2,4,17.806917,4,4,0,7,2,4,4.402627,35919.072831
8,6,1,12.846039,5,7,0,1,2,0,3.700557,45011.851908
9,6,4,15.20418,2,0,3,0,2,1,3.656769,52669.31083


In [None]:
# Feature matrix: every column except the target. The target is dropped by name
# so the selection does not silently change if 'Price' stops being the last column.
x = df.drop(columns='Price').values
x

array([[ 5.        ,  1.        , 17.04067994, ...,  1.        ,
         0.        ,  2.06483418],
       [ 3.        ,  0.        , 16.54239484, ...,  1.        ,
         1.        ,  4.06065604],
       [ 7.        ,  5.        , 17.29529434, ...,  1.        ,
         0.        ,  2.90168919],
       ...,
       [ 8.        ,  3.        , 13.76128764, ...,  2.        ,
         2.        ,  4.04746848],
       [ 2.        ,  4.        , 11.03799989, ...,  1.        ,
         1.        ,  3.66982456],
       [ 4.        ,  3.        , 11.00584228, ...,  1.        ,
         1.        ,  4.79967463]])

In [None]:
# Target vector: select 'Price' by name rather than by the positional index 10,
# which would pick the wrong column if the column layout ever changed.
y = df['Price'].values
y

array([ 35844.09937133,  37019.05905115,  33329.36034144,  68631.1024856 ,
        33842.47956619,  83937.4842492 ,  62896.01738543,  35919.0728313 ,
        45011.85190823,  52669.31083035,  44787.67209549,  57579.42957199,
        41623.57766144,  34169.16620547,  67996.9848609 ,  41011.3040712 ,
        47162.65145634,  32270.29345624,  51050.69801817,  64759.50341105,
        35381.34997939,  58607.32843567,  57349.57656821,  80798.39023496,
        38326.63426617,  30718.82230416,  43514.93474815,  64743.61642435,
        52172.81281276,  48954.92039767,  53738.41243671,  47148.78946499,
        37392.91775243,  69086.9677434 ,  81181.35561354,  56122.05038937,
        49294.29248072,  68607.14691583,  68551.97408406,  45213.13412049,
        45868.18424722,  44571.33688849,  33083.81128513,  53119.17472149,
        50204.14926101,  49590.19103503,  69429.34645417,  30080.22502606,
        56819.15885225,  66498.91206546,  67815.62809268,  45228.36857024,
        51853.55627915,  

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=30,
)

In [None]:
# NOTE: .size is the total number of elements (rows x columns), not the number
# of training rows; use x_train.shape to see both dimensions.
x_train.size

7000

In [None]:
# Fit the linear regression on the training split, then predict the held-out rows.
lr = LinearRegression().fit(x_train, y_train)
y_predict = lr.predict(x_test)

In [None]:
# Predict the price of one hand-picked laptop. The values follow the feature
# column order used for training:
# CompanyName=7, TypeOfLaptop=2, Inches=17.295294, ScreenResolution=2, Cpu=9,
# Ram=3, Memory=3, Gpu=0, OpSys=2, Weight=2.914843
sample_features = [7, 2, 17.295294, 2, 9, 3, 3, 0, 2, 2.914843]
y_pred = lr.predict([sample_features])

In [None]:
print(y_pred)  # show the predicted laptop price for the sample above

[50357.05470517]


In [None]:
# R^2 on the training split. Stored under its own name so that the imported
# sklearn.metrics.r2_score function is not overwritten by a float.
train_r2 = lr.score(x_train, y_train)
print("Training Score:", train_r2*100, "%")

Training Score: 1.40772945025871 %


In [None]:
# R^2 on the held-out split. Stored under its own name so that the imported
# sklearn.metrics.r2_score function is not overwritten by a float.
test_r2 = lr.score(x_test, y_test)
print("Testing Score:", test_r2*100, "%")

Testing Score: -1.7339990333393507 %


In [None]:
# Re-import so that r2_score is the sklearn function even if the name was
# rebound to a number earlier in the session.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate on the held-out test split only. Scoring on the full dataset would
# include the rows the model was fitted on and would not measure how well it
# generalises to unseen laptops.
y_true = y_test

# Predicted prices for the same held-out rows
y_pred = lr.predict(x_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")



Mean Absolute Error (MAE): 11419.5391194173
Mean Squared Error (MSE): 189327690.67249292
Root Mean Squared Error (RMSE): 13759.639917980881
R-squared (R2): 0.0052540770276375826


In [None]:
# NOTE(review): disabled draft, kept for reference only. It refers to `rf_model`
# and `columns_used_during_training`, neither of which is defined in the cells
# shown in this notebook, so it will not run if simply uncommented.

# import pandas as pd

# # Features for prediction (replace these with your actual feature values)
# new_data = {'CompanyName': 7, 'TypeOfLaptop': 2, 'Inches': 17.295294, 'ScreenResolution': 2,
#             'Cpu': 9, 'Ram': 3, 'Memory': 3, 'Gpu': 0, 'OpSys': 2, 'Weight': 2.914843}

# # Create a DataFrame from the new data
# new_data_df = pd.DataFrame([new_data])

# # One-hot encode categorical variables
# new_data_encoded = pd.get_dummies(new_data_df)

# # Ensure the order of columns matches the format used during training
# # You might need to adjust the order or add missing columns if needed
# new_data_encoded = new_data_encoded.reindex(columns=columns_used_during_training, fill_value=0)

# # Make predictions using the Random Forest model
# y_pred_rf = rf_model.predict(new_data_encoded)

# print(y_pred_rf)
