In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("laptop_data.csv")

# Show the first few rows
df.head()


Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [2]:
# Check shape
print("Shape of dataset:", df.shape)

# Check column names
print("Columns:", df.columns)

# Summary of data types and non-null counts
df.info()

# Check for missing values
df.isnull().sum()


Shape of dataset: (1303, 12)
Columns: Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price             1303 non-null   float64
dtypes: float64(2), int64(1), obje

Unnamed: 0          0
Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64

In [3]:
# Describe numerical columns
df.describe()

# Check for unique values in categorical columns
for col in df.columns:
    if df[col].dtype == 'object':
        print(f"{col}: {df[col].nunique()} unique values")


Company: 19 unique values
TypeName: 6 unique values
ScreenResolution: 40 unique values
Cpu: 118 unique values
Ram: 9 unique values
Memory: 39 unique values
Gpu: 110 unique values
OpSys: 9 unique values
Weight: 179 unique values


In [6]:
# Drop rows with missing values (if any)
df.dropna(inplace=True)

# Optional: Drop columns not useful for prediction
# df.drop(columns=['Unnamed: 0', 'Product', ...], inplace=True)


In [9]:
print(df.columns)



Index(['Company', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram',
       'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')


In [10]:
# Remove 'GB' from 'Ram' and convert to integer
df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)

# Remove 'kg' from 'Weight' and convert to float
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)


In [29]:
def process_memory(mem):
    hdd = ssd = 0
    try:
        mem = mem.replace('GB', '').replace('TB', '000')  # TB → GB
        parts = mem.split('+')
        for part in parts:
            part = part.strip()
            if 'HDD' in part:
                hdd = int(''.join(filter(str.isdigit, part)))
            elif 'SSD' in part:
                ssd = int(''.join(filter(str.isdigit, part)))
            elif 'Flash Storage' in part:
                ssd += int(''.join(filter(str.isdigit, part)))
            elif 'Hybrid' in part:
                hdd += int(''.join(filter(str.isdigit, part)))
    except Exception as e:
        print(f"Error processing: {mem} — {e}")
    return pd.Series([hdd, ssd])



In [30]:
if 'Memory' in df.columns:
    df[['HDD', 'SSD']] = df['Memory'].apply(process_memory)
    df.drop(columns=['Memory'], inplace=True)
else:
    # Reload dataset if already modified
    df = pd.read_csv('laptop_data.csv')  # update file path if needed
    df[['HDD', 'SSD']] = df['Memory'].apply(process_memory)
    df.drop(columns=['Memory'], inplace=True)

In [25]:
print(df.columns)

Index(['Company', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram',
       'Gpu', 'OpSys', 'Weight', 'Price', 'HDD', 'SSD'],
      dtype='object')


In [31]:
df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)

In [32]:
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)

In [33]:
print(df.dtypes)

Unnamed: 0            int64
Company              object
TypeName             object
Inches              float64
ScreenResolution     object
Cpu                  object
Ram                   int64
Gpu                  object
OpSys                object
Weight              float64
Price               float64
HDD                   int64
SSD                   int64
dtype: object


In [34]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [35]:
df = pd.get_dummies(df, columns=['Company', 'TypeName', 'OpSys'], drop_first=True)

In [36]:
def extract_cpu(text):
    if 'Intel' in text:
        if 'i3' in text:
            return 'Intel Core i3'
        elif 'i5' in text:
            return 'Intel Core i5'
        elif 'i7' in text:
            return 'Intel Core i7'
        elif 'i9' in text:
            return 'Intel Core i9'
        else:
            return 'Other Intel'
    elif 'AMD' in text:
        return 'AMD Ryzen'
    else:
        return 'Other'

df['CpuBrand'] = df['Cpu'].apply(extract_cpu)
df.drop(columns=['Cpu'], inplace=True)

In [37]:
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)

In [38]:
df.drop(columns=['ScreenResolution'], inplace=True)

In [39]:
df['GpuBrand'] = df['Gpu'].apply(lambda x: x.split()[0])
df.drop(columns=['Gpu'], inplace=True)

In [40]:
df = pd.get_dummies(df, columns=['CpuBrand', 'GpuBrand'], drop_first=True)

In [41]:
X = df.drop(columns=['Price'])
y = df['Price']

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 12786.57640423143
MSE: 352128334.1120792
R2 Score: 0.7557853871990784


In [44]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("Random Forest R2 Score:", r2_score(y_test, rf_pred))

Random Forest R2 Score: 0.82577496325647


In [45]:
import pickle

# Save the model
pickle.dump(rf, open('model.pkl', 'wb'))

# To load later:
# model = pickle.load(open('model.pkl', 'rb'))

In [46]:
import pickle

# Save the trained model
pickle.dump(rf, open('laptop_price_model.pkl', 'wb'))

# To load it later:
# loaded_model = pickle.load(open('laptop_price_model.pkl', 'rb'))

In [2]:
import pandas as pd

df = pd.read_csv('laptop_data.csv')


In [4]:
def process_memory(mem):
    hdd = ssd = 0
    try:
        mem = mem.replace('GB', '').replace('TB', '000')  # TB → GB
        parts = mem.split('+')
        for part in parts:
            part = part.strip()
            if 'HDD' in part:
                hdd = int(''.join(filter(str.isdigit, part)))
            elif 'SSD' in part:
                ssd = int(''.join(filter(str.isdigit, part)))
            elif 'Flash Storage' in part:
                ssd += int(''.join(filter(str.isdigit, part)))
            elif 'Hybrid' in part:
                hdd += int(''.join(filter(str.isdigit, part)))
    except Exception as e:
        print(f"Error processing: {mem} — {e}")
    return pd.Series([hdd, ssd])

In [5]:
if 'Memory' in df.columns:
    df[['HDD', 'SSD']] = df['Memory'].apply(process_memory)
    df.drop(columns=['Memory'], inplace=True)
else:
    # Reload dataset if already modified
    df = pd.read_csv('laptop_data.csv')  # update file path if needed
    df[['HDD', 'SSD']] = df['Memory'].apply(process_memory)
    df.drop(columns=['Memory'], inplace=True)

In [6]:
X = df.drop('Price', axis=1)
y = df['Price']


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['Company'] = le.fit_transform(X['Company'])
X['TypeName'] = le.fit_transform(X['TypeName'])
X['OpSys'] = le.fit_transform(X['OpSys'])


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Example: Create a binary feature for 'Touchscreen'
df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)

# Example: Create a binary feature for 'IPS'
df['IPS'] = df['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

# Extract resolution
df['Resolution_X'] = df['ScreenResolution'].str.extract(r'(\d+)x').astype(int)
df['Resolution_Y'] = df['ScreenResolution'].str.extract(r'x(\d+)').astype(int)

# Now drop original ScreenResolution
df.drop(columns=['ScreenResolution'], inplace=True)


In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in ['Company', 'TypeName', 'Cpu', 'Gpu', 'OpSys']:
    df[col] = le.fit_transform(df[col])


In [13]:
X = df.drop('Price', axis=1)
y = df['Price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# Encoding categorical features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['Company'] = le.fit_transform(df['Company'])
df['TypeName'] = le.fit_transform(df['TypeName'])
df['OpSys'] = le.fit_transform(df['OpSys'])

# Fix 'Ram' column if needed (remove GB and convert to int)
if df['Ram'].dtype == 'object':
    df['Ram'] = df['Ram'].str.replace('GB', '').astype(int)

# Fix 'Weight' column (remove 'kg' and convert to float)
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)

# Train-test split (after preprocessing)
X = df.drop('Price', axis=1)
y = df['Price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Initialize and train the model
model = RandomForestRegressor(random_state=3)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


R2 Score: 0.869628219966827
MAE: 9607.331220413793
MSE: 229588195.6000368


In [17]:
import pickle

# Save model
with open('laptop_price_model.pkl', 'wb') as file:
    pickle.dump(model, file)


In [1]:
import pandas as pd

df = pd.read_csv("laptop_data.csv")
print(df.columns.tolist())
df.head()


['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price']


Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808
