In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [7]:
# Read the car-price CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's working directory.
csv_path = 'CarPrice_Assignment.csv'
df = pd.read_csv(csv_path)
# Bare expression: the frame is rendered as this cell's output.
df

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [8]:
# Derive the manufacturer from the first whitespace-separated token of CarName
# (lower-cased), then discard the raw CarName and the car_ID column.
#
# The guard makes the cell safe to re-run: once CarName has been dropped the
# cell is a no-op instead of raising KeyError on the missing column.
if 'CarName' in df.columns:
    df = (
        df
        # Vectorized equivalent of x.split()[0].lower(). A blank or missing
        # CarName yields a missing carCompany rather than raising.
        .assign(carCompany=df['CarName'].str.split().str[0].str.lower())
        # drop() returns a new frame; `df` is rebound instead of being
        # mutated in place.
        .drop(columns=['CarName', 'car_ID'])
    )

In [9]:
# Collapse misspelled or abbreviated manufacturer names onto one canonical
# spelling. Values that are not keys of the mapping are left untouched.
brand_corrections = {
    'vw': 'volkswagen',
    'vokswagen': 'volkswagen',
    'maxda': 'mazda',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'nissan': 'nissan',  # identity entry: maps the name to itself
    'alfa-romero': 'alfa-romeo',
}
df['carCompany'] = df['carCompany'].replace(brand_corrections)

In [10]:
# Label encode simple categorical variables.
#
# Each column gets its own LabelEncoder, and the fitted encoders are kept in
# `label_encoders` (keyed by column name). Their `classes_` attribute holds the
# original labels, so the same label -> integer mapping can be re-applied to new
# data with `transform` or undone with `inverse_transform`. Discarding the
# encoders would make the mapping unrecoverable.
#
# NOTE: re-running this cell on the already-encoded frame refits the encoders on
# the integer codes and loses the original labels; re-run from the load cell.
categorical_cols = ['fueltype', 'aspiration', 'doornumber', 'carbody',
                    'drivewheel', 'enginelocation', 'enginetype',
                    'cylindernumber', 'fuelsystem', 'carCompany']

label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Persist the encoders so the category mapping is available wherever the trained
# model is loaded. The trailing `;` suppresses the cell output.
joblib.dump(label_encoders, 'label_encoders.pkl');

In [11]:
# Separate the target column from the predictors: y is the price series,
# X is every remaining column of df.
target_col = 'price'
y = df[target_col]
X = df.drop(columns=[target_col])

In [12]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Fit a 100-tree random forest regressor on the scaled training features.
# random_state is fixed so the fitted forest is reproducible.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
# Last expression: fit() returns the estimator, which is shown as cell output.
model.fit(X_train_scaled, y_train)

In [15]:

# Score the fitted model: R^2 on the training split, then R^2 and mean squared
# error of the predictions on the held-out split.
train_r2 = model.score(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
test_r2 = r2_score(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)

print("Training R2 Score:", train_r2)
print("Testing R2 Score:", test_r2)
print("MSE:", test_mse)

Training R2 Score: 0.9854832583180447
Testing R2 Score: 0.9555591507127053
MSE: 3508339.3340459126


In [16]:
# Persist the fitted regressor and the scaler it was trained with, each to its
# own file in the working directory.
model_path, scaler_path = 'car_price_model.pkl', 'scaler.pkl'
joblib.dump(model, model_path)
# Last expression: joblib.dump returns the list of written filenames.
joblib.dump(scaler, scaler_path)

['scaler.pkl']