In [146]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

In [147]:
df = pd.read_csv("car_price_prediction_ (1).csv")
df.sample(5)

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
2363,2364,Audi,2013,1.4,Petrol,Manual,188869,Like New,41658.67,Q5
1435,1436,Audi,2000,1.0,Hybrid,Manual,166471,Like New,81937.8,A3
42,43,Tesla,2022,4.2,Diesel,Automatic,219882,Like New,19855.49,Model Y
846,847,Ford,2018,5.8,Petrol,Automatic,20229,Used,74047.77,Focus
1773,1774,BMW,2013,5.8,Hybrid,Manual,193529,Like New,71259.92,3 Series


In [148]:
df["Brand"].unique()
df["Brand"].isnull().sum()

np.int64(0)

In [149]:
df = pd.concat([df, pd.get_dummies(df["Brand"])], axis=1)
df.drop(columns=["Brand"], inplace=True, axis=1)

In [150]:
df.drop(columns=["Car ID"], axis=1, inplace=True)

In [151]:
df["Fuel Type"].unique()

array(['Petrol', 'Electric', 'Diesel', 'Hybrid'], dtype=object)

In [152]:
df = pd.concat([df, pd.get_dummies(df["Fuel Type"])], axis=1)
df.drop(columns=["Fuel Type"], axis=1, inplace=True)

In [153]:
df["Transmission"].unique()

array(['Manual', 'Automatic'], dtype=object)

In [154]:
df["Transmission"] = df["Transmission"].map({"Manual": 0, "Automatic": 1})

In [155]:
df["Condition"].unique()

array(['New', 'Used', 'Like New'], dtype=object)

In [156]:
df = pd.concat([df, pd.get_dummies(df["Condition"])], axis=1)
df.drop(columns=["Condition"], axis=1, inplace=True)

In [157]:
df["Model"].unique()

array(['Model X', '5 Series', 'A4', 'Model Y', 'Mustang', 'Q7', 'Q5',
       'Civic', 'Explorer', 'Model 3', 'Fiesta', 'X3', 'GLA', 'A3', 'X5',
       'C-Class', 'E-Class', 'CR-V', 'Camry', 'Accord', 'GLC', 'Corolla',
       'Fit', 'Model S', 'Prius', '3 Series', 'RAV4', 'Focus'],
      dtype=object)

In [158]:
df.isnull().sum()

Year            0
Engine Size     0
Transmission    0
Mileage         0
Price           0
Model           0
Audi            0
BMW             0
Ford            0
Honda           0
Mercedes        0
Tesla           0
Toyota          0
Diesel          0
Electric        0
Hybrid          0
Petrol          0
Like New        0
New             0
Used            0
dtype: int64

In [159]:
df = pd.concat([df, pd.get_dummies(df["Model"])], axis=1)
df.drop(columns=["Model"], axis=1, inplace=True)

In [160]:
def preprocess_data(df: pd.DataFrame):
    x, y = df.drop(columns=["Price"]), df["Price"]
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)
    y_array = y.values
    x_train, x_val, y_train, y_val = train_test_split(x_scaled,y_array,test_size=0.2,random_state=42)
    return x_train, x_val, y_train, y_val, scaler

In [161]:
X_train, X_val, y_train, y_val, scaler = preprocess_data(df)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((2000, 46), (500, 46), (2000,), (500,))

In [162]:
class CarDataset(Dataset):

    def __init__(self, x, y):
        super().__init__()

        self.x = torch.tensor(x,dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [163]:
train_ds = CarDataset(X_train, y_train)
val_ds = CarDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256, shuffle=False)

len(train_ds), len(val_ds)

(2000, 500)

In [164]:
def build_model(input_dim: int) -> nn.Module:
    model = nn.Sequential(
        nn.Linear(input_dim, 32),
        nn.ReLU(),
        nn.Linear(32, 16),
        nn.ReLU(),
        nn.Linear(16, 1),
    )

    # model = nn.Sequential(
    #     nn.Linear(input_dim, 64),
    #     nn.ReLU(),
    #     nn.Linear(64, 32),
    #     nn.ReLU(),
    #     nn.Linear(32, 1),
    #     nn.Sigmoid()
    # )
    return model

In [165]:
def build_model_2(input_dim: int) -> nn.Module:
    model = nn.Sequential(
        nn.Linear(input_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, 1)
    )
    return model

In [166]:
input_dim = X_train.shape[1]
model = build_model(input_dim)
model_2 = build_model_2(input_dim)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model, model_2

(Sequential(
   (0): Linear(in_features=46, out_features=32, bias=True)
   (1): ReLU()
   (2): Linear(in_features=32, out_features=16, bias=True)
   (3): ReLU()
   (4): Linear(in_features=16, out_features=1, bias=True)
 ),
 Sequential(
   (0): Linear(in_features=46, out_features=64, bias=True)
   (1): ReLU()
   (2): Linear(in_features=64, out_features=32, bias=True)
   (3): ReLU()
   (4): Linear(in_features=32, out_features=1, bias=True)
 ))

In [167]:
def train_one_epoch(model: nn.Module,
                    train_loader: DataLoader,
                    criterion,
                    optimizer) -> float:
    model.train()
    losses = []
    batches = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        batches += 1
    return sum(losses) / batches

In [168]:
def evaluate(model: nn.Module, val_loader: DataLoader, criterion) -> float:
    model.eval()
    losses = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            losses.append(loss.item())
    return sum(losses) / len(losses)

In [169]:
epochs = 10
train_losses = []
val_accuracies = []

for epoch in range(epochs):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
    val_acc = evaluate(model, val_loader, criterion)

    train_losses.append(train_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{epoch} | Train loss: {train_loss:.4f} | Val acc: {val_acc:.4f}")

Epoch 1/0 | Train loss: 3492907728.0000 | Val acc: 3600980608.0000
Epoch 2/1 | Train loss: 3515166664.0000 | Val acc: 3600800768.0000
Epoch 3/2 | Train loss: 3521094688.0000 | Val acc: 3600389760.0000
Epoch 4/3 | Train loss: 3460207612.0000 | Val acc: 3599580800.0000
Epoch 5/4 | Train loss: 3490245800.0000 | Val acc: 3598194688.0000
Epoch 6/5 | Train loss: 3479410888.0000 | Val acc: 3596014848.0000
Epoch 7/6 | Train loss: 3493444856.0000 | Val acc: 3592838912.0000
Epoch 8/7 | Train loss: 3541628984.0000 | Val acc: 3588450432.0000
Epoch 9/8 | Train loss: 3490516160.0000 | Val acc: 3582683520.0000
Epoch 10/9 | Train loss: 3499905888.0000 | Val acc: 3575412096.0000


In [170]:
# Train model_2

epochs = 10
train_losses = []
val_accuracies = []

for epoch in range(epochs):
    train_loss = train_one_epoch(model_2, train_loader, criterion, optimizer)
    val_acc = evaluate(model_2, val_loader, criterion)

    train_losses.append(train_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{epoch} | Train loss: {train_loss:.4f} | Val acc: {val_acc:.4f}")

Epoch 1/0 | Train loss: 3481955344.0000 | Val acc: 3601094016.0000
Epoch 2/1 | Train loss: 3479988616.0000 | Val acc: 3601094016.0000
Epoch 3/2 | Train loss: 3500347256.0000 | Val acc: 3601094016.0000
Epoch 4/3 | Train loss: 3487609616.0000 | Val acc: 3601094016.0000
Epoch 5/4 | Train loss: 3477906680.0000 | Val acc: 3601094016.0000
Epoch 6/5 | Train loss: 3499290384.0000 | Val acc: 3601094016.0000
Epoch 7/6 | Train loss: 3496974592.0000 | Val acc: 3601094016.0000
Epoch 8/7 | Train loss: 3495580168.0000 | Val acc: 3601094016.0000
Epoch 9/8 | Train loss: 3504334512.0000 | Val acc: 3601094016.0000
Epoch 10/9 | Train loss: 3519625056.0000 | Val acc: 3601094016.0000


In [172]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

def eval_regression(model, loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for x, y in loader:
            out = model(x)
            preds.extend(out.numpy().flatten())
            trues.extend(y.numpy().flatten())
    return (
        mean_absolute_error(trues, preds),
        r2_score(trues, preds)
    )

mae1, r21 = eval_regression(model, val_loader)
mae2, r22 = eval_regression(model_2, val_loader)

print("Model 1 → MAE:", mae1, "R2:", r21)
print("Model 2 → MAE:", mae2, "R2:", r22)

Model 1 → MAE: 53102.19527568054 R2: -3.7223001512750162
Model 2 → MAE: 53343.489247218895 R2: -3.756208013636275
