In [1]:
# Imports, Google Drive mount, and loading of the cars CSV.
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import re
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from google.colab import drive
# Colab-only step: mount Google Drive at /content/drive so the CSV below is reachable.
drive.mount('/content/drive')

# The file is decoded as Windows-1252 (cp1252) instead of pandas' UTF-8 default.
cars_csv_path = "/content/drive/MyDrive/Colab Notebooks/Cars Datasets 2025.csv"
cars_df = pd.read_csv(cars_csv_path, encoding="cp1252")

Mounted at /content/drive


In [2]:
# Reproducible 80/20 train/test split of the Cars 2025 dataset.
test_fraction = 0.20
split_seed = 42

# Sample the held-out rows first; everything not sampled becomes training data.
holdout = cars_df.sample(frac=test_fraction, random_state=split_seed)
train_df = cars_df.drop(index=holdout.index).reset_index(drop=True)
test_df = holdout.reset_index(drop=True)


In [3]:
# Clean data
# First numeric token in a string: digits with optional ','/'.' separators and
# an optional decimal part (e.g. "1,100,000", "2.5", "12,000" out of "12,000-15,000").
_NUM_TOKEN_RE = re.compile(r'[\d.,]*\d(?:\.\d+)?')

# A "k" (thousands) marker: either a standalone "k" or a "k" glued to a digit
# ("25 k", "25k"), and never the start of a longer word such as "km/h" or "kw".
_K_SUFFIX_RE = re.compile(r'(?:\b|(?<=\d))k\b')


def num(s):
    """Extract the first number from a free-text cell and return it as a float.

    The value is converted with ``str(s).lower()`` before parsing, so any
    object (including NaN/None) is accepted.

    Parameters
    ----------
    s : object
        Raw cell value, e.g. ``"$1,100,000"``, ``"2.5 sec"``, ``"$25k"``,
        ``"$1.5 million"``.

    Returns
    -------
    float
        The first numeric token with commas removed, multiplied by 1,000,000
        if the text contains "million", otherwise by 1,000 if it carries a
        "k" marker. ``np.nan`` when the text has no digits or the token is
        not a valid float (e.g. ``"1.2.3"``).

    Notes
    -----
    For ranges such as ``"12,000-15,000"`` only the first number is used.
    """
    txt = str(s).lower()

    token = _NUM_TOKEN_RE.search(txt)
    if token is None:
        return np.nan

    try:
        value = float(token.group(0).replace(',', ''))
    except ValueError:
        # Malformed token (e.g. two decimal points): treat as missing so one
        # bad cell does not abort the whole column parse.
        return np.nan

    # "million" takes precedence over "k", matching the original ordering.
    if 'million' in txt:
        return value * 1_000_000
    if _K_SUFFIX_RE.search(txt):
        return value * 1_000
    return value

# Raw text column -> parsed numeric column. The same mapping is applied to the
# train split first and then to the test split, so both get identical columns.
PARSED_COLUMNS = {
    'Cars Prices': 'price',
    'HorsePower': 'hp',
    'Torque': 'torque',
    'Total Speed': 'vmax',
    'Performance(0 - 100 )KM/H': 'zero100',
}

for split_df in (train_df, test_df):
    for raw_col, parsed_col in PARSED_COLUMNS.items():
        split_df[parsed_col] = split_df[raw_col].map(num)

# Keep only rows where the target and both model inputs parsed to a number.
required_cols = ['price', 'hp', 'zero100']
train_clean = train_df.dropna(subset=required_cols)
test_clean = test_df.dropna(subset=required_cols)

# Baseline model: Linear Regression on log(price), restricted to a small,
# less-collinear feature set.
feature_cols = ['hp', 'zero100']
X_train, X_test = train_clean[feature_cols], test_clean[feature_cols]

# Train against log(price); the test target stays on the original price scale.
y_train_log = np.log(train_clean['price'])
y_test = test_clean['price']

# Fit the linear model on the log-price target.
lr = LinearRegression()
lr.fit(X_train, y_train_log)

# Predictions are in log units; exponentiate to return to the price scale.
pred_log = lr.predict(X_test)
pred = np.exp(pred_log)

# Evaluate: RMSE between actual and back-transformed predicted prices.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:,.2f}")
print("Coefficients [hp, zero100]:", lr.coef_)
print("Intercept (log-scale):", lr.intercept_)


RMSE: 74,857.73
Coefficients [hp, zero100]: [ 0.00299523 -0.07633867]
Intercept (log-scale): 10.469232721847636


In [4]:

# Actual vs Predicted: side-by-side table for the held-out rows.
# Identifier columns are included only if they exist in the test frame.
id_candidates = ['Company Names', 'Cars Names']
cols_for_id = [col for col in id_candidates if col in test_clean.columns]

compare = (
    test_clean[cols_for_id]
    .assign(actual_price=y_test.to_numpy(), pred_price=pred)
    # Signed error: positive means the model over-predicted the price.
    .assign(error=lambda frame: frame['pred_price'] - frame['actual_price'])
)
print(compare.head(10).to_string(index=False))

Company Names     Cars Names  actual_price    pred_price          error
      Porsche  Cayenne Turbo      130000.0 131176.167793    1176.167793
      HYUNDAI         Sonata       25000.0  33881.078227    8881.078227
  LAMBORGHINI  AVENTADOR SVJ      518000.0 276195.058312 -241804.941688
      Peugeot        5008 GT       40000.0  32286.250267   -7713.749733
      Porsche        Macan T       63000.0  48516.947805  -14483.052195
   Volkswagen        Crafter       40000.0  17718.123800  -22281.876200
  Tata Motors Nexon EV Prime       22000.0  25286.696220    3286.696220
        Mazda     Millenia S       30000.0  34806.834026    4806.834026
       TOYOTA          VENZA       33400.0  38278.654438    4878.654438
          BMW           118D       34000.0  29064.481842   -4935.518158
