# Linear Regression Baseline

## 1. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## 2. Load Processed Dataset

In [2]:
# Read the processed, feature-audited vehicles table (relative path).
processed_csv_path = "../data/processed/vehicles_feature_audited.csv"
df = pd.read_csv(processed_csv_path)

print("Shape:", df.shape)

Shape: (390905, 16)


## 3. Define Features and Target
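We train on `log_price` rather than raw `price`: the log transform reduces the skew of the target and makes fitting more stable. Both `price` and `log_price` are dropped from the features so the target cannot leak into `X`.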

In [3]:
# The target is the log-transformed price. Both price columns are removed
# from the feature matrix so the target cannot leak into X.
target_col = 'log_price'
price_cols = ['price', target_col]

y = df[target_col]
X = df.drop(columns=price_cols)

for label, part in (("X shape:", X), ("y shape:", y)):
    print(label, part.shape)

X shape: (390905, 14)
y shape: (390905,)


## 4. Train-Test Split
We perform the split before any fitting to avoid data leakage.

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print('Train shape: ', X_train.shape)
print('Test shape: ', X_test.shape)

Train shape:  (312724, 14)
Test shape:  (78181, 14)


## 5. Define Column Groups for Preprocessing

In [5]:
# Numeric inputs (scaled downstream).
numerical_cols = ["year", "odometer"]

# Categoricals with many distinct levels: handled with target encoding.
target_encode_cols = ["model", "region"]

# Every other categorical: expanded with one-hot encoding.
onehot_cols = [
    "manufacturer", "condition", "cylinders", "fuel", "title_status",
    "transmission", "drive", "size", "type", "paint_color",
]

# Echo each group so the assignment is visible in the notebook output.
for label, cols in (
    ("Numerical:", numerical_cols),
    ("Target Encoded:", target_encode_cols),
    ("OneHot:", onehot_cols),
):
    print(label, cols)

Numerical: ['year', 'odometer']
Target Encoded: ['model', 'region']
OneHot: ['manufacturer', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color']


## 6. Import Preprocessing Tools

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import category_encoders as ce

## 7. Define Preprocessing Pipelines

In [7]:
# Numerical Pipeline
numerical_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# OneHot Pipeline
onehot_pipeline = Pipeline(steps=[
    ("onehot", OneHotEncoder(
        drop="first",
        handle_unknown="ignore"
    ))
])

# Target Encoding Pipeline (with smoothing)
target_pipeline = ce.TargetEncoder(
    cols=target_encode_cols,
    smoothing=10
)

## 8. ColumnTransformer

In [8]:
# Route each column group to its transformer. The three groups list
# 2 + 10 + 2 = 14 columns, the same count as the feature columns in X.
column_routes = [
    ("num", numerical_pipeline, numerical_cols),
    ("onehot", onehot_pipeline, onehot_cols),
    ("target", target_pipeline, target_encode_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

## 9. Build Full Modeling Pipeline
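We combine preprocessing and Linear Regression into a single sklearn Pipeline.
Because the scaler and encoders live inside the pipeline, they are re-fit on the training portion of every fold, which keeps cross-validation leakage-safe and gives a clean, single-object structure for deployment.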

In [9]:
from sklearn.linear_model import LinearRegression

# Baseline estimator: the preprocessing step followed by a plain
# LinearRegression, wrapped in one Pipeline.
baseline_steps = [
    ("preprocessing", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(steps=baseline_steps)

## 10. Cross-Validation (Training Data Only)

We evaluate performance using 5-fold cross-validation on training data.

In [10]:
from sklearn.model_selection import KFold, cross_val_score

# Shuffled 5-fold splitter; the fixed seed keeps fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer returns negated RMSE, so the sign is flipped back to get
# positive RMSE values (one per fold).
neg_rmse_scores = cross_val_score(
    model, X_train, y_train, cv=kf, scoring="neg_root_mean_squared_error"
)
cv_rmse = -neg_rmse_scores

print("CV RMSE (log space):", cv_rmse.mean())
print("CV RMSE (log space) per fold:", cv_rmse)

CV RMSE (log space): 0.9954301934819757
CV RMSE (log space) per fold: [1.00025046 0.99700808 0.98270121 0.98856396 1.00862725]


## 11. Ridge Regression (Regularized Linear Model)

We replace LinearRegression with Ridge and tune the regularization strength (alpha).

In [11]:
from sklearn.linear_model import Ridge

## 12. Ridge Pipeline

In [12]:
from sklearn.base import clone

# Ridge variant of the linear baseline: same preprocessing, Ridge regressor.
# The preprocessor is cloned so this pipeline owns its own unfitted copy.
# A Pipeline keeps the estimator objects it is given, so passing the same
# `preprocessor` instance to both `model` and `ridge_model` would let a
# direct .fit() on one pipeline overwrite the fitted scaler/encoders used
# by the other.
ridge_model = Pipeline(steps=[
    ("preprocessing", clone(preprocessor)),
    ("regressor", Ridge())
])

## 13. Cross-Validation for Different Alpha Values

In [13]:
# Candidate regularization strengths to compare.
alphas = [0.01, 0.1, 1, 10, 50, 100]

# Same shuffled 5-fold configuration as the baseline run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

separator = "-" * 40
for alpha in alphas:
    # Set alpha on the pipeline's "regressor" step, then re-score with CV.
    ridge_model.set_params(regressor__alpha=alpha)

    neg_rmse_scores = cross_val_score(
        ridge_model,
        X_train,
        y_train,
        cv=kf,
        scoring="neg_root_mean_squared_error",
    )
    # NOTE: `cv_rmse` is reassigned on every iteration, so after the loop it
    # holds the per-fold scores for the last alpha only.
    cv_rmse = -neg_rmse_scores

    print(f"Alpha: {alpha}")
    print("Mean CV RMSE (log):", cv_rmse.mean())
    print(separator)

Alpha: 0.01
Mean CV RMSE (log): 0.9954032017947101
----------------------------------------
Alpha: 0.1
Mean CV RMSE (log): 0.9954014626209082
----------------------------------------
Alpha: 1
Mean CV RMSE (log): 0.995397098956856
----------------------------------------
Alpha: 10
Mean CV RMSE (log): 0.9953920233116772
----------------------------------------
Alpha: 50
Mean CV RMSE (log): 0.9957066807484726
----------------------------------------
Alpha: 100
Mean CV RMSE (log): 0.9961420042863194
----------------------------------------


## 14. Linear Model Comparison

We compare baseline Linear Regression and Ridge regression
using 5-fold cross-validation (log RMSE).

Linear Regression CV RMSE (log): ~0.99543

Best Ridge CV RMSE (log): ~0.99539 (alpha=10)

## 15. Conclusion — Linear Models

Observations:

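- Linear Regression CV RMSE ≈ 0.995 (log space).
- Ridge regularization provided negligible improvement (best mean CV RMSE ≈ 0.99539 at alpha=10 vs. ≈ 0.99543 without regularization).
- Performance was stable across folds (per-fold RMSE between ≈ 0.983 and ≈ 1.009).
- The model is likely underfitting: since regularization barely changes the error, the limitation looks like bias rather than variance, plausibly due to non-linear relationships in used car pricing.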

Conclusion:

The linear model family establishes a baseline but appears bias-limited.
Next step: evaluate non-linear, tree-based models.