# 🚗 Automobile Price Prediction using Machine Learning

### 👩‍💻 Project by: Amisha Patil

This project explores the UCI Automobile dataset and builds machine learning models to predict car prices. Techniques include data preprocessing, regression modeling (linear regression and random forest), feature selection with RFE, and PCA.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA

In [None]:
# Read the raw data file. It is parsed without a header row (header=None),
# so the 26 column labels are attached explicitly right after loading.
COLUMN_NAMES = [
    'symboling',
    'normalized-losses',
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-type',
    'num-of-cylinders',
    'engine-size',
    'fuel-system',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
    'price',
]
data = pd.read_csv('imports-85.data', header=None)
data.columns = COLUMN_NAMES

In [None]:
# Clean missing values. '?' is treated as the missing-value marker and is
# turned into NaN so pandas can handle it uniformly.
data.replace('?', np.nan, inplace=True)

# Convert the columns listed below to a numeric dtype
# (presumably they were parsed as strings because of the '?' markers).
num_cols = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']
for col in num_cols:
    data[col] = pd.to_numeric(data[col])

# 'price' is the prediction target. A row without a price has no label to
# learn from or to score against, so it is dropped rather than being given
# a made-up median price that would bias both training and evaluation.
data.dropna(subset=['price'], inplace=True)
data.reset_index(drop=True, inplace=True)

# Remaining gaps in numeric feature columns are imputed with the column median.
data.fillna(data.median(numeric_only=True), inplace=True)

# The median fill above skips non-numeric columns, so fill any gaps there
# with the most frequent value of that column.
for cat_col in data.select_dtypes(exclude='number').columns:
    most_common = data[cat_col].mode()
    if not most_common.empty:  # an all-NaN column has no mode to fill with
        data[cat_col] = data[cat_col].fillna(most_common.iloc[0])

In [None]:
# Encode categorical variables as integer codes.
categorical_columns = [
    'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
    'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system'
]

# Keep one fitted encoder per column. Refitting a single shared encoder
# overwrites its classes_ on every iteration, which makes it impossible to
# map the codes of earlier columns back to their original labels.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column in the list;
# use label_encoders[<column>].inverse_transform(...) to decode any column.

In [None]:
# Standardize the continuous columns with StandardScaler
# (zero mean, unit variance under its default settings).
# NOTE(review): the scaler is fitted on every row, and on the 'price'
# target as well, before the train/test split done later in this file.
# Test rows therefore influence the scaling statistics, and the reported
# errors are in standardized price units -- confirm this is intended.
numeric_columns = [
    'symboling', 'normalized-losses', 'wheel-base', 'length',
    'width', 'height', 'curb-weight', 'engine-size',
    'bore', 'stroke', 'compression-ratio', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]
scaler = StandardScaler()
scaler.fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

In [None]:
# Heatmap of the pairwise correlations between all columns of `data`.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data.corr(), cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Scatter-matrix of price against a handful of hand-picked features.
selected_features = [
    'price',
    'engine-size',
    'curb-weight',
    'horsepower',
    'city-mpg',
]
pair_frame = data[selected_features]
sns.pairplot(pair_frame)
plt.show()

In [None]:
# Separate the target ('price') from the feature matrix, then hold out
# 20% of the rows for testing. The fixed seed keeps the split reproducible.
y = data['price']
X = data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an ordinary least-squares baseline and score it on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)
print(f'Linear Regression MSE: {lr_mse:.4f}')
print(f'Linear Regression R^2: {lr_r2:.4f}')

In [None]:
# Fit a 100-tree random forest (seeded for reproducibility) and score it
# on the same held-out rows as the linear baseline.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)
print(f'Random Forest MSE: {rf_mse:.4f}')
print(f'Random Forest R^2: {rf_r2:.4f}')

In [None]:
# Recursive feature elimination: repeatedly refit a linear model and prune
# features until five remain, then report which columns were kept.
rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
rfe.fit(X, y)
kept_columns = X.columns[rfe.support_]
print('Selected Features:', kept_columns.tolist())

In [None]:
# PCA Visualization
# PCA picks the directions of largest variance, so it is sensitive to the
# scale of each column. X mixes standardized numeric columns with raw
# label-encoded integer codes (e.g. 'make'), whose larger spread would
# otherwise dominate the components. Put every column on the same scale
# before projecting.
X_scaled = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print('Explained Variance Ratio:', pca.explained_variance_ratio_)

# Scatter the two leading components, colored by the target value.
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Price')
plt.title('PCA of Car Prices')
plt.show()