In [12]:
import pandas as pd
import numpy as np


column_names = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", 
                "num_doors", "body_style", "drive_wheels", "engine_location", 
                "wheel_base", "length", "width", "height", "curb_weight", 
                "engine_type", "num_cylinders", "engine_size", "fuel_system", 
                "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", 
                "city_mpg", "highway_mpg", "price"]


url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df = pd.read_csv(url, names=column_names)


df.replace('?', np.nan, inplace=True)


df.head()


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [14]:
import pandas as pd
import numpy as np

# Assuming 'df' is your DataFrame and you have already loaded the data
numeric_columns = ["normalized_losses", "bore", "stroke", "horsepower", "peak_rpm", "price"]

# Convert the specified columns to numeric, forcing errors to NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Fill NaN values in numeric columns with the mean of their respective columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Drop rows where the 'price' column has NaN values
df.dropna(subset=["price"], inplace=True)
df.dropna(subset=["num_doors"], inplace=True)

# Check for any remaining NaN values
df.isna().sum()


symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder


num_doors_mapping = {'two': 2, 'four': 4}
num_cylinders_mapping = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 
                         'eight': 8, 'twelve': 12}

df['num_doors'] = df['num_doors'].map(num_doors_mapping)
df['num_cylinders'] = df['num_cylinders'].map(num_cylinders_mapping)


label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()

for col in label_cols:
    df[col] = le.fit_transform(df[col])


df = pd.get_dummies(df, columns=['body_style', 'drive_wheels'], drop_first=True)


df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in x else 0)
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in x else 0)


df.head()


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,peak_rpm,city_mpg,highway_mpg,price,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_fwd,drive_wheels_rwd
0,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,5000.0,21,27,13495.0,False,False,False,False,False,True
1,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,5000.0,21,27,16500.0,False,False,False,False,False,True
2,1,122.0,0,1,0,2,0,94.5,171.2,65.5,...,5000.0,19,26,16500.0,False,True,False,False,False,True
3,2,164.0,1,1,0,4,0,99.8,176.6,66.2,...,5500.0,24,30,13950.0,False,False,True,False,True,False
4,2,164.0,1,1,0,4,0,99.4,176.6,66.4,...,5500.0,18,22,17450.0,False,False,True,False,False,False


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


X = df.drop(columns='price')
y = df['price']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


lr = LinearRegression()
lr.fit(X_train, y_train)


y_pred = lr.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error without PCA: {mse}")


Mean Squared Error without PCA: 16549753.264190685


In [18]:
from sklearn.decomposition import PCA


pca = PCA(n_components=0.95)  
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)


y_pred_pca = lr_pca.predict(X_test_pca)


mse_pca = mean_squared_error(y_test, y_pred_pca)
print(f"Mean Squared Error with PCA: {mse_pca}")


Mean Squared Error with PCA: 19607138.41928702
