In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [2]:
column_names = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
                "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
                "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
                "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
                "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
df = pd.read_csv(url, names=column_names, na_values='?')
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [3]:
df.dropna(subset=['price'], inplace=True)

In [4]:
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()

In [5]:
for col in numerical_columns:
    df[col]=df[col].fillna(df[col].mean())
for col in categorical_columns:
    df[col]=df[col].fillna(df[col].mode()[0])

In [6]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [7]:
df.isna().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

In [8]:
df['num_doors'].unique()

array(['two', 'four'], dtype=object)

In [9]:
df['num_cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [10]:
num_mapping = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12
}
df['num_doors'] = df['num_doors'].map(num_mapping)
df['num_cylinders'] = df['num_cylinders'].map(num_mapping)

In [11]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [12]:
dummies=pd.get_dummies(df['body_style'])
dummies

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,True,False,False,False,False
1,True,False,False,False,False
2,False,False,True,False,False
3,False,False,False,True,False
4,False,False,False,True,False
...,...,...,...,...,...
200,False,False,False,True,False
201,False,False,False,True,False
202,False,False,False,True,False
203,False,False,False,True,False


In [13]:
df=pd.concat([df,dummies],axis=1)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,horsepower,peak_rpm,city_mpg,highway_mpg,price,convertible,hardtop,hatchback,sedan,wagon
0,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,111.0,5000.0,21,27,13495.0,True,False,False,False,False
1,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,111.0,5000.0,21,27,16500.0,True,False,False,False,False
2,1,122.0,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,...,154.0,5000.0,19,26,16500.0,False,False,True,False,False
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,102.0,5500.0,24,30,13950.0,False,False,False,True,False
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,115.0,5500.0,18,22,17450.0,False,False,False,True,False


In [14]:
dummies=pd.get_dummies(df['drive_wheels'])
dummies

Unnamed: 0,4wd,fwd,rwd
0,False,False,True
1,False,False,True
2,False,False,True
3,False,True,False
4,True,False,False
...,...,...,...
200,False,False,True
201,False,False,True
202,False,False,True
203,False,False,True


In [15]:
df=pd.concat([df,dummies],axis=1)
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,highway_mpg,price,convertible,hardtop,hatchback,sedan,wagon,4wd,fwd,rwd
0,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,27,13495.0,True,False,False,False,False,False,False,True
1,3,122.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,...,27,16500.0,True,False,False,False,False,False,False,True
2,1,122.0,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,...,26,16500.0,False,False,True,False,False,False,False,True
3,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,...,30,13950.0,False,False,False,True,False,False,True,False
4,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,...,22,17450.0,False,False,False,True,False,True,False,False


In [16]:
df=df.drop(columns=['body_style','drive_wheels'])
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,highway_mpg,price,convertible,hardtop,hatchback,sedan,wagon,4wd,fwd,rwd
0,3,122.0,alfa-romero,gas,std,2,front,88.6,168.8,64.1,...,27,13495.0,True,False,False,False,False,False,False,True
1,3,122.0,alfa-romero,gas,std,2,front,88.6,168.8,64.1,...,27,16500.0,True,False,False,False,False,False,False,True
2,1,122.0,alfa-romero,gas,std,2,front,94.5,171.2,65.5,...,26,16500.0,False,False,True,False,False,False,False,True
3,2,164.0,audi,gas,std,4,front,99.8,176.6,66.2,...,30,13950.0,False,False,False,True,False,False,True,False
4,2,164.0,audi,gas,std,4,front,99.4,176.6,66.4,...,22,17450.0,False,False,False,True,False,True,False,False


In [17]:
label_encoders = {}
for column in ["make", "aspiration", "engine_location", "fuel_type"]:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [18]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,width,...,highway_mpg,price,convertible,hardtop,hatchback,sedan,wagon,4wd,fwd,rwd
0,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,27,13495.0,True,False,False,False,False,False,False,True
1,3,122.0,0,1,0,2,0,88.6,168.8,64.1,...,27,16500.0,True,False,False,False,False,False,False,True
2,1,122.0,0,1,0,2,0,94.5,171.2,65.5,...,26,16500.0,False,False,True,False,False,False,False,True
3,2,164.0,1,1,0,4,0,99.8,176.6,66.2,...,30,13950.0,False,False,False,True,False,False,True,False
4,2,164.0,1,1,0,4,0,99.4,176.6,66.4,...,22,17450.0,False,False,False,True,False,True,False,False


In [19]:
df['fuel_system'] = df['fuel_system'].apply(lambda x: 1 if 'pfi' in str(x) else 0)
df['engine_type'] = df['engine_type'].apply(lambda x: 1 if 'ohc' in str(x) else 0)

In [20]:
X=df.drop(columns=['price'])
y=df['price']

In [21]:
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.3,random_state=42)

In [23]:
regressor=LinearRegression()
regressor.fit(X_train,y_train)
y_pred=regressor.predict(X_test)

In [24]:
acc=r2_score(y_pred,y_test)
print("r2:",acc)

r2: 0.806014202519377


In [41]:
pca=PCA(n_components=0.95)
X=pca.fit_transform(X_scaled)
X_train_pca,X_test_pca,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=81)

In [42]:
regressor=LinearRegression()
regressor.fit(X_train_pca,y_train)
y_pred=regressor.predict(X_test_pca)

In [43]:
acc=r2_score(y_pred,y_test)
print("r2:",acc)

r2: 0.8981198791364999
