In [219]:
import numpy as np
import pandas as pd

## Exercise 1

In [220]:
# Read imports-85.data from the UCI repository; the column names are supplied
# explicitly through `names=` rather than taken from the file.
link = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
columns = """
    symboling normalized-losses make fuel-type aspiration
    num-of-doors body-style drive-wheels engine-location wheel-base
    length width height curb-weight engine-type
    num-of-cylinders engine-size fuel-system bore stroke
    compression-ratio horsepower peak-rpm city-mpg highway-mpg
    price
""".split()
df = pd.read_csv(link, names=columns)
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


In [221]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [222]:
# Check values in object type columns
# Show complete cell contents so the lists of unique values are not truncated.
pd.set_option("display.max_colwidth", None)

# Build one [column, unique-values] record per object-dtype column and create the
# frame in a single call. DataFrame.append was removed in pandas 2.0 and copies the
# whole frame on every iteration; building once also yields a proper 0..n-1 index.
df_value = pd.DataFrame(
    [[col, df[col].unique()] for col in df.columns if df[col].dtype == 'O'],
    columns=['column', 'value'],
)

df_value

Unnamed: 0,column,value
0,normalized-losses,"[?, 164, 158, 192, 188, 121, 98, 81, 118, 148, 110, 145, 137, 101, 78, 106, 85, 107, 104, 113, 150, 129, 115, 93, 142, 161, 153, 125, 128, 122, 103, 168, 108, 194, 231, 119, 154, 74, 186, 83, 102, 89, 87, 77, 91, 134, 65, 197, 90, 94, 256, 95]"
0,make,"[alfa-romero, audi, bmw, chevrolet, dodge, honda, isuzu, jaguar, mazda, mercedes-benz, mercury, mitsubishi, nissan, peugot, plymouth, porsche, renault, saab, subaru, toyota, volkswagen, volvo]"
0,fuel-type,"[gas, diesel]"
0,aspiration,"[std, turbo]"
0,num-of-doors,"[two, four, ?]"
0,body-style,"[convertible, hatchback, sedan, wagon, hardtop]"
0,drive-wheels,"[rwd, fwd, 4wd]"
0,engine-location,"[front, rear]"
0,engine-type,"[dohc, ohcv, ohc, l, rotor, ohcf, dohcv]"
0,num-of-cylinders,"[four, six, five, three, twelve, two, eight]"


In [223]:
# The raw file marks missing entries with "?"; turn those markers into NaN
# in every column where they occur.
columns = [
    'normalized-losses',
    'num-of-doors',
    'bore',
    'stroke',
    'horsepower',
    'peak-rpm',
    'price',
]
df[columns] = df[columns].replace({"?": np.nan})

In [224]:
# Change column type to numerical: these columns were read as text and are
# cast to float32 now.
columns = [
    'normalized-losses',
    'bore',
    'stroke',
    'horsepower',
    'peak-rpm',
    'price',
]
df = df.astype(dict.fromkeys(columns, 'float32'))

In [225]:
# Convert spelled-out counts ("two", "four", ...) into numbers.
columns = ['num-of-doors', 'num-of-cylinders']
dct = {
    'two' : 2,
    'three' : 3,
    'four' : 4,
    'five' : 5,
    'six' : 6,
    'eight' : 8,
    'twelve' : 12
}
# replace() on its own leaves the resulting dtype to implicit downcasting, which
# pandas 2.2 deprecates. pd.to_numeric makes the numeric dtype explicit and raises
# if a word that is not in the mapping is left behind.
df[columns] = df[columns].replace(dct).apply(pd.to_numeric)

In [226]:
# Check correlation of all numeric columns to price.
# The frame still contains text columns (make, fuel-type, ...). Older pandas dropped
# them silently inside corr(), but pandas >= 2.0 raises on them, so select the
# numeric columns explicitly.
df.select_dtypes(include='number').corr()['price']

symboling           -0.082391
normalized-losses    0.203254
num-of-doors         0.046532
wheel-base           0.584642
length               0.690628
width                0.751265
height               0.135486
curb-weight          0.834415
num-of-cylinders     0.708645
engine-size          0.872335
bore                 0.543436
stroke               0.082310
compression-ratio    0.071107
horsepower           0.810533
peak-rpm            -0.101649
city-mpg            -0.686571
highway-mpg         -0.704692
price                1.000000
Name: price, dtype: float64

In [227]:
# Keep the predictors used for modelling, plus the target `price`.
# NOTE(review): most of these have |correlation with price| > 0.5, but
# 'normalized-losses' (0.20) is kept and 'wheel-base' (0.58) is left out —
# confirm that this selection is intended.
columns = [
    'normalized-losses',
    'length',
    'width',
    'curb-weight',
    'num-of-cylinders',
    'engine-size',
    'bore',
    'horsepower',
    'city-mpg',
    'highway-mpg',
    'price',
]
df = df[columns]

In [228]:
# Re-check dtypes and non-null counts now that the frame holds only the selected columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   normalized-losses  164 non-null    float32
 1   length             205 non-null    float64
 2   width              205 non-null    float64
 3   curb-weight        205 non-null    int64  
 4   num-of-cylinders   205 non-null    int64  
 5   engine-size        205 non-null    int64  
 6   bore               201 non-null    float32
 7   horsepower         203 non-null    float32
 8   city-mpg           205 non-null    int64  
 9   highway-mpg        205 non-null    int64  
 10  price              201 non-null    float32
dtypes: float32(4), float64(2), int64(5)
memory usage: 14.5 KB


In [229]:
# Keep only the rows whose price is known.
df = df[df['price'].notna()]

In [230]:
# Fill the remaining gaps in these predictors with each column's mean.
# fillna() with a per-column Series returns a new frame, so nothing is assigned into
# the filtered slice from the previous step (which triggers SettingWithCopyWarning).
# NOTE(review): the means are computed before the train/test split, so test rows
# influence the imputed values — consider imputing after the split.
columns = ['normalized-losses', 'bore', 'horsepower']

df = df.fillna(df[columns].mean())

In [231]:
# Build model (without scaling)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split

In [232]:
# Separate the target from the predictors, then hold out a test set.
# random_state is fixed so the split is reproducible.
y = df['price']
X = df.drop(columns=['price'])

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=1)

In [233]:
# Linear Regression: fit on the training split, then report the score on the held-out split.
lin_model = LinearRegression(fit_intercept=True).fit(Xtrain, ytrain)
lin_model.score(Xtest, ytest)

0.7490233198211884

In [234]:
# Predict the test dataset and line the predictions up against the true prices.
# 'difference' holds the prediction as a percentage of the true price.
yprd = lin_model.predict(Xtest)
result_df = (
    pd.DataFrame({'test': ytest, 'predict': yprd})
    .round({'predict': 0})
    .assign(difference=lambda t: (t['predict'] / t['test'] * 100).round(2))
)
result_df

Unnamed: 0,test,predict,difference
100,9549.0,11880.0,124.41
120,6229.0,6384.0,102.49
62,10245.0,12143.0,118.53
36,7295.0,6083.0,83.39
169,9989.0,16259.0,162.77
151,6338.0,5900.0,93.09
29,12964.0,19701.0,151.97
81,8499.0,12182.0,143.33
4,17450.0,15313.0,87.75
19,6295.0,5198.0,82.57


In [235]:
# Polynomial regression: expand the predictors to degree-2 polynomial features,
# then fit an ordinary linear model on the expanded training matrix.
from sklearn.preprocessing import PolynomialFeatures

poly_model = PolynomialFeatures(degree=2).fit(Xtrain)
Xtrain_poly = poly_model.transform(Xtrain)

pol_model = LinearRegression()
pol_model.fit(Xtrain_poly, ytrain)

LinearRegression()

In [236]:
# Test
# Apply the transformer that was fitted on the training split. Calling fit_transform
# here would refit it on the test data, which a held-out set must never do.
Xtest_poly = poly_model.transform(Xtest)
pol_model.score(Xtest_poly, ytest)

0.4729193355343583

In [237]:
# Predict the test dataset with the polynomial model and compare against the true prices.
# 'difference' holds the prediction as a percentage of the true price.
yprd = pol_model.predict(Xtest_poly)
result_df = (
    pd.DataFrame({'test': ytest, 'predict': yprd})
    .round({'predict': 0})
    .assign(difference=lambda t: (t['predict'] / t['test'] * 100).round(2))
)
result_df

Unnamed: 0,test,predict,difference
100,9549.0,6260.0,65.56
120,6229.0,6677.0,107.19
62,10245.0,7457.0,72.79
36,7295.0,7896.0,108.24
169,9989.0,8355.0,83.64
151,6338.0,6941.0,109.51
29,12964.0,12430.0,95.88
81,8499.0,8862.0,104.27
4,17450.0,17877.0,102.45
19,6295.0,3973.0,63.11
