In [1]:
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

In [3]:
# Machine learning library
import sklearn

In [4]:
# Display floating-point values to three decimal places
%precision 3

'%.3f'

In [5]:
import requests, zipfile
import io

In [7]:
# Download the raw imports-85.data file; `res` holds the response body as bytes.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as CSV further down.
response = requests.get(url, timeout = 30)
response.raise_for_status()
res = response.content

In [24]:
# Decode the downloaded bytes as UTF-8 text and parse it as a header-less CSV into a DataFrame
csv_text = res.decode('utf-8')
auto = pd.read_csv(io.StringIO(csv_text), header = None)

In [25]:
# Give the header-less table its column labels (one label per column, in file order)
column_labels = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'enginelocation',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuelsystem',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]
auto.columns = column_labels

In [10]:
# Report the (rows, columns) shape of the loaded table
print(f"汽車資料的形式:{auto.shape}")

汽車資料的形式:(205, 26)


In [11]:
# Preview the first five rows of the table
auto.head(n = 5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,enginelocation,wheel-base,...,engine-size,fuelsystem,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [12]:
# Narrow the table to the target (price) and three candidate predictors, then
# count how many '?' placeholders each remaining column contains.
# Note: this rebinds `auto`, so the other columns are no longer reachable
# through that name after this cell.
selected_columns = ['price','horsepower','width','height']
auto = auto.loc[:, selected_columns]
auto.isin(['?']).sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [13]:
# Turn every '?' placeholder into NaN, then drop each row that contains a NaN
auto = auto.replace('?', np.nan)
auto = auto.dropna()
print(f'汽車資料的形式:{auto.shape}')

汽車資料的形式:(199, 4)


In [14]:
# Show the column dtypes before the numeric conversion
print(f'資料型別的確認（型別轉換前）\n{auto.dtypes}\n')

資料型別的確認（型別轉換前）
price          object
horsepower     object
width         float64
height        float64
dtype: object



In [15]:
# Convert the price and horsepower columns to numeric dtypes in a single
# assign call, then show the resulting dtypes
auto = auto.assign(
    price = pd.to_numeric(auto['price']),
    horsepower = pd.to_numeric(auto['horsepower']),
)
print(f'資料型別的確認（型別轉換後）\n{auto.dtypes}')

資料型別的確認（型別轉換後）
price           int64
horsepower      int64
width         float64
height        float64
dtype: object


In [16]:
# Pairwise correlation matrix of the four columns (Pearson is the pandas default)
auto.corr(method = 'pearson')

Unnamed: 0,price,horsepower,width,height
price,1.0,0.810533,0.753871,0.13499
horsepower,0.810533,1.0,0.615315,-0.087407
width,0.753871,0.615315,1.0,0.309223
height,0.13499,-0.087407,0.309223,1.0


In [17]:
# Import for splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# Import for the linear regression model
from sklearn.linear_model import LinearRegression

In [18]:
# price is the target variable; every remaining column is an explanatory variable
y = auto['price']
X = auto.drop(columns = 'price')

# Hold out half of the rows as test data; random_state is fixed at 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.5, random_state = 0
)

In [19]:
# Create a linear regression model and fit it on the training data.
# fit() returns the estimator itself, so it can be bound in one step; the
# trailing expression keeps the cell's displayed output.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [20]:
# Show the fitted coefficient for each explanatory variable, then the intercept
coefficients = pd.Series(model.coef_, index = X.columns)
print(f'\n迴歸係數\n{coefficients}')
print(f'截距: {model.intercept_:.3f}')


迴歸係數
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
截距: -128409.046


In [21]:
# Show the coefficient of determination (model.score) on the training and test data
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'判定係數(train):{train_score:.3f}')
print(f'判定係數(test):{test_score:.3f}')

判定係數(train):0.733
判定係數(test):0.737


In [33]:
# Rebuild the full 26-column table from the downloaded bytes before selecting
# the engine-related columns. By this point `auto` has been narrowed to
# price/horsepower/width/height and had its '?' rows removed, so selecting
# 'engine-size', 'bore' or 'stroke' from it raises KeyError when the notebook
# is run top to bottom on a fresh kernel.
raw_columns = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'enginelocation',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuelsystem',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]
auto_full = pd.read_csv(io.StringIO(res.decode('utf-8')), header = None, names = raw_columns)

# Keep the target (price) plus the three engine measurements, then count how
# many '?' placeholders each column contains
auto1 = auto_full[['price','engine-size','bore','stroke']]
auto1.isin(['?']).sum()

price          4
engine-size    0
bore           4
stroke         4
dtype: int64

In [34]:
# Replace the '?' placeholders with NaN and discard every row that then holds a NaN
auto1_with_nan = auto1.replace('?', np.nan)
auto1 = auto1_with_nan.dropna()
print(f'汽車資料的形式:{auto1.shape}')

汽車資料的形式:(197, 4)


In [35]:
# Show the column dtypes of auto1 before the numeric conversion
print(f'資料型別的確認（型別轉換前）\n{auto1.dtypes}\n')

資料型別的確認（型別轉換前）
price          object
engine-size     int64
bore           object
stroke         object
dtype: object



In [36]:
# Convert price, bore and stroke to numeric dtypes in a single assign call,
# then show the resulting dtypes
auto1 = auto1.assign(
    price = pd.to_numeric(auto1['price']),
    bore = pd.to_numeric(auto1['bore']),
    stroke = pd.to_numeric(auto1['stroke']),
)
print(f'資料型別的確認（型別轉換後）\n{auto1.dtypes}')

資料型別的確認（型別轉換後）
price            int64
engine-size      int64
bore           float64
stroke         float64
dtype: object


In [37]:
# Pairwise correlation matrix of price and the engine columns (Pearson is the pandas default)
auto1.corr(method = 'pearson')

Unnamed: 0,price,engine-size,bore,stroke
price,1.0,0.887508,0.543436,0.08231
engine-size,0.887508,1.0,0.582857,0.209523
bore,0.543436,0.582857,1.0,-0.05539
stroke,0.08231,0.209523,-0.05539,1.0


In [38]:
# price is the target variable; the remaining auto1 columns are the explanatory variables.
# Note: this rebinds X, y and the four split variables.
y = auto1['price']
X = auto1.drop(columns = 'price')

# Hold out half of the rows as test data; random_state is fixed at 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.5, random_state = 0
)

In [39]:
# Create a second linear regression model and fit it on the training data.
# fit() returns the estimator itself; the trailing expression keeps the
# cell's displayed output.
model_1 = LinearRegression().fit(X_train, y_train)
model_1

LinearRegression()

In [40]:
# Show model_1's fitted coefficient for each explanatory variable, then its intercept
coefficients_1 = pd.Series(model_1.coef_, index = X.columns)
print(f'\n迴歸係數\n{coefficients_1}')
print(f'截距: {model_1.intercept_:.3f}')


迴歸係數
engine-size     199.084644
bore          -1291.996895
stroke        -5787.115564
dtype: float64
截距: 10848.394


In [41]:
# Show model_1's coefficient of determination (model_1.score) on the training and test data
train_score_1 = model_1.score(X_train, y_train)
test_score_1 = model_1.score(X_test, y_test)
print(f'判定係數(train):{train_score_1:.3f}')
print(f'判定係數(test):{test_score_1:.3f}')

判定係數(train):0.828
判定係數(test):0.729
