## 正規方程法線性迴歸

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd

# Ordinary least-squares linear regression on the housing data.
# Columns 0-11 are used as features and column 12 as the regression target.
hp = pd.read_csv('housePrice.csv')
x, y = hp.iloc[:, :12], hp.iloc[:, 12]

# Hold out 20% of the rows for evaluation. No random_state is given, so the
# split (and therefore every number printed below) changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Standardise the features with statistics learned from the training rows
# only, then apply the same transformation to the test rows.
std_x = StandardScaler().fit(x_train)
x_train, x_test = std_x.transform(x_train), std_x.transform(x_test)

# StandardScaler works on 2-D input, so the target Series is reshaped to a
# single column before it is scaled.
std_y = StandardScaler().fit(y_train.to_numpy().reshape(-1, 1))
y_train = std_y.transform(y_train.to_numpy().reshape(-1, 1))
y_test = std_y.transform(y_test.to_numpy().reshape(-1, 1))

# Fit the model and report the learned weights and intercept (both are in
# standardised units because inputs and target were scaled).
lr = LinearRegression().fit(x_train, y_train)
print(f'權重值：{lr.coef_}')
print(f'偏置值：{lr.intercept_}')

權重值：[[-0.09811456  0.12152492  0.03341199  0.04474076 -0.26927931  0.27203246
   0.03958009 -0.33583873  0.3018656  -0.28971572 -0.24033746 -0.42558369]]
偏置值：[-2.90515995e-16]


In [None]:
# Map the standardised predictions and test targets back to the original
# target units so they can be compared directly.
y_predict = std_y.inverse_transform(lr.predict(x_test))
y_real = std_y.inverse_transform(y_test)

# Show up to the first 10 prediction / ground-truth pairs. Slicing + zip is
# used instead of indexing with range(10) so that a test split with fewer
# than 10 rows does not raise IndexError.
for predicted, actual in zip(y_predict[:10], y_real[:10]):
    print('預測值：{}，真實值：{}'.format(predicted, actual))

預測值：[21.60838422]，真實值：[20.5]
預測值：[18.52143523]，真實值：[14.6]
預測值：[40.69299062]，真實值：[50.]
預測值：[28.78726975]，真實值：[23.3]
預測值：[19.02963563]，真實值：[16.1]
預測值：[28.40047238]，真實值：[28.]
預測值：[21.28917566]，真實值：[21.]
預測值：[15.24163339]，真實值：[13.5]
預測值：[14.29263085]，真實值：[18.2]
預測值：[19.59855943]，真實值：[18.5]


In [None]:
# Mean squared error between the de-standardised test targets and the
# de-standardised predictions computed in the previous cell.
merror = mean_squared_error(y_true=y_real, y_pred=y_predict)
print(f'平均方差：{merror}')

平均方差：22.86062557102978


## 梯度下降法線性迴歸

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import pandas as pd

# Linear regression fitted with stochastic gradient descent (SGDRegressor).
# Columns 0-11 are used as features and column 12 as the regression target.
hp = pd.read_csv('housePrice.csv')
x = hp.iloc[:, 0:12]
y = hp.iloc[:, 12]

# Hold out 20% of the rows; the split is random on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Standardise features with training-set statistics only.
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# StandardScaler needs 2-D input, so the target is reshaped to one column.
std_y = StandardScaler()
y_train = std_y.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test = std_y.transform(y_test.to_numpy().reshape(-1, 1))

# SGDRegressor expects a 1-D target. Passing the (n, 1) column produced by
# the scaler triggers scikit-learn's DataConversionWarning, so flatten it.
sgd = SGDRegressor()
sgd.fit(x_train, y_train.ravel())
print('權重值：{}'.format(sgd.coef_))
print('偏置值：{}'.format(sgd.intercept_))

# predict() returns a 1-D array; reshape it to a column so the target scaler
# can map it back to the original units.
y_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
y_real = std_y.inverse_transform(y_test)

# Show up to the first 10 pairs; slice + zip avoids an IndexError when the
# test split has fewer than 10 rows.
for predicted, actual in zip(y_predict[:10], y_real[:10]):
    print('預測值：{}，真實值：{}'.format(predicted, actual))

merror = mean_squared_error(y_real, y_predict)
print('平均方差：{}'.format(merror))

權重值：[-0.07099359  0.10868226 -0.05083439  0.12096582 -0.16131473  0.31451336
  0.03429864 -0.25799864  0.17064823 -0.10969534 -0.19796035 -0.4147111 ]
偏置值：[0.00198545]
預測值：[26.00212708]，真實值：[23.1]
預測值：[19.86820805]，真實值：[19.2]
預測值：[16.13268522]，真實值：[11.]
預測值：[23.78413949]，真實值：[29.6]
預測值：[9.07227135]，真實值：[9.7]
預測值：[28.20226233]，真實值：[36.2]
預測值：[19.13647049]，真實值：[18.5]
預測值：[15.94978279]，真實值：[10.4]
預測值：[21.86792019]，真實值：[11.9]
預測值：[17.08719527]，真實值：[22.5]
平均方差：32.28242514882645


  y = column_or_1d(y, warn=True)


## 邏輯迴歸應用：判斷是否罹癌

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier on the breast-cancer data.
# Columns 0-8 are used as features and column 9 as the class label.
df = pd.read_csv("breastCancer.csv")
x, y = df.iloc[:, :9], df.iloc[:, 9]

# Random 80/20 train/test split (no random_state, so results vary per run).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Standardise features using statistics from the training rows only.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

# Fit the classifier and report its accuracy on the held-out rows.
estimator = LogisticRegression().fit(x_train, y_train)
score = estimator.score(x_test, y_test)
print(f"準確率：{score}")

準確率：0.9779411764705882


## 召回率應用：提高罹癌患者檢測率

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic regression on the breast-cancer data, evaluated with a per-class
# precision / recall / F1 report instead of plain accuracy.
# Columns 0-8 are used as features and column 9 as the class label.
df = pd.read_csv("breastCancer.csv")
x, y = df.iloc[:, :9], df.iloc[:, 9]

# Fixed random_state makes the 80/20 split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)

# Standardise features using statistics from the training rows only.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

estimator = LogisticRegression().fit(x_train, y_train)
y_pre = estimator.predict(x_test)

# Alternative kept for experimentation: predict label 2 whenever the
# probability in column 1 of predict_proba exceeds a lower threshold,
# instead of using predict()'s default decision.
#pred_proba = estimator.predict_proba(x_test)[:, 1]
#thres = 0.15
#y_pre = np.where(pred_proba > thres, 2, 1)

# Labels 1 and 2 are reported under the display names given below.
ret = classification_report(
    y_true=y_test,
    y_pred=y_pre,
    labels=(1, 2),
    target_names=("良性", "惡性"),
)
print(ret)

              precision    recall  f1-score   support

          良性       0.98      0.99      0.98        96
          惡性       0.97      0.95      0.96        40

    accuracy                           0.98       136
   macro avg       0.98      0.97      0.97       136
weighted avg       0.98      0.98      0.98       136



## SVM分類範例：稻米種類判斷

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Support-vector classifier on the rice data.
# Columns 0-6 are used as features and column 7 as the class label.
df = pd.read_csv("rice.csv")
x, y = df.iloc[:, :7], df.iloc[:, 7]

# Random 80/20 train/test split (no random_state, so results vary per run).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Polynomial kernel of degree 1; the features are passed in unscaled.
clf = SVC(kernel='poly', degree=1, C=1, gamma='auto')
clf.fit(x_train, y_train)

# Accuracy on the held-out rows.
score = clf.score(x_test, y_test)
print(f"準確率：{score}")

準確率：0.9381578947368421


## SVC應用：人臉辨識

In [None]:
from sklearn.model_selection import \
    train_test_split
from sklearn.datasets import \
    fetch_lfw_people
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# Load the Labeled Faces in the Wild dataset, using the current directory as
# the data/cache location and keeping only people with at least 70 images.
lfw_people = fetch_lfw_people(min_faces_per_person=70, data_home='.')

# Shape of the image stack: (number of images, height, width).
print(lfw_people.images.shape)

(1288, 62, 47)


In [None]:
# Pull the feature matrix, the integer class labels and the label-to-name
# lookup array out of the dataset loaded in the previous cell.
x, y = lfw_people.data, lfw_people.target
names = lfw_people.target_names

print(f'特徵維度:{x.shape}')
print('人臉姓名：')
print(names)

特徵維度:(1288, 2914)
人臉姓名：
['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
 'Gerhard Schroeder' 'Hugo Chavez' 'Tony Blair']


In [None]:
%%time
# Baseline: RBF-kernel SVC trained directly on the feature matrix `x`
# (no dimensionality reduction). Fixed random_state keeps the split
# reproducible and identical to the one used in the PCA cell.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)
clf = SVC(kernel='rbf', C=100, gamma='auto')
clf = clf.fit(x_train, y_train)
predict = clf.predict(x_test)
score = clf.score(x_test, y_test)
print("準確率：{}".format(score))

# Show up to the first 20 predictions next to the true labels, translating
# the integer class ids to names. Slice + zip replaces the hard-coded
# range(20) index loop so a smaller test split cannot raise IndexError.
for predicted_id, actual_id in zip(predict[:20], y_test[:20]):
    print('預測值：{}，真實值：{}'.format(names[predicted_id],
                                   names[actual_id]))

準確率：0.4108527131782946
預測值：George W Bush，真實值：Ariel Sharon
預測值：George W Bush，真實值：Tony Blair
預測值：George W Bush，真實值：Ariel Sharon
預測值：George W Bush，真實值：Donald Rumsfeld
預測值：George W Bush，真實值：Donald Rumsfeld
預測值：George W Bush，真實值：George W Bush
預測值：George W Bush，真實值：Colin Powell
預測值：George W Bush，真實值：Hugo Chavez
預測值：George W Bush，真實值：Tony Blair
預測值：George W Bush，真實值：Tony Blair
預測值：George W Bush，真實值：George W Bush
預測值：George W Bush，真實值：Donald Rumsfeld
預測值：George W Bush，真實值：Ariel Sharon
預測值：George W Bush，真實值：George W Bush
預測值：George W Bush，真實值：Gerhard Schroeder
預測值：George W Bush，真實值：Ariel Sharon
預測值：George W Bush，真實值：Colin Powell
預測值：George W Bush，真實值：George W Bush
預測值：George W Bush，真實值：Colin Powell
預測值：George W Bush，真實值：Gerhard Schroeder
CPU times: user 6.81 s, sys: 2.5 ms, total: 6.82 s
Wall time: 6.8 s


In [None]:
%%time
# RBF-kernel SVC trained on a 100-component whitened PCA projection of `x`.
# Same split parameters as the baseline cell, so the scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)

# Fit the projection on the TRAINING rows only. Fitting it on the full
# matrix `x` would let the held-out test rows influence the learned
# components (data leakage) and make the reported accuracy optimistic.
# PCA is unsupervised, so no target is passed.
pca = PCA(svd_solver='randomized', n_components=100, whiten=True)
pca.fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

clf = SVC(kernel='rbf', C=100, gamma='auto')
clf = clf.fit(x_train_pca, y_train)
predict = clf.predict(x_test_pca)
score = clf.score(x_test_pca, y_test)
print("準確率：{}".format(score))

# Show up to the first 20 predictions next to the true labels. Slice + zip
# replaces the hard-coded range(20) index loop so a smaller test split
# cannot raise IndexError.
for predicted_id, actual_id in zip(predict[:20], y_test[:20]):
    print('預測值：{}，真實值：{}'.format(names[predicted_id],
                                   names[actual_id]))

準確率：0.8255813953488372
預測值：Colin Powell，真實值：Ariel Sharon
預測值：Tony Blair，真實值：Tony Blair
預測值：Ariel Sharon，真實值：Ariel Sharon
預測值：Donald Rumsfeld，真實值：Donald Rumsfeld
預測值：Donald Rumsfeld，真實值：Donald Rumsfeld
預測值：George W Bush，真實值：George W Bush
預測值：Colin Powell，真實值：Colin Powell
預測值：George W Bush，真實值：Hugo Chavez
預測值：Tony Blair，真實值：Tony Blair
預測值：Tony Blair，真實值：Tony Blair
預測值：George W Bush，真實值：George W Bush
預測值：Donald Rumsfeld，真實值：Donald Rumsfeld
預測值：Ariel Sharon，真實值：Ariel Sharon
預測值：George W Bush，真實值：George W Bush
預測值：Tony Blair，真實值：Gerhard Schroeder
預測值：Colin Powell，真實值：Ariel Sharon
預測值：Colin Powell，真實值：Colin Powell
預測值：George W Bush，真實值：George W Bush
預測值：Colin Powell，真實值：Colin Powell
預測值：George W Bush，真實值：Gerhard Schroeder
CPU times: user 1.37 s, sys: 987 ms, total: 2.36 s
Wall time: 1.36 s


## SVM迴歸範例：廣告效益預測

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load the advertising data and encode the categorical influencer-tier
# column as integers 1-4 using the mapping below.
df = pd.read_csv("advSale.csv")
dict1 = {'大於百萬':1, '十萬到百萬':2, '一萬到十萬':3, '小於一萬':4}

# Assign the result back to the column instead of calling
# df[col].replace(..., inplace=True): the latter is chained assignment on a
# temporary Series, which warns in recent pandas and leaves `df` unchanged
# when Copy-on-Write is enabled.
df['網紅廣告'] = df['網紅廣告'].replace(dict1)

# Bare expression: shows the frame as the notebook cell's output.
df

Unnamed: 0,電視廣告,廣播廣告,社群媒體廣告,網紅廣告,銷售額
0,93.0,25.647047,5.124376,3,334.542585
1,79.0,28.862860,1.487848,4,283.111546
2,30.0,4.328536,0.477257,3,102.350794
3,36.0,15.826442,1.740744,4,127.546660
4,33.0,16.567955,3.220240,4,117.856095
...,...,...,...,...,...
4495,75.0,20.142183,5.763508,1,265.232733
4496,19.0,4.906581,3.263641,4,66.570507
4497,43.0,16.566161,3.758375,2,159.537246
4498,96.0,33.863142,4.851668,3,346.718590


In [None]:
# Support-vector regression on the advertising data prepared in the previous
# cell. Columns 0-3 are used as features and column 4 as the target.
x = df.iloc[:, 0:4]
y = df.iloc[:, 4]

# Random 80/20 train/test split (no random_state, so results vary per run).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2)

# Standardise features with training-set statistics only.
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# StandardScaler needs 2-D input, so the target is reshaped to one column.
std_y = StandardScaler()
y_train = std_y.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test = std_y.transform(y_test.to_numpy().reshape(-1, 1))

# SVR expects a 1-D target. Passing the (n, 1) column produced by the scaler
# triggers scikit-learn's DataConversionWarning, so flatten it for fit().
clf = SVR(kernel='rbf', C=1, gamma='auto')
clf.fit(x_train, y_train.ravel())

# predict() returns a 1-D array; reshape it to a column so the target scaler
# can map it back to the original units.
y_predict = clf.predict(x_test)
y_predict = std_y.inverse_transform(y_predict.reshape(-1, 1))
y_real = std_y.inverse_transform(y_test)

# Show up to the first 10 pairs; slice + zip avoids an IndexError when the
# test split has fewer than 10 rows. The printed form is unchanged: the
# prediction as a 1-element array, the true value as a scalar.
for predicted, actual in zip(y_predict[:10], y_real[:10]):
    print('預測值：{}，真實值：{}'.format(predicted, actual[0]))

merror = mean_squared_error(y_real, y_predict)
print('平均方差：{}'.format(merror))

預測值：[185.17351454]，真實值：192.110626
預測值：[280.42962985]，真實值：279.998199
預測值：[336.94121359]，真實值：354.292515
預測值：[307.06949054]，真實值：311.670241
預測值：[352.24045744]，真實值：360.109683
預測值：[208.55594167]，真實值：214.269115
預測值：[54.73763829]，真實值：47.042182999999994
預測值：[275.59752106]，真實值：279.061514
預測值：[127.31255046]，真實值：130.10523
預測值：[76.50701872]，真實值：70.07728099999999
平均方差：18.041233854635003


  y = column_or_1d(y, warn=True)
