In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# 회귀 분석 문제

주택 데이터 <br>
평균 제곱 오차와 결정 계수를 계산<br>
결정계수R^2 =1 인 경우 모델이 데이터에 완전히 적합<br>
결정계수R^2 =0 인 경우 모델이 데이터를 설명하지 못함<br>
음수인 경우 점점 부적합되는 것


In [2]:
df = pd.read_csv('housing.csv',sep=',',header=None)
#shuffle the data
# 교차 검증 폴드 데이터를 임의로 추출 
# 데이터를 재편함
df = df.iloc[np.random.permutation(len(df))]
X= df[df.columns[:-1]].values
Y = df[df.columns[-1]].values

In [3]:
cv = 10
print('linear regression')
lin = LinearRegression()
scores = model_selection.cross_val_score(lin, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(lin, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

linear regression
mean R2: 0.71 (+/- 0.19)
MSE: 23.6892459329


In [4]:
print('ridge r2egression')
ridge = Ridge(alpha=1.0)
scores = model_selection.cross_val_score(ridge, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(ridge, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

ridge r2egression
mean R2: 0.71 (+/- 0.18)
MSE: 23.8534712192


In [5]:
print('lasso regression')
lasso = Lasso(alpha=0.1)
scores = model_selection.cross_val_score(lasso, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(lasso, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

lasso regression
mean R2: 0.70 (+/- 0.17)
MSE: 24.8529216472


In [6]:
print('decision tree regression')
tree = DecisionTreeRegressor(random_state=0)
scores = model_selection.cross_val_score(tree, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(tree, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

decision tree regression
mean R2: 0.78 (+/- 0.16)
MSE: 18.5230237154


In [7]:
print('random forest regression')
forest = RandomForestRegressor(n_estimators=50, max_depth=None, min_samples_split=2,
                               random_state=0)
scores = model_selection.cross_val_score(forest, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(forest, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

random forest regression
mean R2: 0.87 (+/- 0.09)
MSE: 10.3944529644


In [8]:
#svm
print('linear support vector machine')
svm_lin = svm.SVR(epsilon=0.2,kernel='linear',C=1)
scores = model_selection.cross_val_score(svm_lin, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(svm_lin, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted)) 

linear support vector machine
mean R2: 0.69 (+/- 0.24)
MSE: 25.9612224042


In [9]:
print('support vector machine rbf')
clf = svm.SVR(epsilon=0.2,kernel='rbf',C=1.)
scores = model_selection.cross_val_score(clf, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(clf, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted)) 

support vector machine rbf
mean R2: -0.01 (+/- 0.12)
MSE: 83.8253978427


In [10]:
print('knn')
knn = KNeighborsRegressor()
scores = model_selection.cross_val_score(knn, X, Y, cv=cv)
print("mean R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(knn, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

knn
mean R2: 0.55 (+/- 0.18)
MSE: 37.5442608696


램덤 포레스트 결정 계수 평균이 0.87로 가장 적합함<br>
특징선택 <br> 모델을 훈련 할 때 전체 특징 중 일부만 적절하고 나머지는 모델의 결정계수에 공헌하지 못함 <br>

# 재귀적 특징 축소 방법 RFE
가장 큰 절대 가중치를 갖는 속성을 고려 <br>
원하는 개수의 특징을 선택할 수 있을 때까지 반복<br>
SVM 알고리즘의 경우 가중치는 w 값 뿐이지만, 회귀 분석의 경우 모델의 파라미터가 0(세타) 가 됨

In [11]:
from sklearn.feature_selection import RFE
best_features=4 
# 최적의 속성을 4개로 줌 

In [12]:
print('feature selection on linear regression')
rfe_lin = RFE(lin,best_features).fit(X,Y)
# support_ 속성으로 특징 선택 여부를 타나내는 불리언 리스트를 반환
# 선택된 특징을 모델 평가에 적용
mask = np.array(rfe_lin.support_)
scores = model_selection.cross_val_score(lin, X[:,mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(lin, X[:,mask],Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

print('feature selection ridge regression')
rfe_ridge = RFE(ridge,best_features).fit(X,Y)
mask = np.array(rfe_ridge.support_)
scores = model_selection.cross_val_score(ridge, X[:,mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(ridge, X[:,mask],Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

print('feature selection on lasso regression')
rfe_lasso = RFE(lasso,best_features).fit(X,Y)
mask = np.array(rfe_lasso.support_)
scores = model_selection.cross_val_score(lasso, X[:,mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(lasso, X[:,mask],Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

print('feature selection on decision tree') 
rfe_tree = RFE(tree,best_features).fit(X,Y)
mask = np.array(rfe_tree.support_)
scores = model_selection.cross_val_score(tree, X[:,mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(tree, X[:,mask],Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

print('feature selection on random forest')
rfe_forest = RFE(forest,best_features).fit(X,Y)
mask = np.array(rfe_forest.support_)
scores = model_selection.cross_val_score(forest, X[:,mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(forest, X[:,mask],Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted)) 

print('feature selection on linear support vector machine')
rfe_svm = RFE(svm_lin,best_features).fit(X,Y)
mask = np.array(rfe_svm.support_)
scores = model_selection.cross_val_score(svm_lin, X[:,mask], Y, cv=cv)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
predicted = model_selection.cross_val_predict(svm_lin, X,Y, cv=cv)
print('MSE:',mean_squared_error(Y,predicted))

feature selection on linear regression
R2: 0.60 (+/- 0.30)
MSE: 33.2738633043
feature selection ridge regression
R2: 0.60 (+/- 0.29)
MSE: 33.3838301611
feature selection on lasso regression
R2: 0.67 (+/- 0.21)
MSE: 27.5283180344
feature selection on decision tree
R2: 0.76 (+/- 0.18)
MSE: 20.0476284585
feature selection on random forest
R2: 0.84 (+/- 0.13)
MSE: 13.132366332
feature selection on linear support vector machine
R2: 0.59 (+/- 0.29)
MSE: 25.9612224042


KNN 알고리즘의 경우 특징에 가중치를 제공하지 않기 때문에 RFE 함수가 적용되지 않음

# 분류문제

In [13]:
import pandas as pd
import numpy as np
from sklearn import model_selection 
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

자동차의 주요 특성을 설명하는 6개의 특징에 기반을 둔 데이터 사용 <br>
분류의 정확성을 평가하기 위해 <br>
정확률, 재현율, f 척도 사용<br>
긍정과 부정 두 부류를 갖는 데이터 집합이 있을 때 <br>
긍정인데 긍정으로 정확히 레이블된 참긍정과 <br>
부정인데 긍정으로 잘못 레이블된 거짓 긍정 개수<br>
긍정인데 부정으로 잘못 레이블된 거짓 부정의 개수 정의<br>

In [14]:
#read data in
df = pd.read_csv('data_cars.csv',header=None)
for i in range(len(df.columns)):
    df[i] = df[i].astype('category')
df.head()
# 범주형 특징값
# buying0, maint1, doors2, persons3, lug_boot4, safety5, car evaluation 6

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [15]:
#map catgories to values 분류를 숫자로 매핑 
map0 = dict( zip( df[0].cat.categories, range( len(df[0].cat.categories ))))
#print map0
map1 = dict( zip( df[1].cat.categories, range( len(df[1].cat.categories ))))
map2 = dict( zip( df[2].cat.categories, range( len(df[2].cat.categories ))))
map3 = dict( zip( df[3].cat.categories, range( len(df[3].cat.categories ))))
map4 = dict( zip( df[4].cat.categories, range( len(df[4].cat.categories ))))
map5 = dict( zip( df[5].cat.categories, range( len(df[5].cat.categories ))))
map6 = dict( zip( df[6].cat.categories, range( len(df[6].cat.categories ))))

cat_cols = df.select_dtypes(['category']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)

df = df.iloc[np.random.permutation(len(df))]
print(df.head())

      0  1  2  3  4  5  6
1337  1  3  1  1  1  0  0
889   2  3  0  2  0  2  0
341   3  1  0  1  0  0  0
1664  1  1  1  1  0  0  3
1605  1  2  3  1  1  1  2


In [16]:
# CalcMeasures 성능 측정 후 저장을 위한 함수 
df_f1 = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_precision = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_recall = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
def CalcMeasures(method,y_pred,y_true,df_f1=df_f1
                 ,df_precision=df_precision,df_recall=df_recall):

    df_f1.loc[len(df_f1)] = [method]+list(f1_score(y_pred,y_true,average=None))
    df_precision.loc[len(df_precision)] = [method]+list(precision_score(y_pred,y_true,average=None))
    df_recall.loc[len(df_recall)] = [method]+list(recall_score(y_pred,y_true,average=None))

# x, y 데이터 분리
X= df[df.columns[:-1]].values
Y = df[df.columns[-1]].values

In [17]:
# 10 개의 교차 검증 폴드가 사용 

cv = 10
method = 'linear support vector machine'
clf = svm.SVC(kernel='linear',C=50)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'rbf support vector machine'
clf = svm.SVC(kernel='rbf',C=50)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'poly support vector machine'
clf = svm.SVC(kernel='poly',C=50)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'decision tree'
clf = DecisionTreeClassifier(random_state=0)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'random forest'
clf = RandomForestClassifier(n_estimators=50,random_state=0,max_features=None)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'naive bayes'
clf = MultinomialNB()
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'logistic regression'
clf = LogisticRegression()
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'k nearest neighbours'
clf = KNeighborsClassifier(weights='distance',n_neighbors=5)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


측정된 성능 값은 데이터 프레임에 저장됨<br>
등급별로 4회 평가<br>
acc:0, unacc:2 good:1 vgood:3<br>
가장 좋은 모델 RBF 커널 SVM 

In [18]:
df_f1

Unnamed: 0,method,acc,good,unacc,vgood
0,linear support vector machine,0.264591,0.0,0.846866,0.0
1,rbf support vector machine,0.997403,1.0,0.999587,0.992248
2,poly support vector machine,0.788436,0.84375,0.938292,0.783333
3,decision tree,0.964613,0.893617,0.993394,0.969231
4,random forest,0.966234,0.937063,0.993367,0.977099
5,naive bayes,0.040404,0.0,0.825299,0.0
6,logistic regression,0.275168,0.0,0.823097,0.055556
7,k nearest neighbours,0.778689,0.579439,0.951549,0.686869


In [19]:
df_precision

Unnamed: 0,method,acc,good,unacc,vgood
0,linear support vector machine,0.177083,0.0,0.982645,0.0
1,rbf support vector machine,1.0,1.0,0.999174,0.984615
2,poly support vector machine,0.78125,0.782609,0.94876,0.723077
3,decision tree,0.958333,0.913043,0.994215,0.969231
4,random forest,0.96875,0.971014,0.990083,0.984615
5,naive bayes,0.020833,0.0,0.997521,0.0
6,logistic regression,0.213542,0.0,0.924793,0.030769
7,k nearest neighbours,0.742188,0.449275,0.990083,0.523077


In [20]:
df_recall

Unnamed: 0,method,acc,good,unacc,vgood
0,linear support vector machine,0.523077,0.0,0.744055,0.0
1,rbf support vector machine,0.994819,1.0,1.0,1.0
2,poly support vector machine,0.795756,0.915254,0.928052,0.854545
3,decision tree,0.970976,0.875,0.992574,0.969231
4,random forest,0.963731,0.905405,0.996672,0.969697
5,naive bayes,0.666667,0.0,0.70379,0.0
6,logistic regression,0.386792,0.0,0.741551,0.285714
7,k nearest neighbours,0.818966,0.815789,0.915902,1.0


In [21]:
labels_counts=df[6].value_counts()
pd.Series(map6).map(labels_counts)

acc       384
good       69
unacc    1210
vgood      65
dtype: int64