In [44]:
### GET PREPARED DATASET###
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
data = pd.read_csv('./ENB2012_data.csv',engine = 'python',encoding='ascii')
data.columns = [u'Relative Compactness',u'Surface Area',u'Wall Area',
u'Roof Area',
u'Overall Height',
u'Orientation',
u'Glazing Area',
u'Glazing Area Distribution',
u'Heating Load',
u'Cooling Load']
data_ = (data - data.mean(axis = 0))/data.std(axis = 0)
for i in range (0,len(data_['Cooling Load'].values)):
    data_['Cooling Load'].values[i]=int(data_['Cooling Load'].values[i])
y= data_['Cooling Load']
X = data_.drop(('Heating Load'), axis = 1).drop(('Cooling Load'), axis = 1)   

In [54]:
### Get Splitted Data###
import mysk
from sklearn.svm import SVC
from sklearn import discriminant_analysis
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, 
                                                    random_state = 31)

In [55]:
svc_model = SVC(kernel = 'linear', C = 1)
svc_model.fit(X_train, y_train)
print (np.mean(y_train != svc_model.predict(X_train)), \
               np.mean(y_test != svc_model.predict(X_test)))

0.235408560311 0.216535433071


In [56]:
for i in ["linear","rbf","poly","sigmoid"]:
    for c in [1,3,7,20]:
        svc_model = SVC(kernel = i, C = c)
        svc_model.fit(X_train, y_train)
        print (np.mean(y_train != svc_model.predict(X_train)), \
               np.mean(y_test != svc_model.predict(X_test)))
#rbf and poly are ok with c=20

0.235408560311 0.216535433071
0.235408560311 0.216535433071
0.235408560311 0.216535433071
0.190661478599 0.157480314961
0.178988326848 0.192913385827
0.147859922179 0.181102362205
0.108949416342 0.165354330709
0.0856031128405 0.157480314961
0.210116731518 0.188976377953
0.188715953307 0.196850393701
0.155642023346 0.196850393701
0.103112840467 0.153543307087
0.309338521401 0.228346456693
0.389105058366 0.303149606299
0.373540856031 0.291338582677
0.352140077821 0.259842519685


In [57]:
lda_model = discriminant_analysis.LinearDiscriminantAnalysis()
lda_model.fit(X_train.values, y_train.values)
print (np.mean(y_train != lda_model.predict(X_train)), \
               np.mean(y_test != lda_model.predict(X_test)))
print(confusion_matrix(lda_model.predict(X_train), y_train))
print(confusion_matrix(lda_model.predict(X_test), y_test)) 

0.22373540856 0.267716535433
[[ 84  19   0   0]
 [ 13 257  41   0]
 [  0  24  58  18]
 [  0   0   0   0]]
[[ 49  16   0   0]
 [  6 119  15   0]
 [  0  29  18   2]
 [  0   0   0   0]]




In [58]:
mysk.draw_points(X, y)
mysk.draw_sep_curve(lda_model)
mysk.draw_bayes()
plt.scatter(lda_model.means_[:, 0], lda_model.means_[:, 1], color = ['b', 'r'], s = 100)

TypeError: unhashable type: 'slice'

In [59]:
qda_model = discriminant_analysis.QuadraticDiscriminantAnalysis()
qda_model.fit(X_train.values, y_train.values)
print (np.mean(y_train != qda_model.predict(X_train)), \
               np.mean(y_test != qda_model.predict(X_test)))
print(confusion_matrix(qda_model.predict(X_train), y_train))
print(confusion_matrix(qda_model.predict(X_test), y_test)) 

0.332684824903 0.338582677165
[[ 16   5   0   0]
 [ 81 274  46   6]
 [  0  21  53  12]
 [  0   0   0   0]]
[[  3   1   0   0]
 [ 52 153  21   0]
 [  0  10  12   2]
 [  0   0   0   0]]




In [60]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print (np.mean(y_train != nb_model.predict(X_train)), \
               np.mean(y_test != nb_model.predict(X_test)))
print(confusion_matrix(nb_model.predict(X_train), y_train))
print(confusion_matrix(nb_model.predict(X_test), y_test))

0.643968871595 0.685039370079
[[ 97 152   0   0]
 [  0   0   0   0]
 [  0 146  68   0]
 [  0   2  31  18]]
[[55 80  0  0]
 [ 0  0  0  0]
 [ 0 83 23  0]
 [ 0  1 10  2]]


In [61]:
nbber_model = BernoulliNB()
nbber_model.fit(X_train, y_train)
print (np.mean(y_train != nbber_model.predict(X_train)), \
               np.mean(y_test != nbber_model.predict(X_test)))
print(confusion_matrix(nbber_model.predict(X_train), y_train))
print(confusion_matrix(nbber_model.predict(X_test), y_test))

0.618677042802 0.653543307087
[[ 97 152   0   0]
 [  0   0   0   0]
 [  0 148  99  18]
 [  0   0   0   0]]
[[55 80  0  0]
 [ 0  0  0  0]
 [ 0 84 33  2]
 [ 0  0  0  0]]


In [62]:
depth_array = np.arange(1, 100)
tree_model = tree.DecisionTreeClassifier()
grid = GridSearchCV(tree_model, param_grid = {'max_depth': depth_array}, cv = 5)
grid.fit(X_train, y_train)
min_err_cv = 1 - grid.best_score_
min_err_cv

0.12840466926070038

In [63]:
tree_model = tree.DecisionTreeClassifier(max_depth = None)
tree_model.fit(X_train, y_train)
print (np.mean(y_train != tree_model.predict(X_train)), \
               np.mean(y_test != tree_model.predict(X_test)))
print(confusion_matrix(tree_model.predict(X_train), y_train))
print(confusion_matrix(tree_model.predict(X_test), y_test))
#Wow, such power, many efficiency, so knowledge

0.0 0.106299212598
[[ 97   0   0   0]
 [  0 300   0   0]
 [  0   0  99   0]
 [  0   0   0  18]]
[[ 53   9   0   0]
 [  2 144   5   0]
 [  0  11  28   0]
 [  0   0   0   2]]


In [64]:
logistic_model = LogisticRegression(C=1.0)
logistic_model.fit(X_train, y_train)
print (np.mean(y_train != logistic_model.predict(X_train)), \
               np.mean(y_test != logistic_model.predict(X_test)))
print(confusion_matrix(logistic_model.predict(X_train), y_train))
print(confusion_matrix(logistic_model.predict(X_test), y_test))

0.198443579767 0.157480314961
[[ 76  14   0   0]
 [ 21 278  41   0]
 [  0   8  58  18]
 [  0   0   0   0]]
[[ 44  10   0   0]
 [ 11 146   9   0]
 [  0   8  24   2]
 [  0   0   0   0]]
