In [100]:
from sklearn.datasets import load_wine
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import numpy as np

In [87]:
X = load_wine()['data']
y = load_wine()['target']

In [88]:
tree_model = DecisionTreeClassifier()
svm_model = SVC(probability=True)

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [90]:
tree_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)

In [91]:
tree_model = DecisionTreeClassifier()
# svm_model = SVC(probability=True) 
svm_model = SVC(probability=True, kernel='linear')

In [92]:
tree_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)

tree_model.score(X_test, y_test), svm_model.score(X_test, y_test)

(0.8222222222222222, 0.9333333333333333)

In [93]:
# np.argmax(tree_model.predict_proba(X_test) + svm_model.predict_proba(X_test))

In [94]:
tree_predict = tree_model.predict_proba(X_test)
svm_predict = svm_model.predict_proba(X_test)
soft_vote_predict = tree_predict + svm_predict
predict = np.argmax(soft_vote_predict, axis=1)

In [95]:
accuracy_score(predict, y_test)

0.8222222222222222

In [96]:
confusion_matrix(predict, y_test)

array([[14,  1,  0],
       [ 1, 14,  6],
       [ 0,  0,  9]], dtype=int64)

In [98]:
f1_score(y_test, predict, average='macro')

0.8203703703703704

## 디시전트리 만들기

In [6]:
import pandas as pd

In [52]:
data = {'outlook': {0: 'overcast',
  1: 'overcast',
  2: 'overcast',
  3: 'overcast',
  4: 'rainy',
  5: 'rainy',
  6: 'rainy',
  7: 'rainy',
  8: 'rainy',
  9: 'sunny',
  10: 'sunny',
  11: 'sunny',
  12: 'sunny',
  13: 'sunny'},
 'temp': {0: 'hot',
  1: 'cool',
  2: 'mild',
  3: 'hot',
  4: 'mild',
  5: 'cool',
  6: 'cool',
  7: 'mild',
  8: 'mild',
  9: 'hot',
  10: 'hot',
  11: 'mild',
  12: 'cool',
  13: 'mild'},
 'humidity': {0: 'high',
  1: 'normal',
  2: 'high',
  3: 'normal',
  4: 'high',
  5: 'normal',
  6: 'normal',
  7: 'normal',
  8: 'high',
  9: 'high',
  10: 'high',
  11: 'high',
  12: 'normal',
  13: 'normal'},
 'windy': {0: False,
  1: True,
  2: True,
  3: False,
  4: False,
  5: False,
  6: True,
  7: False,
  8: True,
  9: False,
  10: True,
  11: False,
  12: False,
  13: True},
 'play': {0: 'yes',
  1: 'yes',
  2: 'yes',
  3: 'yes',
  4: 'yes',
  5: 'yes',
  6: 'no',
  7: 'yes',
  8: 'no',
  9: 'no',
  10: 'no',
  11: 'no',
  12: 'yes',
  13: 'yes'}}

In [53]:
data = pd.DataFrame(data)

In [54]:
data

Unnamed: 0,outlook,temp,humidity,windy,play
0,overcast,hot,high,False,yes
1,overcast,cool,normal,True,yes
2,overcast,mild,high,True,yes
3,overcast,hot,normal,False,yes
4,rainy,mild,high,False,yes
5,rainy,cool,normal,False,yes
6,rainy,cool,normal,True,no
7,rainy,mild,normal,False,yes
8,rainy,mild,high,True,no
9,sunny,hot,high,False,no


In [22]:
import numpy as np
np.unique(data.play)

array(['no', 'yes'], dtype=object)

In [25]:
pk_no = np.sum(data.play == 'no')/len(data.play)
pk_yes = np.sum(data.play == 'yes')/len(data.play)

In [27]:
-(pk_no * np.log2(pk_no) + pk_yes*np.log2(pk_yes))

0.9402859586706311

In [115]:
np.unique(data.outlook, return_counts=True)

(array(['overcast', 'rainy', 'sunny'], dtype=object),
 array([4, 5, 5], dtype=int64))

In [42]:
def entropy(x):
    px = np.unique(x, return_counts=True)[1]/len(x)
    entropy_ = -np.sum(px*np.log2(px))
    # return -np.sum(np.unique(x, return_counts=True)[1]/len(x) * np.log2(np.unique(x, return_counts=True)[1]/len(x)))
    return entropy_

In [43]:
entropy(data.play)

0.9402859586706311

In [57]:
np.unique(data.outlook)

array(['overcast', 'rainy', 'sunny'], dtype=object)

In [120]:
x = data.loc[data.play == 'yes', 'play']

In [121]:
entropy(x)

-0.0

In [122]:
a = 4/14*entropy(data.loc[data.outlook == 'overcast','play'])
a

-0.0

In [123]:
b = 5/14*entropy(data.loc[data.outlook == 'rainy','play'])
b

0.3467680694480959

In [124]:
c = 5/14*entropy(data.loc[data.outlook == 'sunny','play'])
c

0.3467680694480959

In [71]:
s = a+b+c
s

0.6935361388961918

In [72]:
entropy(data.play) - s

0.24674981977443933

In [81]:
def info_gain(data, x, y):
    total = entropy(data[y])
    vals, counts = np.unique(data[x], return_counts=True)
    
    gain = [ count/len(data)*entropy(data.loc[data[x] == val, y]) for val, count in zip(vals, counts)]
    return total - np.sum(gain)
    

In [82]:
# np.unique(data[x], return_counts=True)[1]/len(data)*\
# entropy(data.loc[data[x]])

# (entropy(data.play) - 4/14*entropy(data.loc[data.outlook == 'overcast','play']) +\
# 5/14*entropy(data.loc[data.outlook == 'rainy','play']) +\
# 5/14*entropy(data.loc[data.outlook == 'sunny','play']))

In [83]:
info_gain(data,'outlook','play')

0.24674981977443933

In [84]:
info_gain(data,'temp','play')

0.02922256565895487

In [85]:
info_gain(data,'windy','play')

0.04812703040826949

In [86]:
data.columns

Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')

In [95]:
def max_gain(data, x, y):
    return x[np.argmax([info_gain(data, i, y) for i in x])]

In [96]:
features = data.columns[:-1]
target = data.columns[-1]

In [97]:
max_gain(data, features, target)

'outlook'

In [99]:
data.loc[data.outlook == 'overcast', 'play']

0    yes
1    yes
2    yes
3    yes
Name: play, dtype: object

In [100]:
data.loc[data.outlook == 'sunny', 'play']

9      no
10     no
11     no
12    yes
13    yes
Name: play, dtype: object

In [101]:
data.loc[data.outlook == 'rainy', 'play']

4    yes
5    yes
6     no
7    yes
8     no
Name: play, dtype: object

In [103]:
sub_data = data.loc[data.outlook == 'sunny', :]

In [104]:
max_gain(sub_data, features, target)

'humidity'

In [105]:
np.unique(data.humidity)

array(['high', 'normal'], dtype=object)

In [106]:
sub_data.loc[sub_data.humidity == 'high', 'play']

9     no
10    no
11    no
Name: play, dtype: object

In [107]:
sub_data.loc[sub_data.humidity == 'normal', 'play']

12    yes
13    yes
Name: play, dtype: object

In [108]:
sub_data = data.loc[data.outlook == 'rainy', :]

In [109]:
max_gain(sub_data, features, target)

'windy'

In [110]:
np.unique(data.windy)

array([False,  True])

In [113]:
sub_data.loc[sub_data.windy == True, 'play']

6    no
8    no
Name: play, dtype: object

In [114]:
sub_data.loc[sub_data.windy == False, 'play']

4    yes
5    yes
7    yes
Name: play, dtype: object

In [188]:
def entropy(x):
    px = np.unique(x, return_counts=True)[1]/len(x) # numpy 사용
    # px = x.value_counts()/ np.sum(x.value_counts())  # 시리즈  values 응용
    return -np.sum(px*np.log2(px))

def info_gain(data, x, y):
    total = entropy(data[y])
    vals, counts = np.unique(data[x], return_counts=True)
    gain = np.sum([ count/len(data)*entropy(data.loc[data[x]==val, y]) for val, count in zip(vals, counts)])
    return total - gain
                                     
def max_gain(data, x, y):
    return x[np.argmax([info_gain(data,i,y) for i in x])]

In [161]:
def make_tree(data,x,y):
    best_feature = max_gain(data,x,y)
    tree = {best_feature:{}}
    return tree

In [193]:
def make_tree(data,x,y):
    try:
        if np.unique(data[y]).size == 1:
            return np.unique(data[y])[0]
        best_feature = max_gain(data,x,y)
        columns = x[ x != best_feature]
        tree = {best_feature:{}}
        for val in np.unique(data[best_feature]):
            sub_data = data.loc[data[best_feature] == val, :]
            sub_tree = make_tree(sub_data, columns, y)
            tree[best_feature][val] = sub_tree
        return tree
    except Exception as e:
        print(e)

In [194]:
np.unique(data.loc[data.outlook == 'overcast', 'play']).size

1

In [195]:
features = data.columns[:-1]
target = data.columns[-1]

In [196]:
make_tree(data, features, target)

{'outlook': {'overcast': 'yes',
  'rainy': {'windy': {False: 'yes', True: 'no'}},
  'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}

In [197]:
def make_tree(data,x,y):
    if np.unique(data[y]).size == 1:
        return np.unique(data[y])[0]
    best_feature = max_gain(data,x,y)
    columns = x[ x != best_feature]
    tree = {best_feature:{}}
    for val in np.unique(data[best_feature]):
        sub_data = data.loc[data[best_feature] == val, :]
        sub_tree = make_tree(sub_data, columns, y)
        tree[best_feature][val] = sub_tree
    return tree

In [198]:
features = data.columns[:-1]
target = data.columns[-1]
make_tree(data, features, target)

{'outlook': {'overcast': 'yes',
  'rainy': {'windy': {False: 'yes', True: 'no'}},
  'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}

In [199]:
from pprint import pprint

In [200]:
pprint(make_tree(data, features, target))

{'outlook': {'overcast': 'yes',
             'rainy': {'windy': {False: 'yes', True: 'no'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}


In [262]:
data = {'no_insects': {0: True,
  1: True,
  2: True,
  3: False,
  4: True,
  5: True,
  6: True,
  7: True,
  8: True,
  9: False},
 'no_dead': {0: True,
  1: True,
  2: False,
  3: True,
  4: True,
  5: True,
  6: False,
  7: False,
  8: True,
  9: False},
 'no_wilting': {0: True,
  1: True,
  2: True,
  3: True,
  4: True,
  5: True,
  6: False,
  7: True,
  8: True,
  9: True},
 'no_diseases': {0: True,
  1: True,
  2: False,
  3: True,
  4: True,
  5: True,
  6: False,
  7: False,
  8: True,
  9: True},
 'tree_health': {0: 'Good',
  1: 'Good',
  2: 'Poor',
  3: 'Good',
  4: 'Good',
  5: 'Good',
  6: 'Poor',
  7: 'Poor',
  8: 'Good',
  9: 'Poor'}}

In [263]:
data = pd.DataFrame(data)

In [264]:
features = np.array(['no_insects', 'no_wilting', 'no_diseases'])
target = 'tree_health'

In [281]:
def make_tree(data,x,y, parent_class=None):
    try:
        if np.unique(data[y]).size == 1:
            return np.unique(data[y])[0]
        elif len(data) == 0:
            return 'case1'
        elif len(x) == 0:
            return parent_class
        best_feature = max_gain(data,x,y)
        columns = x[ x != best_feature]
        tree = {best_feature:{}}
        for val in np.unique(data[best_feature]):
            vals, counts = np.unique(data.loc[data[best_feature], y], return_counts=True)
            parent_class = vals[np.argmax(counts)]
            sub_data = data.loc[data[best_feature] == val, :]
            sub_tree = make_tree(sub_data, columns, y, parent_class)
            tree[best_feature][val] = sub_tree
        return tree
    except Exception as e:
        print(e)

In [282]:
features = np.array(['no_insects', 'no_wilting', 'no_diseases'])
target = 'tree_health'
make_tree(data, features, target)

{'no_diseases': {False: 'Poor',
  True: {'no_insects': {False: {'no_wilting': {True: 'poor'}}, True: 'Good'}}}}

In [277]:
x = pd.DataFrame(
    {
        'no_insects': [False],
        'no_dead' : [True],
        'no_wilting' : [True],
        'no_diseases' : [True],
        'tree_health' : ['poor']
    }
)

In [278]:
data = pd.concat([data, x])

In [279]:
aa = data.loc[(data['no_diseases'] == True) & (data['no_insects'] == False), :] 

In [280]:
np.unique(aa['tree_health'])[np.argmax(np.unique(aa['tree_health'], return_counts=True)[1])]

'poor'

In [273]:
data.loc[(data['no_diseases'] == True) & (data['no_insects'] == False) &
              (data['no_wilting']==True), :] 

Unnamed: 0,no_insects,no_dead,no_wilting,no_diseases,tree_health
3,False,True,True,True,Good
9,False,False,True,True,Poor
0,False,True,True,True,poor


In [None]:
## 랜덤하게 발생하는 변수

In [287]:
np.random.seed(100)
X = np.random.randint(10, 100, size=(50, 5))
y = np.random.choice([0,1], 50)
df = pd.DataFrame(np.c_[X,y])

In [288]:
df.rename({ i:'col_' + str(i) for i in df.columns}, axis=1, inplace=True)

In [290]:
df.rename({'col_5' : 'target'}, axis='columns', inplace=True)

In [293]:
X = df.iloc[:,:-1].apply(lambda x: x>=50)
y = df.iloc[:,-1]

In [294]:
df = pd.concat([X,y], axis=1)

In [297]:
pprint(make_tree(df, df.columns[:-1], df.columns[-1]))

{'col_0': {False: {'col_1': {False: {'col_2': {False: 1,
                                               True: {'col_4': {False: {'col_3': {True: 0}},
                                                                True: 0}}}},
                             True: {'col_2': {False: {'col_3': {False: 1,
                                                                True: {'col_4': {False: 0,
                                                                                 True: 0}}}},
                                              True: 1}}}},
           True: {'col_4': {False: {'col_2': {False: {'col_1': {False: 0,
                                                                True: {'col_3': {False: 0,
                                                                                 True: 0}}}},
                                              True: {'col_1': {False: {'col_3': {False: 0,
                                                                                 True: 1}},
             