In [8]:
import numpy as np
import pandas as pd

original_data = pd.read_csv('C:/Users/icear/diagnosis.data', sep = '\t', encoding = 'utf-16', # 인코딩이 참..
                           names = ['temperature','nausea', 'lumbar_pain', 'urine_pushing', 'micturition_pains',
                                   'burning_of_urethra', 'inflamation', 'nephritis']) # feature의 name이 없으므로 추가

In [10]:
print(original_data.shape)

original_data.head()

(120, 8)


Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,355,no,yes,no,no,no,no,no
1,359,no,no,yes,yes,yes,yes,no
2,359,no,yes,no,no,no,no,no
3,360,no,no,yes,yes,yes,yes,no
4,360,no,yes,no,no,no,no,no


- temperature에 소수점이 아니고 쉼표가 들어가있다. 바꿔줘야 할 필요가 있을 듯

In [14]:
original_data.columns

Index(['temperature', 'nausea', 'lumbar_pain', 'urine_pushing',
       'micturition_pains', 'burning_of_urethra', 'inflamation', 'nephritis'],
      dtype='object')

In [16]:
original_data['temperature'] = original_data['temperature'].apply(lambda x: x.replace(',','.'))
original_data.head()

Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,35.5,no,yes,no,no,no,no,no
1,35.9,no,no,yes,yes,yes,yes,no
2,35.9,no,yes,no,no,no,no,no
3,36.0,no,no,yes,yes,yes,yes,no
4,36.0,no,yes,no,no,no,no,no


In [17]:
from sklearn.preprocessing import LabelEncoder

le_nausea = LabelEncoder()
le_nausea.fit(original_data['nausea'])

print(le_nausea.classes_)
print(le_nausea.transform(['no', 'yes', 'yes']))
print(le_nausea.inverse_transform([0, 0, 1]))

['no' 'yes']
[0 1 1]
['no' 'no' 'yes']


- label encoder로 문자를 숫자로 바꿔줘본다.
- 보통은 one-hot encoding을 하는데 여기에서는 label encoding을 하는 이유는 이진으로 이루어진 데이터라서 그렇다.

In [18]:
dicted_data = original_data.copy()

dicted_data['nausea'] = le_nausea.transform(original_data['nausea'])
dicted_data.head()

Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,35.5,0,yes,no,no,no,no,no
1,35.9,0,no,yes,yes,yes,yes,no
2,35.9,0,yes,no,no,no,no,no
3,36.0,0,no,yes,yes,yes,yes,no
4,36.0,0,yes,no,no,no,no,no


- feature nausea은 수치형으로 바뀐것을 확인 할 수 있다.

In [23]:
tmp = ['lumbar_pain', 'urine_pushing', 'micturition_pains', 'burning_of_urethra', 'inflamation', 'nephritis']
les = {'nausea' : le_nausea}

for x in tmp:
    les[x] = LabelEncoder()
    dicted_data[x] = les[x].fit_transform(original_data[x])
    
dicted_data.head()

Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,35.5,0,1,0,0,0,0,0
1,35.9,0,0,1,1,1,1,0
2,35.9,0,1,0,0,0,0,0
3,36.0,0,0,1,1,1,1,0
4,36.0,0,1,0,0,0,0,0


- 전부다 바뀐것을 확인 할 수 있다.

In [24]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

y = dicted_data['inflamation']

features = ['temperature']
x = dicted_data[features]

model = DecisionTreeClassifier(min_samples_split = 20, random_state = 1)

- Decsion Tree를 사용할 것이고 노드를 분할할때 데이터의 갯수가 20개 미만이면 분할하지 않는것으로 한다.

In [25]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: inflamation, dtype: int32

In [26]:
x.head()

Unnamed: 0,temperature
0,35.5
1,35.9
2,35.9
3,36.0
4,36.0


In [27]:
model.fit(x, y)

DecisionTreeClassifier(min_samples_split=20, random_state=1)

In [33]:
from graphviz import Source
from sklearn.tree import export_graphviz
from IPython.display import SVG
inflamation_label = les['inflamation'].inverse_transform(dicted_data['inflamation'])

graph = Source(tree.export_graphviz(model, out_file = None,
                                   feature_names = features,
                                   class_names = inflamation_label,
                                   filled = True))

display(SVG(graph.pipe(format = 'svg')))

ExecutableNotFound: failed to execute ['dot', '-Kdot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

- 이거 문제가 있구만..

In [34]:
pred_y = model.predict(x)
pred_y_prob = model.predict_proba(x)

print(np.shape(pred_y))
print(np.shape(pred_y_prob))

(120,)
(120, 2)


In [36]:
print('정답 :', y[0])
print('예측 :', pred_y[0])
print('확률 :', pred_y_prob[0])
print(model.classes_)

정답 : 0
예측 : 0
확률 : [0.5625 0.4375]
[0 1]


In [37]:
mean_acc = model.score(x,y)
print(mean_acc)

0.7083333333333334


- temperature 변수만 썼는데 70%의 정확도를 나타낸다. 나머지를 다쓰면 얼마나 나올까?

In [41]:
y = dicted_data['inflamation']

features = ['temperature','nausea','lumbar_pain','urine_pushing','micturition_pains','burning_of_urethra']
x = dicted_data[features]

model = DecisionTreeClassifier(min_samples_split=20, random_state=99)

model.fit(x,y)
mean_accuracy = model.score(x,y)
print(mean_accuracy)

1.0


- 100%의 정확도가 나왔다

In [42]:
# feature의 기여도 측정
print(features)
print(model.feature_importances_)

['temperature', 'nausea', 'lumbar_pain', 'urine_pushing', 'micturition_pains', 'burning_of_urethra']
[0.22586919 0.         0.         0.48360656 0.29052425 0.        ]


In [53]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split

features = ['temperature','nausea','lumbar_pain','urine_pushing','micturition_pains','burning_of_urethra']

train_d, test_d = train_test_split(dicted_data, test_size=0.1)

train_y = train_d['inflamation']
train_x = train_d[features]

test_y = test_d['inflamation']
test_x = test_d[features]

- train과 test로 나눠서 해보자 train 9 : test 1 비율이다

In [54]:
model = DecisionTreeClassifier(min_samples_split=22, random_state=99)
model.fit(train_x, train_y)

mean_accuracy_for_train = model.score(train_x,train_y)
print(mean_accuracy_for_train)

mean_accuracy_for_test = model.score(test_x, test_y)
print(mean_accuracy_for_test)

0.9259259259259259
0.8333333333333334


In [55]:
# min_samples_split 인자 수 줄이기
model = DecisionTreeClassifier(min_samples_split=2, random_state=99)
model.fit(train_x, train_y)

mean_accuracy_for_train = model.score(train_x,train_y)
print(mean_accuracy_for_train)

mean_accuracy_for_test = model.score(test_x, test_y)
print(mean_accuracy_for_test)

1.0
1.0


- 생각보다 min_samples_split의 영향을 많이 받는다.
- 데이터에 따라 min_samples_split을 어떻게 해야하는지 판단할 필요가 있어보인다.