In [None]:
cd /content/drive/MyDrive/2022-1/캡스톤/comebackhome/server_ver2

/content/drive/MyDrive/2022-1/캡스톤/comebackhome/server_ver2


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import random
# Seed Python's `random` module and PyTorch's default generator with one shared value.
SEED = 777
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f311b592890>

In [None]:
# Load the dataset CSV from Google Drive and show its (rows, columns) shape.
DATASET_CSV = '/content/drive/MyDrive/2022-1/캡스톤/comebackhome/dataset_ver2.csv'
df = pd.read_csv(DATASET_CSV)
df.shape

(2588, 25)

In [None]:
# Replace missing values in every column with an empty string.
df = df.fillna(value="")

# 1. 전처리

### 1) 불용어 제거

In [None]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from konlpy.tag import Okt
from konlpy.tag import Kkma

In [None]:
# Read pyfiles/stopwords.txt without a header row and keep its first column as the
# stop-word list.
# NOTE(review): the next cell reassigns `stopwords` with a hard-coded list, so the
# values loaded here are discarded when the notebook runs top to bottom — confirm
# which list is intended.
stopwords_txt = pd.read_csv("pyfiles/stopwords.txt", header=None)
stopwords = stopwords_txt.iloc[:, 0].tolist()

In [None]:
# Hard-coded stop-word list; this assignment replaces whatever `stopwords` held before.
stopwords = [
    '이', '가', '을', '를', '요',
    '너무', '진짜', '부터',
    '나', '저', '제', '랑', '에',
    '할', '해', '해요', '했', '했어요',
    '돼요', '됐어요', '되어',
]

In [None]:
# Shared Okt tagger instance from konlpy; remove_stopwords() calls its `morphs` method.
okt = Okt()

In [None]:
def remove_stopwords(text):
    """Tokenise ``text`` with ``okt.morphs`` and drop tokens listed in ``stopwords``.

    Args:
        text: The string to tokenise.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    kept_tokens = [token for token in okt.morphs(text) if token not in stopwords]
    return ' '.join(kept_tokens)

In [None]:
# Strip stop words from both free-text columns. Each value is converted to str
# first (previously only 'Onset' was), so a non-string cell in either column
# cannot reach the tokenizer.
for text_column in ('Chief complaint', 'Onset'):
    df[text_column] = df[text_column].map(str).apply(remove_stopwords)

### 2) 데이터 나누기

In [None]:
# Separate the target column from the remaining columns and show both shapes.
TARGET_COLUMN = 'level2'
y = df[TARGET_COLUMN]
X = df.drop(columns=TARGET_COLUMN)
display(X.shape, y.shape)

(2588, 24)

(2588,)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=723,
)
display(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(2070, 24)

(518, 24)

(2070,)

(518,)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the full label column so `le.classes_` covers every class.
le = LabelEncoder()
y = le.fit_transform(df['level2'])

# The train/test split above was taken from the raw labels, so the split targets
# must be encoded too: later cells build integer tensors from `y_train` / `y_test`
# and size the network output with `len(le.classes_)`.
# Note: re-running this cell without re-running the split raises, because the
# split targets are then already encoded.
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Keep only the four model inputs. `.copy()` gives each split its own frame so the
# column assignments in the following cells do not raise SettingWithCopyWarning.
feature_columns = ['Chief complaint', 'Onset', 'Age', 'Sex']
X_train = X_train[feature_columns].copy()
X_test = X_test[feature_columns].copy()

### 3) 데이터 수치화 

#### 나이, 성별

In [None]:
def ageband_to_age(x):
    """Convert an age-band label into a representative integer age.

    Args:
        x: An age band such as "영아", "신생아", "유아", "10세 미만", or a label
            that starts with digits (for example "40대"). Integer input is
            returned unchanged.

    Returns:
        1 for "영아" / "신생아", 5 for "유아", 9 for "10세 미만"; otherwise the
        integer formed by all leading digits of the label.

    Raises:
        ValueError: If the label is not a named band and does not start with a digit.
    """
    # Local imports keep this cell runnable on its own.
    import numbers
    import re

    # Already numeric (e.g. an age supplied directly): nothing to parse.
    if isinstance(x, numbers.Integral):
        return int(x)

    label = str(x).strip()
    named_bands = {"영아": 1, "신생아": 1, "유아": 5, "10세 미만": 9}
    if label in named_bands:
        return named_bands[label]

    # Take every leading digit rather than a fixed two-character prefix, so
    # one-digit ("5세") and three-digit ("100세 이상") labels parse correctly.
    leading_digits = re.match(r"\d+", label)
    if leading_digits is None:
        raise ValueError(f"unrecognised age band: {x!r}")
    return int(leading_digits.group())

def sex_to_int(x):
    """Encode a sex label as an integer code.

    Args:
        x: The label to encode.

    Returns:
        0 for "남성", 1 for "여성", and 2 for any other value.
    """
    for code, label in enumerate(("남성", "여성")):
        if x == label:
            return code
    return 2

In [None]:
# Convert the Age and Sex columns of both splits to integers.
# The age converter defined above is `ageband_to_age`; the name `ageband` does not
# exist in this notebook and raised NameError.
for split_frame in (X_train, X_test):
    split_frame['Age'] = split_frame['Age'].map(ageband_to_age)
    split_frame['Sex'] = split_frame['Sex'].map(sex_to_int)

#### 주요 증상, 증상 시작 시점

In [None]:
# One TF-IDF vectorizer per text column, both configured identically: tokens are
# runs of one or more word characters, and min_df is set to 3.
tfidf_options = {'token_pattern': r'\w{1,}', 'min_df': 3}
complaint_vectorizer = TfidfVectorizer(**tfidf_options)
onset_vectorizer = TfidfVectorizer(**tfidf_options)

# Fit each vectorizer on the training split only, then apply it to the test split.
train_complaint_text, test_complaint_text = X_train['Chief complaint'], X_test['Chief complaint']
X_train_tfidf_complaint = complaint_vectorizer.fit_transform(train_complaint_text).toarray()
X_test_tfidf_complaint = complaint_vectorizer.transform(test_complaint_text).toarray()

train_onset_text, test_onset_text = X_train['Onset'], X_test['Onset']
X_train_tfidf_onset = onset_vectorizer.fit_transform(train_onset_text).toarray()
X_test_tfidf_onset = onset_vectorizer.transform(test_onset_text).toarray()

In [None]:
# TF-IDF columns are named by running position: complaint terms first, onset terms
# continuing the numbering after them.
n_complaint_terms = X_train_tfidf_complaint.shape[1]
n_onset_terms = X_train_tfidf_onset.shape[1]
column_name_complaint = [str(i) for i in range(n_complaint_terms)]
column_name_onset = [str(i) for i in range(n_complaint_terms, n_complaint_terms + n_onset_terms)]


def _assemble_features(base_frame, complaint_tfidf, onset_tfidf):
    """Return one frame: Sex, Age, complaint TF-IDF columns, onset TF-IDF columns.

    The index of ``base_frame`` is reset so its rows line up positionally with the
    TF-IDF arrays, which carry no index of their own.
    """
    parts = [
        base_frame[['Sex', 'Age']].reset_index(drop=True),
        pd.DataFrame(complaint_tfidf, columns=column_name_complaint),
        pd.DataFrame(onset_tfidf, columns=column_name_onset),
    ]
    return pd.concat(parts, axis=1)


# Build both splits through the same helper so their column layout cannot diverge.
X_train = _assemble_features(X_train, X_train_tfidf_complaint, X_train_tfidf_onset)
X_test = _assemble_features(X_test, X_test_tfidf_complaint, X_test_tfidf_onset)

In [None]:
# Display the assembled training feature frame (Sex, Age, then the TF-IDF columns).
X_train

Unnamed: 0,Sex,Age,0,1,2,3,4,5,6,7,...,230,231,232,233,234,235,236,237,238,239
0,0,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,1,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.221139,0.0,0.0,0.0,0.0,0.0
3,0,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.288472,0.0,0.0,0.0,0.0,0.0
4,1,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.590752,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2065,0,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2066,1,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.349328,0.0,0.0,0.0,0.0,0.0
2067,0,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.349328,0.0,0.0,0.0,0.0,0.0
2068,1,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.135766,0.0,0.0,0.0,0.0,0.0


In [None]:
# Display the assembled test feature frame (same column layout as the training frame).
X_test

Unnamed: 0,Sex,Age,0,1,2,3,4,5,6,7,...,230,231,232,233,234,235,236,237,238,239
0,1,30,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.244668,0.0,0.0,0.0,0.0,0.0
1,0,60,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,1,60,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.302900,0.0,0.0,0.0,0.0,0.0
3,0,50,0.0,0.0,0.819683,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.233739,0.0,0.0,0.0,0.0,0.0
4,1,20,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.257102,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0,20,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.271477,0.0,0.0,0.0,0.0,0.0
514,1,30,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.349328,0.0,0.0,0.0,0.0,0.0
515,0,60,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.306798,0.0,0.0,0.0,0.0,0.0
516,1,20,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.349328,0.0,0.0,0.0,0.0,0.0


# 2. 학습

### 1) DNN 구조 정의


In [None]:
class Net(nn.Module):
    """Fully connected classifier: in_features -> 512 -> 64 -> out_features.

    Each of the two hidden layers is followed by ReLU and dropout (p=0.2). The
    output layer applies no activation, so ``forward`` returns raw class scores.

    Args:
        in_features: Number of input features per sample.
        out_features: Number of output scores per sample.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.l1 = nn.Linear(in_features, 512)
        self.l2 = nn.Linear(512, 64)
        self.l3 = nn.Linear(64, out_features)
        self.dropout = nn.Dropout(p=0.2)
        # Re-initialise the weight matrices (biases untouched) with Xavier-normal,
        # in layer order.
        for layer in (self.l1, self.l2, self.l3):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the output-layer scores."""
        for hidden_layer in (self.l1, self.l2):
            x = self.dropout(F.relu(hidden_layer(x)))
        return self.l3(x)
    

### 2) 학습


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Feature frames become float32 tensors.
x_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32, device=device)
x_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32, device=device)

# Labels become int64 tensors. Going through np.asarray() hands torch a plain
# array, so the result depends only on the label values and not on any pandas
# index the labels may still carry.
y_train_tensor = torch.tensor(np.asarray(y_train), dtype=torch.long, device=device)
y_test_tensor = torch.tensor(np.asarray(y_test), dtype=torch.long, device=device)

In [None]:
from sklearn.metrics import accuracy_score

# Put the model and loss on the device the training tensors already live on.
train_device = x_train_tensor.device
clf = Net(X_train.shape[1], len(le.classes_)).to(train_device)
lr = 0.01
nb_epoch = 50000
log_every = 5000
optimizer = optim.Adam(clf.parameters(), lr=lr)
loss = nn.CrossEntropyLoss().to(train_device)
# Multiply the learning rate by 0.1 after every 5000 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5000, gamma=0.1)

clf.train()
for epoch in range(nb_epoch):
    # One full-batch optimisation step per epoch.
    hypothesis = clf(x_train_tensor)
    cost = loss(hypothesis, y_train_tensor)
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    scheduler.step()
    if epoch % log_every == 0:
        # Training accuracy reuses the forward pass above, i.e. with dropout active.
        acc = (torch.argmax(hypothesis, 1) == y_train_tensor).sum().item() / len(y_train)
        print(epoch, cost.item(), acc)

        # Evaluate with dropout disabled and without recording gradients.
        clf.eval()
        with torch.no_grad():
            pred = clf(x_test_tensor).argmax(1).cpu().numpy()
        print('Test Accuracy test : %.2f' % accuracy_score(y_test, pred))
        clf.train()

# Leave the trained model in evaluation mode so later cells run inference without
# dropout. (Assignment form avoids echoing the module repr as cell output.)
clf = clf.eval()

0 4.8582763671875 0.021256038647342997
Test Accuracy test : 0.19
5000 0.02318190224468708 0.9884057971014493
Test Accuracy test : 0.94
10000 0.010959895327687263 0.9932367149758454
Test Accuracy test : 0.93
15000 0.01136029977351427 0.9917874396135266
Test Accuracy test : 0.93
20000 0.010264976881444454 0.991304347826087
Test Accuracy test : 0.93
25000 0.012388549745082855 0.991304347826087
Test Accuracy test : 0.93
30000 0.010401180014014244 0.9917874396135266
Test Accuracy test : 0.93
35000 0.010633143596351147 0.9951690821256038
Test Accuracy test : 0.93
40000 0.01297275722026825 0.9903381642512077
Test Accuracy test : 0.93
45000 0.010709310881793499 0.9946859903381643
Test Accuracy test : 0.93


### 3) 커스텀데이터로 검증

In [None]:
# A single hand-written sample used as a smoke test of the trained pipeline.
# Only 'Chief complaint', 'Onset', 'Age' and 'Sex' are read below.
data = {
    "Chief complaint" : "배가 너무 아파요",
    "Age" : 30,
    "Sex" : "남성",
    "Onset" : "어제",
    "Height" : 165,
    "Weight" : 100
}

# Same text preprocessing as the training data.
data['Chief complaint'] = remove_stopwords(data['Chief complaint'])
data['Onset'] = remove_stopwords(data['Onset'])

complaint_tfidf = complaint_vectorizer.transform([data['Chief complaint']]).toarray()[0].tolist()
onset_tfidf = onset_vectorizer.transform([data['Onset']]).toarray()[0].tolist()

# Feature order matches the training frame: Sex, Age, complaint TF-IDF, onset TF-IDF.
# Sex goes through sex_to_int so it is encoded exactly like the training column.
sample_features = [sex_to_int(data['Sex']), data['Age']] + complaint_tfidf + onset_tfidf

# Build the tensor on whichever device the model's parameters are on.
model_device = next(clf.parameters()).device
sample_tensor = torch.tensor([sample_features], dtype=torch.float32, device=model_device)

# Inference must run in eval mode (dropout off) and needs no gradient tracking.
clf.eval()
with torch.no_grad():
    out = clf(sample_tensor).argmax(dim = 1).item()
le.inverse_transform([out])

# 3. 모델 저장


### LE, TFIDF vectorizer, DNN 가중치 저장

In [None]:
from joblib import dump, load

In [None]:
cd /content/drive/MyDrive/2022-1/캡스톤/comebackhome/server_ver2

/content/drive/MyDrive/2022-1/캡스톤/comebackhome/server_ver2


In [None]:
# Save the fitted LabelEncoder's `classes_` array to level2_classes.npy in the
# current working directory.
# NOTE(review): if the classes are Python strings stored as an object array,
# loading will need np.load(..., allow_pickle=True) — confirm against the loader.
np.save('level2_classes.npy', le.classes_)

In [None]:
# Serialise both fitted TF-IDF vectorizers with joblib into the current directory.
dump(complaint_vectorizer, "complaint_vectorizer.pkl")
dump(onset_vectorizer, "onset_vectorizer.pkl")

['onset_vectorizer.pkl']

In [None]:
# Save only the model's state_dict (its parameter tensors), not the Net class itself;
# to reload, construct Net with the same layer sizes and call load_state_dict().
torch.save(clf.state_dict(), 'dnn_state_dict.pth')