# data는 김도휘 형제님과 김명찬 형제님이 만들어주신 보편지향 기도 데이터를 사용하였습니다. 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## CSV 에서 기도문 읽어오기
def read_data(path_to_file):
    df = pd.read_csv(path_to_file, dtype=str)
    return df

df = read_data('../../data/pray456_v3.csv')

In [2]:
df.to_csv('../../data/pray456_v3withid.csv')

In [3]:
X = df['content']
y = df['label']
print(len(X))
print(len(y))

774
774


In [4]:
X[0]

'주님, 대림시기를 맞는 교회가 회개와 화해의 생활을 하며 저희에게  오실 아기 예수님을 기쁜 마음으로 맞이할 수 있도록 도와주소서.'

In [5]:
y_quiz = df['content'].sample(50)
y_quiz.shape

(50,)

In [6]:
y_quiz.sort_index().to_csv('../../data/quiz_pray1_sample50.csv')

## y data one_hot encoding

In [7]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

print(type(y[0]), y[:5])

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)
print(integer_encoded[:5])

# # one_hot encode
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = integer_encoded.reshape(len(integer_encoded), )
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# print(onehot_encoded[:5])

# setup y 
# y = onehot_encoded
y= integer_encoded
print(y[:5])

<class 'str'> 0    1
1    2
2    3
3    4
4    1
Name: label, dtype: object
[0 1 2 3 0]
[0 1 2 3 0]


## 띄어쓰기로 구분

In [8]:
X = [x.split() for x in X]

In [9]:
X[0]

['주님,',
 '대림시기를',
 '맞는',
 '교회가',
 '회개와',
 '화해의',
 '생활을',
 '하며',
 '저희에게',
 '오실',
 '아기',
 '예수님을',
 '기쁜',
 '마음으로',
 '맞이할',
 '수',
 '있도록',
 '도와주소서.']

## 고유 토큰 인덱싱

In [10]:
from collections import defaultdict

In [11]:
# 단어마다 고유한 인덱스를 부여하기 위한 dictionary
token_to_index = defaultdict(lambda : len(token_to_index))

In [12]:
# 단어에 대한 고유 인덱스를 부여하는 함수
def convert_token_to_idx(token_ls):
    for tokens in token_ls:
        yield [token_to_index[token] for token in tokens]
    return

In [13]:
X = list(convert_token_to_idx(X))

In [14]:
# 고유 인덱스로 변환될 경우, 원래 어떤 단어였는지 알기 어려우므로,
# 인덱스로 변환된 단어를 본래의 단어로 재변환하기 위한 dictionary 생성
index_to_token = {val : key for key,val in token_to_index.items()}

#### 인덱싱 결과 확인 

In [15]:
import operator

In [16]:
for k,v in sorted(token_to_index.items(), key=operator.itemgetter(1))[:5]:
    print (k,v)

주님, 0
대림시기를 1
맞는 2
교회가 3
회개와 4


In [17]:
X[0]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

### 빈(empty) 단어 가방(Bag of Words) 생성

In [18]:
n_train_reviews = len(X)       # 학습용 리뷰의 총 수
n_unique_word = len(token_to_index)  # 고유 단어의 갯수 (BOW의 차원의 크기) 

In [19]:
n_unique_word

3329

### numpy를 사용하면 memory error 발생 

In [20]:
import numpy as np

In [21]:
bow = np.zeros((n_train_reviews, n_unique_word), dtype=np.float32)

### Scipy 패키지 활용

In [22]:
# import scipy.sparse as sps

In [23]:
# 학습용 리뷰 수(150,000) x 고유 단어의 수(450,541)의 크기를 갖는 빈 단어가방 생성
# bow_data = sps.lil_matrix((n_train_reviews, n_unique_word), dtype=np.int8)

### 단어 가방 채우기

In [24]:
for i, tokens in enumerate(X):
    for token in tokens:
        # i번 째 리뷰에 등장한 단어들을 세서, 고유 번호에 1씩 더해준다.
        bow[i, token] += 1.0

### Train / test split

In [25]:
bow_train, bow_test, y_train, y_test = train_test_split(bow, y, test_size=0.2, random_state=1212)
print(bow_train.shape, bow_test.shape, y_train.shape, y_test.shape)
print(y_train[:5])

(619, 3329) (155, 3329) (619,) (155,)
[3 2 3 1 0]


## Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

### Train

In [28]:
model.fit(bow_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

### Test

In [29]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [30]:
predict = model.predict(bow_test)
accuracy = accuracy_score(y_test, predict)

In [31]:
print('Accuracy : ',accuracy)
print(classification_report(y_test, predict))

Accuracy :  0.7870967741935484
             precision    recall  f1-score   support

          0       0.86      0.95      0.90        38
          1       0.84      0.76      0.80        42
          2       0.81      0.71      0.75        41
          3       0.64      0.74      0.68        34

avg / total       0.79      0.79      0.79       155



## Pytorch

In [32]:
import torch

In [36]:
# dataset : bow_train, bow_test, y_train, y_test
bow_train, y_train, bow_test, y_test = map(
    torch.tensor, (bow_train, y_train, bow_test, y_test)
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
n, c = bow_train.shape
print(bow_train.shape)

torch.Size([619, 3329])


In [38]:
print(y_train.min(), y_train.max())
print(y_test.min(), y_test.max())

tensor(0) tensor(3)
tensor(0) tensor(3)


In [39]:
bs = 64  # batch size

xb = bow_train[0:bs]  # a mini-batch from x
yb = y_train[0:bs]
xv = bow_test[0:bs]
yv = y_test[0:bs]

print(xb[:5])
print(yb[:5])


tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])
tensor([3, 2, 3, 1, 0])


In [265]:
import math

weights = torch.randn(3329, 4) / math.sqrt(3329)
weights.requires_grad_()
bias = torch.zeros(4, requires_grad=True)

def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    return log_softmax(xb @ weights + bias)

def nll(pred, gt):
    return -pred[range(gt.shape[0]), gt].mean()

loss_func = nll

def accuracy(out, y):
    preds = torch.argmax(out, dim=1)
    return (preds == y).float().mean()

print(loss_func(model(xb), yb), accuracy(model(xb), yb))
print(loss_func(model(xv), yv), accuracy(model(xv), yv))


tensor(1.3787, grad_fn=<NegBackward>) tensor(0.2500)
tensor(1.3791, grad_fn=<NegBackward>) tensor(0.2812)


In [266]:
from IPython.core.debugger import set_trace

lr = 0.5  # learning rate
epochs = 100  # how many epochs to train for

for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        
        start_i = i * bs
        end_i = start_i + bs
        xb = bow_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        
        ## Feed foward
        pred = model(xb)
        loss = loss_func(pred, yb)
        
        ## Backpropagation
        loss.backward()
        
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()
    
    if epoch // 10 == 0:
        print(loss_func(model(xb), yb), accuracy(model(xb), yb), loss_func(model(xv), yv), accuracy(model(xv), yv))

tensor(1.0720, grad_fn=<NegBackward>) tensor(0.8140) tensor(1.1751, grad_fn=<NegBackward>) tensor(0.6250)
tensor(0.9095, grad_fn=<NegBackward>) tensor(0.8605) tensor(1.0279, grad_fn=<NegBackward>) tensor(0.7031)
tensor(0.7943, grad_fn=<NegBackward>) tensor(0.8837) tensor(0.9320, grad_fn=<NegBackward>) tensor(0.7344)
tensor(0.7063, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.8649, grad_fn=<NegBackward>) tensor(0.7344)
tensor(0.6359, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.8153, grad_fn=<NegBackward>) tensor(0.7656)
tensor(0.5778, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7772, grad_fn=<NegBackward>) tensor(0.7812)
tensor(0.5288, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7471, grad_fn=<NegBackward>) tensor(0.7812)
tensor(0.4869, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7228, grad_fn=<NegBackward>) tensor(0.7812)
tensor(0.4506, grad_fn=<NegBackward>) tensor(0.9535) tensor(0.7028, grad_fn=<NegBackward>) tensor(0.7812)
tensor(0.4188, grad_fn=<NegBackward>) tensor(0

In [267]:
from torch import nn

class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(3329, 4) / math.sqrt(3329))
        self.bias = nn.Parameter(torch.zeros(4))

    def forward(self, xb):
        return xb @ self.weights + self.bias

model = Mnist_Logistic()

In [261]:
loss = loss_func(model(xb), yb)
loss.backward()

with torch.no_grad():
    for p in model.parameters(): 
        print(p)
        p -= p.grad * lr
    model.zero_grad()
    print(loss_func(model(xb), yb), accuracy(model(xb), yb))

Parameter containing:
tensor([[ 5.6228e+00,  7.1804e+00,  1.5089e+00,  7.6837e+00],
        [-2.5841e-02, -6.4183e-03, -4.4941e-02,  4.8598e-03],
        [-1.1584e-02,  6.6794e-03,  1.8750e-02,  5.1246e-01],
        ...,
        [ 4.0660e-03,  4.5586e-03,  1.3938e-02,  1.3074e-02],
        [-1.5490e-02, -1.6162e-02, -1.4162e-02,  2.8925e-03],
        [-1.3852e-02, -4.7789e-03,  1.7368e-02,  3.2919e-03]],
       requires_grad=True)
Parameter containing:
tensor([5.6279, 7.1628, 1.5349, 7.6744], requires_grad=True)
tensor(-31.5796) tensor(0.8605)


In [262]:
def fit():
    for epoch in range(epochs):
        n_batches = (n - 1) // bs + 1
        for i in range(n_batches):
            start_i = i * bs
            end_i = start_i + bs
            
            xb = bow_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            
            pred = model(xb)
            loss = loss_func(pred, yb)
            
            #back propagation
            loss.backward()
            
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()
        print(loss_func(model(xb), yb), accuracy(model(xb), yb))

fit()

tensor(-36.1496, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-40.7196, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-45.2896, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-49.8596, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-54.4296, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-58.9996, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-63.5696, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-68.1396, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-72.7097, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-77.2797, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-81.8497, grad_fn=<NegBackward>) tensor(0.8605)
tensor(-86.4197, grad_fn=<NegBackward>) tensor(0.8372)
tensor(-90.9897, grad_fn=<NegBackward>) tensor(0.8372)
tensor(-95.5597, grad_fn=<NegBackward>) tensor(0.8372)
tensor(-100.1297, grad_fn=<NegBackward>) tensor(0.8140)
tensor(-104.6998, grad_fn=<NegBackward>) tensor(0.8140)
tensor(-109.2698, grad_fn=<NegBackward>) tensor(0.8140)
tensor(-113.8398, grad_fn=<NegBackward>) tensor(0.8140)
tensor