# data는 김도휘 형제님과 김명찬 형제님이 만들어주신 보편지향 기도 데이터를 사용하였습니다. 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

## CSV 에서 기도문 읽어오기
def read_data(path_to_file):
    """Load a CSV file into a DataFrame, reading every column as ``str``.

    Args:
        path_to_file: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path_to_file, dtype=str)

df = read_data('../../data/pray456_v3.csv')

In [4]:
df.to_csv('../../data/pray456_v3withid.csv')

In [5]:
X = df['content']
y = df['label']
print(len(X))
print(len(y))

774
774


In [6]:
X[0]

'주님, 대림시기를 맞는 교회가 회개와 화해의 생활을 하며 저희에게  오실 아기 예수님을 기쁜 마음으로 맞이할 수 있도록 도와주소서.'

In [7]:
y_quiz = df['content'].sample(50)
y_quiz.shape

(50,)

In [8]:
y_quiz.sort_index().to_csv('../../data/quiz_pray1_sample50.csv')

## y data one_hot encoding

In [9]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

print(type(y[0]), y[:5])

# integer encode: map every distinct label string to an integer class id
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y)
print(integer_encoded[:5])

# The one-hot variant below is intentionally left disabled; the integer
# class ids are used as the targets instead.
# # one_hot encode
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = integer_encoded.reshape(len(integer_encoded), )
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# print(onehot_encoded[:5])

# setup y (rebinds y from the label column to the integer-encoded array)
# y = onehot_encoded
y= integer_encoded
print(y[:5])

<class 'str'> 0    1
1    2
2    3
3    4
4    1
Name: label, dtype: object
[0 1 2 3 0]
[0 1 2 3 0]


## 띄어쓰기로 구분

In [10]:
X = [x.split() for x in X]

In [11]:
X[0]

['주님,',
 '대림시기를',
 '맞는',
 '교회가',
 '회개와',
 '화해의',
 '생활을',
 '하며',
 '저희에게',
 '오실',
 '아기',
 '예수님을',
 '기쁜',
 '마음으로',
 '맞이할',
 '수',
 '있도록',
 '도와주소서.']

## 고유 토큰 인덱싱

In [12]:
from collections import defaultdict

In [13]:
# Dictionary that assigns a unique index to every word.
# Looking up a token that is not stored yet calls the factory, which returns
# the current size of the dict, so new tokens receive the next unused integer.
token_to_index = defaultdict(lambda : len(token_to_index))

In [14]:
# 단어에 대한 고유 인덱스를 부여하는 함수
def convert_token_to_idx(token_ls):
    """Lazily turn each token sequence in ``token_ls`` into a list of ids.

    Ids are looked up in the module-level ``token_to_index`` mapping, one
    lookup per token and in order.

    Args:
        token_ls: Iterable of token sequences.

    Yields:
        One list of ids per input sequence, in input order.
    """
    for sentence in token_ls:
        ids = []
        for word in sentence:
            ids.append(token_to_index[word])
        yield ids

In [15]:
X = list(convert_token_to_idx(X))

In [16]:
# Once words have been replaced by their ids it is hard to tell which word an
# id stood for, so build the reverse lookup (id -> original word).
index_to_token = {idx: word for word, idx in token_to_index.items()}

#### 인덱싱 결과 확인 

In [17]:
import operator

In [18]:
for k,v in sorted(token_to_index.items(), key=operator.itemgetter(1))[:5]:
    print (k,v)

주님, 0
대림시기를 1
맞는 2
교회가 3
회개와 4


In [19]:
X[0]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

### 빈(empty) 단어 가방(Bag of Words) 생성

In [20]:
n_train_reviews = len(X)       # total number of documents in X (rows of the BOW matrix)
n_unique_word = len(token_to_index)  # number of unique words (dimensionality of the BOW)

In [21]:
n_unique_word

3329

### Dense NumPy array (can raise a memory error on a large corpus; this dataset is small enough to fit)

In [22]:
import numpy as np

In [23]:
bow = np.zeros((n_train_reviews, n_unique_word), dtype=np.float32)

### Scipy 패키지 활용

In [24]:
# import scipy.sparse as sps

In [25]:
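# Create an empty sparse bag-of-words matrix of shape (number of documents, number of unique words).
# NOTE: the 150,000 x 450,541 size originally quoted here does not match this notebook's data (774 documents, 3,329 unique words).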
# bow_data = sps.lil_matrix((n_train_reviews, n_unique_word), dtype=np.int8)

### 단어 가방 채우기

In [26]:
for i, tokens in enumerate(X):
    for token in tokens:
        # Count the words of the i-th document: add 1 at the column of each word id.
        bow[i, token] += 1.0

### Train / test split

In [27]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
bow_train, bow_test, y_train, y_test = train_test_split(bow, y, test_size=0.2, random_state=1212)
print(bow_train.shape, bow_test.shape, y_train.shape, y_test.shape)
print(y_train[:5])

(619, 3329) (155, 3329) (619,) (155,)
[3 2 3 1 0]


## Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

### Train

In [30]:
model.fit(bow_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

### Test

In [31]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [32]:
predict = model.predict(bow_test)
accuracy = accuracy_score(y_test, predict)

In [33]:
print('Accuracy : ',accuracy)
print(classification_report(y_test, predict))

Accuracy :  0.7870967741935484
             precision    recall  f1-score   support

          0       0.86      0.95      0.90        38
          1       0.84      0.76      0.80        42
          2       0.81      0.71      0.75        41
          3       0.64      0.74      0.68        34

avg / total       0.79      0.79      0.79       155



## Pytorch

In [34]:
import torch

In [247]:
# dataset : bow_train, bow_test, y_train, y_test
# torch.as_tensor accepts NumPy arrays as well as existing tensors and avoids
# an extra copy.  torch.tensor always copies and emits a UserWarning when it
# is handed a tensor, which happens as soon as this cell is executed a second
# time (presumably the source of the warning fragment recorded below).
bow_train, y_train, bow_test, y_test = map(
    torch.as_tensor, (bow_train, y_train, bow_test, y_test)
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [248]:
n, c = bow_train.shape
print(bow_train.shape)

torch.Size([619, 3329])


In [249]:
print(y_train.min(), y_train.max())
print(y_test.min(), y_test.max())

tensor(0) tensor(3)
tensor(0) tensor(3)


In [250]:
bs = 64  # batch size

# First `bs` rows of the training set: a sample mini-batch for trying out the model.
xb = bow_train[0:bs]  # a mini-batch from x
yb = y_train[0:bs]
# First `bs` rows of the held-out test set, used as a fixed evaluation batch.
xv = bow_test[0:bs]
yv = y_test[0:bs]

print(xb[:5])
print(yb[:5])


tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])
tensor([3, 2, 3, 1, 0])


### Logistic regression with equations

In [251]:
import math

# Trainable parameters of the hand-written model: a 3329 x 4 weight matrix of
# random normals scaled by 1/sqrt(3329), and a zero bias with one entry per class.
weights = (torch.randn(3329, 4) / math.sqrt(3329)).requires_grad_()
bias = torch.zeros(4).requires_grad_()

def log_softmax(x):
    """Return the log-softmax of ``x`` taken over its last dimension.

    Computed as ``x - logsumexp(x)``.  ``logsumexp`` is evaluated in a
    numerically stable way, so large logits no longer overflow ``exp`` (the
    naive ``x.exp().sum().log()`` turns into ``inf`` and the result into
    ``-inf``/``nan``).

    Args:
        x: Tensor of unnormalised scores (logits); classes on the last axis.

    Returns:
        Tensor of the same shape as ``x`` holding log-probabilities.
    """
    return x - x.logsumexp(-1, keepdim=True)

def model(xb):
    """Apply the linear layer (module-level ``weights``/``bias``) and return log-softmax scores."""
    logits = xb @ weights + bias
    return log_softmax(logits)

def nll(pred, gt):
    """Negative log-likelihood: minus the mean of ``pred[i, gt[i]]`` over all rows.

    Args:
        pred: 2-D tensor of per-class log-probabilities, one row per sample.
        gt: 1-D integer tensor with the target class index of every row.

    Returns:
        Scalar tensor with the averaged loss.
    """
    rows = range(gt.shape[0])
    picked = pred[rows, gt]
    return -picked.mean()

loss_func = nll

def accuracy(out, y):
    """Return the fraction of rows whose highest-scoring class equals the target.

    Args:
        out: 2-D tensor of per-class scores, one row per sample.
        y: 1-D tensor of target class indices.

    Returns:
        Scalar float tensor in ``[0, 1]``.
    """
    hits = torch.argmax(out, dim=1) == y
    return hits.float().mean()

print(loss_func(model(xb), yb), accuracy(model(xb), yb))
print(loss_func(model(xv), yv), accuracy(model(xv), yv))


tensor(1.3676, grad_fn=<NegBackward>) tensor(0.3750)
tensor(1.3780, grad_fn=<NegBackward>) tensor(0.3594)


In [252]:
from IPython.core.debugger import set_trace

lr = 0.5  # learning rate
epochs = 30  # how many epochs to train for

# Plain mini-batch SGD on the hand-written model: `weights` and `bias` are
# updated in place after every batch of at most `bs` rows.
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        start_i = i * bs
        end_i = start_i + bs
        xb, yb = bow_train[start_i:end_i], y_train[start_i:end_i]

        ## Feed forward
        pred = model(xb)
        loss = loss_func(pred, yb)

        ## Backpropagation
        loss.backward()

        # Take the gradient step outside autograd, then reset each gradient
        # so the next batch does not accumulate on top of it.
        with torch.no_grad():
            weights -= weights.grad * lr
            weights.grad.zero_()
            bias -= bias.grad * lr
            bias.grad.zero_()

    # Per-epoch report: loss/accuracy on the last training batch (xb, yb)
    # followed by loss/accuracy on the fixed evaluation batch (xv, yv).
    print(
        "Epoch:", epoch,
        loss_func(model(xb), yb), accuracy(model(xb), yb),
        loss_func(model(xv), yv), accuracy(model(xv), yv),
    )

Epoch: 0 tensor(1.0614, grad_fn=<NegBackward>) tensor(0.8605) tensor(1.1742, grad_fn=<NegBackward>) tensor(0.6094)
Epoch: 1 tensor(0.9005, grad_fn=<NegBackward>) tensor(0.8837) tensor(1.0259, grad_fn=<NegBackward>) tensor(0.7344)
Epoch: 2 tensor(0.7864, grad_fn=<NegBackward>) tensor(0.9070) tensor(0.9295, grad_fn=<NegBackward>) tensor(0.7500)
Epoch: 3 tensor(0.6993, grad_fn=<NegBackward>) tensor(0.9070) tensor(0.8620, grad_fn=<NegBackward>) tensor(0.7812)
Epoch: 4 tensor(0.6296, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.8122, grad_fn=<NegBackward>) tensor(0.7969)
Epoch: 5 tensor(0.5722, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7739, grad_fn=<NegBackward>) tensor(0.8125)
Epoch: 6 tensor(0.5239, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7438, grad_fn=<NegBackward>) tensor(0.8125)
Epoch: 7 tensor(0.4825, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7194, grad_fn=<NegBackward>) tensor(0.8125)
Epoch: 8 tensor(0.4467, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.6994, gra

In [253]:
xb

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])

In [254]:
yb

tensor([1, 2, 1, 1, 0, 3, 3, 3, 0, 3, 0, 3, 1, 3, 2, 1, 0, 3, 1, 0, 3, 0, 1, 3,
        3, 2, 3, 3, 0, 1, 3, 1, 0, 3, 0, 0, 1, 1, 3, 1, 1, 1, 0])

In [None]:
## What is nn.Module?

In [255]:
## Same model with nn.Module

class Logistic_Regression(torch.nn.Module):
    """Multinomial logistic regression whose forward pass returns log-probabilities.

    The base class and ``Parameter`` are referenced through ``torch.nn``
    because ``from torch import nn`` is only executed further down in this
    file; relying on the bare name ``nn`` here raises ``NameError`` when the
    cells are run top to bottom.

    Args:
        n_features: Size of the input vectors.  Defaults to 3329.
        n_classes: Number of output classes.  Defaults to 4.
    """

    def __init__(self, n_features=3329, n_classes=4):
        super().__init__()
        # Random normal weights scaled by 1/sqrt(n_features); zero bias.
        self.weights = torch.nn.Parameter(
            torch.randn(n_features, n_classes) / math.sqrt(n_features)
        )
        self.bias = torch.nn.Parameter(torch.zeros(n_classes))

    def forward(self, xb):
        """Return ``log_softmax(xb @ weights + bias)`` for the batch ``xb``."""
        return log_softmax(xb @ self.weights + self.bias)

# Instantiate the nn.Module version of the model and report its loss and
# accuracy on the current xb/yb batch before any training.
model = Logistic_Regression()
loss_func = nll

with torch.no_grad(): # Inspect the model's parameters without tracking gradients
    for p in model.parameters(): 
        print(p)
    print(loss_func(model(xb), yb), accuracy(model(xb), yb))

Parameter containing:
tensor([[-0.0163, -0.0103,  0.0177, -0.0262],
        [ 0.0062, -0.0261, -0.0375, -0.0041],
        [ 0.0022, -0.0186, -0.0019, -0.0214],
        ...,
        [-0.0173,  0.0015,  0.0010, -0.0011],
        [ 0.0071, -0.0025, -0.0118, -0.0003],
        [-0.0040, -0.0079, -0.0158, -0.0233]], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor(1.3852) tensor(0.2326)


In [256]:
def fit():
    """Run mini-batch SGD on the module-level ``model`` for ``epochs`` epochs.

    Takes no arguments: the data (``bow_train``, ``y_train``, ``xv``,
    ``yv``), the hyper-parameters (``n``, ``bs``, ``lr``, ``epochs``) and the
    callables ``model``, ``loss_func`` and ``accuracy`` are all read from
    module scope.  Parameters are updated in place and nothing is returned.
    After every epoch one line is printed with loss/accuracy on the last
    training batch followed by loss/accuracy on the ``xv``/``yv`` batch.
    """
    for epoch in range(epochs):
        # Ceil-division: the final batch may hold fewer than ``bs`` rows.
        for batch_no in range((n - 1) // bs + 1):
            rows = slice(batch_no * bs, batch_no * bs + bs)
            xb, yb = bow_train[rows], y_train[rows]

            # Forward pass followed by back propagation.
            loss = loss_func(model(xb), yb)
            loss.backward()

            # Gradient step outside autograd, then clear the gradients so
            # the next batch starts from zero.
            with torch.no_grad():
                for param in model.parameters():
                    param -= param.grad * lr
                model.zero_grad()

        print(
            "Epochs:", epoch,
            loss_func(model(xb), yb), accuracy(model(xb), yb),
            loss_func(model(xv), yv), accuracy(model(xv), yv),
        )

fit()

Epochs: 0 tensor(1.0650, grad_fn=<NegBackward>) tensor(0.8372) tensor(1.1886, grad_fn=<NegBackward>) tensor(0.6406)
Epochs: 1 tensor(0.9023, grad_fn=<NegBackward>) tensor(0.8605) tensor(1.0381, grad_fn=<NegBackward>) tensor(0.7188)
Epochs: 2 tensor(0.7873, grad_fn=<NegBackward>) tensor(0.9070) tensor(0.9403, grad_fn=<NegBackward>) tensor(0.7500)
Epochs: 3 tensor(0.6998, grad_fn=<NegBackward>) tensor(0.9070) tensor(0.8719, grad_fn=<NegBackward>) tensor(0.7656)
Epochs: 4 tensor(0.6298, grad_fn=<NegBackward>) tensor(0.9070) tensor(0.8213, grad_fn=<NegBackward>) tensor(0.7656)
Epochs: 5 tensor(0.5721, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7825, grad_fn=<NegBackward>) tensor(0.7969)
Epochs: 6 tensor(0.5236, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7519, grad_fn=<NegBackward>) tensor(0.8125)
Epochs: 7 tensor(0.4821, grad_fn=<NegBackward>) tensor(0.9302) tensor(0.7271, grad_fn=<NegBackward>) tensor(0.8125)
Epochs: 8 tensor(0.4461, grad_fn=<NegBackward>) tensor(0.9535) tensor(0.

In [257]:
from torch import nn
import torch.nn.functional as F

loss_func = F.cross_entropy

class Logistic_Regression(nn.Module):
    """Multinomial logistic regression whose forward pass returns raw logits.

    No softmax is applied here; the logits are meant for a loss that
    normalises them itself (``F.cross_entropy`` is the ``loss_func`` in use).

    Args:
        n_features: Size of the input vectors.  Defaults to 3329.
        n_classes: Number of output classes.  Defaults to 4.
    """

    def __init__(self, n_features=3329, n_classes=4):
        super().__init__()
        # Random normal weights scaled by 1/sqrt(n_features); zero bias.
        self.weights = nn.Parameter(
            torch.randn(n_features, n_classes) / math.sqrt(n_features)
        )
        self.bias = nn.Parameter(torch.zeros(n_classes))

    def forward(self, xb):
        """Return the logits ``xb @ weights + bias`` for the batch ``xb``."""
        return xb @ self.weights + self.bias

# Fresh, untrained instance of the logits-returning model; report its loss
# and accuracy on the current xb/yb batch.
model = Logistic_Regression()

with torch.no_grad(): # Inspect the model's parameters without tracking gradients
    for p in model.parameters(): 
        print(p)
    print(loss_func(model(xb), yb), accuracy(model(xb), yb))

Parameter containing:
tensor([[-0.0136,  0.0033, -0.0045, -0.0145],
        [ 0.0066, -0.0241, -0.0112, -0.0176],
        [-0.0022,  0.0023,  0.0227, -0.0128],
        ...,
        [ 0.0188,  0.0084, -0.0171,  0.0166],
        [ 0.0084,  0.0025,  0.0102,  0.0377],
        [-0.0081, -0.0183, -0.0146,  0.0111]], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor(1.3891) tensor(0.2326)


In [258]:
loss = loss_func(model(xb), yb)
loss.backward()

In [259]:
loss_func

<function torch.nn.functional.cross_entropy>

In [260]:
with torch.no_grad(): # Inspect the model's parameters without tracking gradients
    for p in model.parameters(): 
        print(p)
    # Clear the gradients stored on the parameters before the next training run.
    model.zero_grad()
    print(loss_func(model(xb), yb), accuracy(model(xb), yb))

Parameter containing:
tensor([[-0.0136,  0.0033, -0.0045, -0.0145],
        [ 0.0066, -0.0241, -0.0112, -0.0176],
        [-0.0022,  0.0023,  0.0227, -0.0128],
        ...,
        [ 0.0188,  0.0084, -0.0171,  0.0166],
        [ 0.0084,  0.0025,  0.0102,  0.0377],
        [-0.0081, -0.0183, -0.0146,  0.0111]], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
tensor(1.3891) tensor(0.2326)


In [261]:
fit()

Epochs: 0 tensor(1.0731, grad_fn=<NllLossBackward>) tensor(0.8140) tensor(1.1815, grad_fn=<NllLossBackward>) tensor(0.6406)
Epochs: 1 tensor(0.9092, grad_fn=<NllLossBackward>) tensor(0.8605) tensor(1.0351, grad_fn=<NllLossBackward>) tensor(0.6875)
Epochs: 2 tensor(0.7933, grad_fn=<NllLossBackward>) tensor(0.8837) tensor(0.9396, grad_fn=<NllLossBackward>) tensor(0.7344)
Epochs: 3 tensor(0.7050, grad_fn=<NllLossBackward>) tensor(0.9070) tensor(0.8725, grad_fn=<NllLossBackward>) tensor(0.7500)
Epochs: 4 tensor(0.6344, grad_fn=<NllLossBackward>) tensor(0.9070) tensor(0.8228, grad_fn=<NllLossBackward>) tensor(0.7812)
Epochs: 5 tensor(0.5763, grad_fn=<NllLossBackward>) tensor(0.9302) tensor(0.7845, grad_fn=<NllLossBackward>) tensor(0.7812)
Epochs: 6 tensor(0.5274, grad_fn=<NllLossBackward>) tensor(0.9302) tensor(0.7542, grad_fn=<NllLossBackward>) tensor(0.7969)
Epochs: 7 tensor(0.4855, grad_fn=<NllLossBackward>) tensor(0.9302) tensor(0.7298, grad_fn=<NllLossBackward>) tensor(0.7969)
Epochs: 