In [7]:
max_length=256 #sms 최대길이

## 1. 데이터 불러오기

In [8]:
import pandas as pd

In [9]:
df=pd.read_csv('sms.tsv',sep='\t',)
print (df.columns)
print (df.shape)

Index(['label', 'sms'], dtype='object')
(5572, 2)


In [10]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# 클래스 파악
classes = sorted(set(df['label']))
class_to_idx={}

for i, c in enumerate(classes):
    class_to_idx.update({c: i})
    
nclass=len(classes)

print ("# of classes: %d"%nclass)
print (classes)
print (class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


## 2. 새로운 DataFrame

### 1) 'label,sms' 만 남기기
### 2) 최대 텍스트 길이 만큼 자르기 # pandas.Series.str.slice

- '성별, 가사' 만 남기려면?

In [12]:
new_df = pd.DataFrame({'label':df['label'],
                      'sms':df['sms'].str.slice(
                      start=0, stop=max_length)
                      })

### 3) 중복 제거

In [13]:
len(new_df)

5572

In [14]:
new_df=pd.DataFrame(new_df.drop_duplicates())

In [15]:
len(new_df)

5169

### 4) 셔플

In [16]:
df_shuffled=new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,There the size of elephant tablets & u shove u...
1,ham,Prof: you have passed in all the papers in thi...
2,ham,"It'll be tough, but I'll do what I have to"
3,ham,1 in cbe. 2 in chennai.
4,ham,G.W.R


### 5) train, test 나누기

In [17]:
# train: test = 9: 1
# train: test = 540: 60 -> train:valid:test=432:108:60
train_ratio=0.9

#train dataset
s,e = 0, int(df_shuffled.shape[0]*train_ratio) # #of rows
df_train = pd.DataFrame({'label':df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})
print ("index for train: %d-%d"%(s,e))

#test dataset
s,e=e,e+int(df_shuffled.shape[0]*(1.0-train_ratio))
print ("index for test: %d-%d"%(s,e))
df_test=pd.DataFrame({'label':df_shuffled['label'][s:e],
                        'sms':df_shuffled['sms'][s:e]})

index for train: 0-4652
index for test: 4652-5168


In [18]:
#column 수 확인
print (df_train.shape)
print (df_test.shape)

(4652, 2)
(516, 2)


### 6) 저장

In [19]:
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv',header=False, index=False, sep='\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv',header=False, index=False, sep='\t')

In [20]:
!pip install torchtext==0.4.0

Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[?25l[K     |██████▏                         | 10 kB 23.4 MB/s eta 0:00:01[K     |████████████▍                   | 20 kB 16.7 MB/s eta 0:00:01[K     |██████████████████▌             | 30 kB 10.5 MB/s eta 0:00:01[K     |████████████████████████▊       | 40 kB 8.9 MB/s eta 0:00:01[K     |██████████████████████████████▉ | 51 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████████████| 53 kB 1.3 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.0:
      Successfully uninstalled torchtext-0.11.0
Successfully installed torchtext-0.4.0


In [21]:
import torchtext
import numpy as np

In [22]:
from data_loader import DataLoader

# RNN+SMS 구현
## 0.1 라이브러리 임포트

In [23]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

## 0.2 하이퍼파라미터 셋팅

In [24]:
# Hyper-parameters
batch_size=128
num_epochs=10

word_vec_size=256
dropout_p=0.3

hidden_size=512
num_layers=4

learning_rate=0.0001

In [25]:
#Device configuration
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1. SMS train, test dataset 가져오기

In [26]:
from data_loader import DataLoader

In [27]:
loaders=DataLoader(
    train_fn='./sms.maxlen.uniq.shuf.train.tsv',
    batch_size=batch_size,
    valid_ratio=.2,
    device=-1,
    max_vocab=999999,
    min_freq=5,
)

In [28]:
test_loaders=DataLoader(
    train_fn='./sms.maxlen.uniq.shuf.test.tsv',
    batch_size=batch_size,
    valid_ratio=.01,
    device=-1,
    max_vocab=999999,
    min_freq=5,
)

## 2. 대략적인 데이터 형태

In [29]:
print ("[train]=",len(loaders.train_loader.dataset),"[valid]=",len(loaders.valid_loader.dataset))

vocab_size=len(loaders.text.vocab)
num_classes=len(loaders.label.vocab)
print ("[vocab]=",vocab_size,'[classes]=',num_classes)

[train]= 3722 [valid]= 930
[vocab]= 1540 [classes]= 2


## 3. 데이터 로드함수
학습시킬 때 batch_size 단위로 끊어서 로드하기 위함

### 데이터 로드함수 이해하기

In [30]:
n=3 #샘플로 출력할 데이터 갯
for i, data in enumerate(loaders.train_loader):
    labels=data.label
    texts=data.text
    
    if i>n:
        break
    
    print ("[%d]"%i)
    print ("한번에 로드되는 데이터 크기:",len(labels))
    
    # 출력
    for j in range(n):
        label=labels[j].numpy()
        text=texts[j].numpy()
        print ("label:",label)
        print ("text:",text.shape)

[0]
한번에 로드되는 데이터 크기: 128
label: 0
text: (8,)
label: 0
text: (8,)
label: 0
text: (8,)
[1]
한번에 로드되는 데이터 크기: 128
label: 0
text: (19,)
label: 0
text: (19,)
label: 1
text: (19,)
[2]
한번에 로드되는 데이터 크기: 10
label: 0
text: (60,)
label: 0
text: (60,)
label: 0
text: (60,)
[3]
한번에 로드되는 데이터 크기: 128
label: 0
text: (5,)
label: 0
text: (5,)
label: 0
text: (5,)


## 4. 모델선언

In [31]:
class RNN(nn.Module):
    def __init__(self,
                 input_size,
                 word_vec_size,
                 hidden_size, 
                 n_classes,
                 num_layers=4,
                 dropout_p=0.3
                ):
        super(RNN, self).__init__()
        
        self.input_size=input_size
        self.sord_vec_size=word_vec_size
        self.hidden_size = hidden_size
        self.n_classes=n_classes
        self.num_layers = num_layers
        self.dropout_p=dropout_p
        
        self.emb=nn.Embedding(input_size,word_vec_size)
        
        self.lstm=nn.LSTM(input_size=word_vec_size,
                         hidden_size=hidden_size,
                         num_layers=num_layers,
                             dropout=dropout_p,
                             batch_first=True,
                             bidirectional=True)
        
        self.fc = nn.Linear(hidden_size*2, num_classes)
        self.activation=nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        x=self.emb(x)
        
        x,_=self.lstm(x)
        
        out=self.activation(self.fc(x[:,-1]))
        
        return out

In [32]:
model=RNN(input_size=vocab_size,
         word_vec_size=word_vec_size,
         hidden_size=hidden_size,
         n_classes=num_classes,
         num_layers=num_layers,
         dropout_p=dropout_p)

In [32]:
def ComputeAccr(dloader, imodel):
    correct=0
    total=0
    
    model.eval() #test mode
    for i,data in enumerate(dloader):
        texts=data.text.to(device)
        labels=data.label.to(device)
        
        # Forward prop.
        output=model(texts) 
        _, output_index=torch.max(output,1) 
        
        total+=labels.size(0)
        correct+=(output_index==labels).sum().float()
    model.train()
    return (100*correct/total).cpu().numpy() 

In [33]:
print ("Accuracy of Test Data: %.2f"%ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 13.66


## 5. loss, optimizer

In [34]:
loss_func=nn.NLLLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

## 6. 학습

In [35]:
# Train the model
total_step = len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts=data.text.to(device) 
        labels=data.label.to(device) 
        
        print ("[%d]" %i)
        
        # Forward pass
        outputs = model(texts)
        loss = loss_func(outputs, labels)
        
        # Backward prop. & optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 10 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accr:{:.2f}'.format(epoch+1, num_epochs, i+1, total_step,loss.item(),ComputeAccr(loaders.valid_loader,model)))

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [1/10], Step [10/30], Loss: 0.5870, Accr:86.34
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [1/10], Step [20/30], Loss: 0.1645, Accr:86.34
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [1/10], Step [30/30], Loss: 2.3479, Accr:86.34
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [2/10], Step [10/30], Loss: 0.2583, Accr:86.34
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [2/10], Step [20/30], Loss: 0.6211, Accr:86.34
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [2/10], Step [30/30], Loss: 0.2413, Accr:86.34
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [3/10], Step [10/30], Loss: 0.2489, Accr:86.34
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [3/10], Step [20/30], Loss: 0.1587, Accr:86.34
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [3/10], Step [30/30], Loss: 0.1981, Accr:87.10
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [4/10], Step [10/30], Loss: 0.8491, Accr:88.39
[10]
[11]


## 7. 테스트

In [36]:
print ("Accuracy of valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of valid Data: 94.62


## 8. 학습된 파라미터 저장

In [39]:
netname='./rnn_weight.pkl'
torch.save(model, netname,)

## 9. 학습된 파라미터 로드

In [44]:
netname='./rnn_weight.pkl'
model=torch.load(netname)
print ("Accuracy of Test Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 94.62
