# 201710757 김지훈 
# https://github.com/201710757/NLP_SMU

In [1]:
max_length = 256

In [2]:
import pandas as pd

df = pd.read_csv('./data/sms.tsv', sep='\t')
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5572, 2)


In [3]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
classes = sorted(set(df['label']))
class_to_idx = {}
print(classes)

['ham', 'spam']


In [5]:
for i, c in enumerate(classes):
    class_to_idx.update({c:i})

In [6]:
class_to_idx

{'ham': 0, 'spam': 1}

In [7]:
nclass = len(classes)

In [8]:
print(nclass)

2


In [9]:
new_df = pd.DataFrame({'label' : df['label'],
                      'sms' : df['sms'].str.slice(
                          start=0, stop=max_length)})
# 원하는 컬럼으로 데이터 정제

In [10]:
# len(new_df)

In [11]:
new_df = pd.DataFrame(new_df.drop_duplicates())
len(new_df)

5169

In [12]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,spam,URGENT! Your Mobile number has been awarded wi...
1,ham,Glad to see your reply.
2,ham,Call me when u finish then i come n pick u.
3,spam,Congratulations - Thanks to a good friend U ha...
4,ham,"Daddy, shu shu is looking 4 u... U wan me 2 te..."


In [13]:
train_ratio = 0.9

# train dataset
s, e = 0, int(df_shuffled.shape[0] * train_ratio)
df_train = pd.DataFrame({'label' : df_shuffled['label'][s:e],
                        'sms' : df_shuffled['sms'][s:e]})

# test dataset
s, e = e, e + int(df_shuffled.shape[0] * (1.0 - train_ratio))
df_test = pd.DataFrame({'label' : df_shuffled['label'][s:e],
                        'sms' : df_shuffled['sms'][s:e]})

In [14]:
df_train.shape

(4652, 2)

In [15]:
df_test.shape

(516, 2)

In [16]:
df_train.to_csv('./sms.maxlen.uniq.shuf.train.tsv', header=False, index=False, sep='\t')
df_test.to_csv('./sms.maxlen.uniq.shuf.test.tsv', header=False, index=False, sep='\t')

In [17]:
import torch

In [18]:
# !pip uninstall torchtext -y
#!pip install torchtext==0.4.0
#!pip install torch==1.7.0+cu110 torchvision==0.8.1+cu110 torchaudio===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html

In [19]:
import torchtext
import numpy as np

In [20]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

In [21]:
batch_size = 128
num_epochs = 10

word_vec_size = 256
dropout_p = 0.3

hidden_size = 512
num_layers = 4

learning_rate = 0.001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
from data_loader import DataLoader

In [23]:
loaders = DataLoader(
            train_fn='./data/sms.maxlen.uniq.shuf.train.tsv',
            batch_size = batch_size,
            valid_ratio = .2,
            device = 0,
            max_vocab = 999999,
            min_freq = 5
)

test_loaders = DataLoader(
            train_fn='./data/sms.maxlen.uniq.shuf.test.tsv',
            batch_size = batch_size,
            valid_ratio = .01,
            device = 0,
            max_vocab = 999999,
            min_freq = 5
)

In [24]:
vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)

In [25]:
print(vocab_size, num_classes)

1527 2


In [27]:
n = 3

for i, data in enumerate(loaders.train_loader):
    labels = data.label
    texts = data.text
    
    if i > n:
        break
    print("[%d]"%i)
    print("Data Size : ", len(labels))
    
    for j in range(n):
        label = labels[j].cpu().numpy()
        text = texts[j].cpu().numpy()
        print("label : ", label)
        print("text : ", text.shape)

[0]
Data Size :  128
label :  0
text :  (7,)
label :  0
text :  (7,)
label :  0
text :  (7,)
[1]
Data Size :  128
label :  0
text :  (10,)
label :  0
text :  (10,)
label :  0
text :  (10,)
[2]
Data Size :  128
label :  0
text :  (28,)
label :  0
text :  (28,)
label :  0
text :  (28,)
[3]
Data Size :  128
label :  0
text :  (4,)
label :  0
text :  (4,)
label :  0
text :  (4,)


In [28]:
class RNN(nn.Module):
    def __init__(self, input_size, word_vec_size, hidden_size, n_classes, num_layers=4, dropout_p=0.3):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.n_classes = n_classes

        self.emb = nn.Embedding(self.input_size, self.word_vec_size)
        self.lstm = nn.LSTM(input_size=self.word_vec_size, hidden_size=self.hidden_size, num_layers=self.num_layers,
                            dropout=dropout_p, batch_first=True,
                           bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, n_classes)
        self.activation = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        x = self.emb(x)
        x, _ = self.lstm(x)
        
        out = self.activation(self.fc(x[:, -1]))
        
        return out

In [29]:
model = RNN(vocab_size, word_vec_size, hidden_size, num_classes, num_layers, dropout_p).to(device)

In [30]:
def ComputeAccr(dloader, model):
    correct = 0
    total = 0
    
    model.eval()
    for i, data in enumerate(dloader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        
        output = model(texts)
        _, output_index = torch.max(output, 1)
        
        total += labels.size(0)
        correct += (output_index == labels).sum().float()
    model.train()
    return (100 * correct / total).cpu().numpy()

In [31]:
print("ACC : %.2f"%ComputeAccr(loaders.valid_loader, model))

ACC : 12.26


In [32]:
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [33]:
total_step = len(loaders.train_loader)
for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        print("[%d]"%i)
        outputs = model(texts)
        loss = loss_func(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 10 == 0:
            print("Epoch [{}/{}], step [{}/{}], Loss : {:.4f}, Accr: {:.2f}".format(epoch+1,num_epochs, i+1, total_step, loss.item(), ComputeAccr(loaders.valid_loader, model)))
        

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [1/10], step [10/30], Loss : 0.3149, Accr: 87.74
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [1/10], step [20/30], Loss : 1.2432, Accr: 87.74
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [1/10], step [30/30], Loss : 0.2354, Accr: 87.85
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [2/10], step [10/30], Loss : 0.0426, Accr: 87.74
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [2/10], step [20/30], Loss : 0.5446, Accr: 20.22
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [2/10], step [30/30], Loss : 0.1667, Accr: 88.17
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [3/10], step [10/30], Loss : 0.2451, Accr: 86.13
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [3/10], step [20/30], Loss : 1.5062, Accr: 95.59
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [3/10], step [30/30], Loss : 0.3759, Accr: 91.08
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [4/10], step [10/30], Loss : 0.0760, Ac

In [34]:
print("ACC : %.2f"%ComputeAccr(loaders.valid_loader, model))

ACC : 96.99


In [42]:
import os
netname = './nets/rnn_weight1.pkl'

torch.save(model, netname) if os.path.exists('./nets') else os.mkdir('./nets')
torch.save(model, netname)

# 감사합니다.