In [4]:
#关于自然语言处理 的RNN 以后可以用GRU 和lstm
"""
本次作業是要讓同學接觸 NLP 當中一個簡單的 task —— 語句分類（文本分類）

給定一個語句，判斷他有沒有惡意（負面標 1，正面標 0）
"""
import numpy as np
import torch 
import cv2
import torch.nn as nn 
import pandas as pd
import os
from torch.utils.data import DataLoader,Dataset
import torchvision.transforms as transforms
import time
import torch.nn.functional as F

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
#Uitiles
#定义常用函数
def load_traing_data(path='./data/training_label.txt'):
    if 'training_label' in path:
        with open (path,'r',encoding='UTF-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split(' ') for line in lines]
        x = [line[2:]for line in lines]
        y = [line[0] for line in lines]
        return x,y
    else:
        with open(path,'r',encoding='UTF-8') as f:
            lines = f.readlines()
            x = [line.strip('\n').split(' ')for line in lines]
            return x

def load_testing_data(path='./data/testing_data.txt'):
    with open (path,'r',encoding='UTF-8') as f:
        lines = f.readlines()
        X = ["".join(line.strip('\n').split(',')[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ')for sen in X]
        return X

def evaluation(outputs,lables):
    outputs[outputs>=0.5] = 1
    outputs[outputs<0.5] = 0
    correct = torch.sum(torch.eq(outputs,lables)).item()
    return correct

In [7]:
#Word2vec
import os
import argparse
from gensim.models import word2vec
def train_word2vec(x):
    model = word2vec.Word2Vec(x,size=250,window=5, min_count=5, workers=12, iter=10, sg=1)
    return model

print("loading training data...")
train_x,y = load_traing_data("./data/training_label.txt")
train_x_nolable =load_traing_data('./data/training_nolabel.txt')

print("loading testing data")
test_x = load_testing_data('./data/testing_data.txt')
flag =False
if flag == False:
    model = train_word2vec(train_x+test_x)
    print('saving model...')
    model.save('w2v_all.model')


loading training data...
loading testing data
saving model...


In [17]:
#data process
#数据预处理
import torch.nn as nn
from gensim.models import Word2Vec

class PreProcess():
    def __init__(self,sentences,sen_len,w2v_path='./w2v_all.model'):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matix = []
    
    def get_w2v_model(self):
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
    
    def add_embedding(self,word):
        vector = torch.empty(1,self.embedding_dim)
        nn.init.uniform(vector)
        self.word2idx[word] =len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matix = torch.cat([self.embedding_matix,vector],0)
    
    def make_embedding(self,load =True):
        print('Get embedding...')
        if load:
            print('loading word to vec model...')
            self.get_w2v_model()
        else:
            raise NotImplementedError
        
        for i,word in enumerate(self.embedding.wv.vocab):
            print('get words #{}'.format(i+1),end='\r')
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matix.append(self.embedding[word])
        print('')
        self.embedding_matix = torch.tensor(self.embedding_matix)
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matix)))
        return self.embedding_matix

    def pad_sequence(self,sentence):
        if len(sentence)> self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len -len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        sentence_list =[]
        for i,sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1),end ='\r')
            sentence_idx = []
            for word in sen:
                if(word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            #每个句子变成一样的长度
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    
    def labels_to_tensor(self,y):
        y = [int(label) for label in y]
        return torch.LongTensor(y)

In [9]:
#dataset
from torch.utils import data
class TwitterDataset(data.Dataset):
    def __init__(self,X,Y):
        self.data = X
        self.label = Y 
    
    def __getitem__(self, index):
        if self.label is None:return self.data[index]
        return self.data[index],self.label[index]
    
    def __len__(self):
        return len(self.data)

In [10]:
#model
class RNN(nn.Module):
    def __init__(self,embedding,embedding_dim,hidden_dim,num_layer,dropout=0.5,fix_embedding=True):
        super(RNN,self).__init__()
        #embedding layer
        self.embedding = nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = nn.Parameter(embedding)
        #是否embedding fix 住
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layer
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,num_layers=num_layer,batch_first = True)
        self.classfier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim,1),
            nn.ReLU()
        )

    def forward(self,X):
        X = self.embedding(X)
        out,_ = self.lstm(X,None)
        out = out[:,-1,:]
        out = self.classfier(out)
        return out
        

In [40]:
#setting
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sen_len =20
fix_embedding = True
batch_size = 16
epoch =5
lr =0.001
print(device)

cuda


In [18]:
#inputs and label preprocess
preprocess = PreProcess(train_x,sen_len)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

Get embedding...
loading word to vec model...
get words #24694
total words: 24696


In [19]:
#build model
model = RNN(embedding,embedding_dim=250,hidden_dim=150,num_layer=1,dropout=0.5,fix_embedding=fix_embedding)
model = model.to(device)
# model.eval()
"""
RNN(
  (embedding): Embedding(24696, 250)
  (lstm): LSTM(250, 150, batch_first=True)
  (classfier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=150, out_features=1, bias=True)
    (2): ReLU()
  )
)
"""

RNN(
  (embedding): Embedding(24696, 250)
  (lstm): LSTM(250, 150, batch_first=True)
  (classfier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=150, out_features=1, bias=True)
    (2): ReLU()
  )
)

In [45]:
X_train,X_val ,y_train,y_val = train_x[:18000],train_x[18000:],y[:18000],y[18000:]
train_dataset = TwitterDataset(X = X_train,Y =y_train)
val_dataset = TwitterDataset(X= X_val,Y = y_val)
train_loader  = DataLoader(dataset= train_dataset,batch_size=batch_size,shuffle=True)
val_loader = DataLoader(dataset= val_dataset,batch_size = batch_size,shuffle=False)
# for i,(inputs,labels) in enumerate(train_loader):
#     print(inputs[15])
#     break
    

tensor([ 1842, 24695,    66,    68,  1972,  1973,  1974, 24694, 24694, 24694,
        24694, 24694, 24694, 24694, 24694, 24694, 24694, 24694, 24694, 24694])


In [39]:
#training
import torch.optim as optim
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Start traing.. parameter total:{}, trainable:{}\n'.format(total,trainable))
model.train()
criteion = nn.BCELoss()
t_batch = len(train_loader)
v_batch = len(val_loader)
optimizer = optim.Adam(model.parameters(),lr = lr)
best_acc = 0,0,0
for e in range(epoch):
    total_loss,total_acc =0.0,0.0
    for i,(inputs,labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.squeeze()# 去掉最外面的 dimension，好讓 outputs 可以餵進 criterion()
        loss = criteion(outputs,labels)
        loss.backward()
        optimizer.step()
        correct = evaluation(outputs,labels)
        total_acc +=(correct/batch_size)
        total_loss +=loss.item()
        print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
    print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))

    model.eval()
    with torch.no_grad():
        val_loss ,val_acc = 0.0,0.0
        for i,(inputs,labels) in enumerate(val_loader):
            #val 验证
            inputs = inputs.to(device,dtype = torch.long)
            labels = labels.to(device,dtype = torch.float)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = criteion(outputs,labels)
            correct = evaluation(outputs,labels)
            total_acc +=(correct/batch_size)
            total_loss +=loss.item()

            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            if total_acc >best_acc:
                best_acc = total_acc
                torch.save(model,'ckpt-{}.model'.format(total_acc/v_batch*100))
    model.train()       


Start traing.. parameter total:6415351, trainable:241351



RuntimeError: CUDA error: device-side assert triggered

In [29]:
#testing data input
print("loading testing data ...")
test_x = load_testing_data()
preprocess = PreProcess(test_x, sen_len)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, Y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False)

loading testing data ...
Get embedding...
loading word to vec model...
get words #24694
total words: 24696


In [None]:
#Testing 

model.eval()
ret_output = []
with torch.no_grad():
    for i,inputs in enumerate(test_loader):
        inputs = inputs.to(device,dtype = torch.long)
        outputs = model(inputs)
        outputs = outputs.squeeze()
        outputs[outputs>=0.5] = 1 # 大於等於 0.5 為正面
        outputs[outputs<0.5] = 0 # 小於 0.5 為負面
        ret_output += outputs.int().tolist()

In [None]:
#saving csv
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")