In [1]:
import torch
import os
import csv
import random
import numpy as np
import pandas as pd
import torch.nn.functional as F

from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

In [2]:
import gdown
url = "https://drive.google.com/uc?id=1wYgIRQXGB_jKiBevr0IkKUqa8EO2VcJv&export=download"
gdown.download(url)
!unzip 'data.zip'

Downloading...
From: https://drive.google.com/uc?id=1wYgIRQXGB_jKiBevr0IkKUqa8EO2VcJv&export=download
To: /content/data.zip
100%|██████████| 40.6M/40.6M [00:00<00:00, 55.5MB/s]


Archive:  data.zip
  inflating: data/test.csv           
  inflating: data/train_label.csv    
  inflating: data/train_nolabel.csv  


In [3]:
DEVICE_NUM = 2
BATCH_SIZE = 128
EPOCH_NUM = 50
MAX_POSITIONS_LEN = 500
SEED = 4300
MODEL_DIR = 'model.pth'
lr = 0.001

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
random.seed(SEED)
np.random.seed(SEED)

torch.cuda.set_device(0)
use_gpu = torch.cuda.is_available()
# torch.cuda.set_device(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

w2v_config = {'path': 'model', 'dim': 256}
lstm_config = {'hidden_dim': 256, 'num_layers': 2, 'bidirectional': True, 'fix_embedding': True}
header_config = {'dropout': 0.5, 'hidden_dim': 512}
assert header_config['hidden_dim'] == lstm_config['hidden_dim'] or header_config['hidden_dim'] == lstm_config['hidden_dim'] * 2

In [4]:
from nltk.stem import PorterStemmer
import string
def parsing_text(text):
    # TODO: do data processing
    text = str(text)
    c1 = text.replace("\r\n","")
    c1 = c1.replace("\n","")

    doc = c1.split(" ")
    c1 = ""

    for a in doc:
        a = a.lower()
        c1 += (a+" ")

    # # remove punctuations

    data = ""
    for char in c1:
        if char not in string.punctuation:
            data += char


    # #tokenize the words
    # words = data.split(" ")
    # result = ""
    # # stemming and removing stopwords in the end
    # for a in words:
    #     a = PorterStemmer().stem(a)
    #     if(a != ""):
    #         result += (" " + a)
    return data
    #return text

def load_train_label(path='train_label.csv'):
    tra_lb_pd = pd.read_csv(path)
    label = torch.FloatTensor(tra_lb_pd['label'].values)
    idx = tra_lb_pd['id'].tolist()
    text = [parsing_text(s).split(' ') for s in tra_lb_pd['text'].tolist()]
    return idx, text, label

def load_train_nolabel(path='train_nolabel.csv'):
    tra_nlb_pd = pd.read_csv(path)
    print(len(tra_nlb_pd['text']))
    text = [parsing_text(s).split(' ') for s in tra_nlb_pd['text'].tolist()]
    return text

def load_test(path='test.csv'):
    tst_pd = pd.read_csv(path)
    idx = tst_pd['id'].tolist()
    text = [parsing_text(s).split(' ') for s in tst_pd['text'].tolist()]
    return idx, text

In [5]:
class Preprocessor:
    def __init__(self, sentences, w2v_config):
        self.sentences = sentences
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
        self.build_word2vec(sentences, **w2v_config)

    def build_word2vec(self, x, path, dim):
        if os.path.isfile(path):
            print("loading word2vec model ...")
            w2v_model = Word2Vec.load(path)
        else:
            print("training word2vec model ...")
            w2v_model = Word2Vec(x, vector_size=dim, window=5, min_count=2, workers=12, epochs=2, sg=1)
            print("saving word2vec model ...")
            w2v_model.save(path)

        self.embedding_dim = w2v_model.vector_size
        for i, word in enumerate(w2v_model.wv.key_to_index):
            #e.g. self.word2index['he'] = 1
            #e.g. self.index2word[1] = 'he'
            #e.g. self.vectors[1] = 'he' vector

            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(w2v_model.wv[word])

        self.embedding_matrix = torch.tensor(self.embedding_matrix , requires_grad=True)
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        print("total words: {}".format(len(self.embedding_matrix)))

    def add_embedding(self, word):
        # 把 word 加進 embedding，並賦予他一個隨機生成的 representation vector
        # word 只會是 "<PAD>" 或 "<UNK>"
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def sentence2idx(self, sentence):
        sentence_idx = []
        for word in sentence:
            if word in self.word2idx.keys():
                sentence_idx.append(self.word2idx[word])
            else:
                sentence_idx.append(self.word2idx["<UNK>"])
        return torch.LongTensor(sentence_idx)

class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, id_list, sentences, labels, preprocessor):
        self.id_list = id_list
        self.sentences = sentences
        self.labels = labels
        self.preprocessor = preprocessor

    def __getitem__(self, idx):
        if self.labels is None: return self.id_list[idx], self.preprocessor.sentence2idx(self.sentences[idx])
        return self.id_list[idx], self.preprocessor.sentence2idx(self.sentences[idx]), self.labels[idx]

    def __len__(self):
        return len(self.sentences)

    def collate_fn(self, data):
        id_list = torch.LongTensor([d[0] for d in data])
        lengths = torch.LongTensor([len(d[1]) for d in data])
        texts = pad_sequence(
            [d[1] for d in data], batch_first=True).contiguous()

        if self.labels is None:
            return id_list, lengths, texts

        labels = torch.FloatTensor([d[2] for d in data])
        return id_list, lengths, texts, labels

In [6]:
train_idx, train_label_text, label = load_train_label('data/train_label.csv')
no_label_text = load_train_nolabel('data/train_nolabel.csv')
test_idx, test_text = load_test('data/test.csv')
a = no_label_text + train_label_text + test_text
preprocessor = Preprocessor(a, w2v_config)

train_idx, valid_idx, train_label_text, valid_label_text, train_label, valid_label = train_test_split(train_idx, train_label_text, label, test_size=0.12)
train_dataset, valid_dataset = TwitterDataset(train_idx, train_label_text, train_label, preprocessor), TwitterDataset(valid_idx, valid_label_text, valid_label, preprocessor)

test_idx, test_text = load_test('data/test.csv')
test_dataset = TwitterDataset(test_idx, test_text, None, preprocessor)

train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = True,
                                            collate_fn = train_dataset.collate_fn,
                                            num_workers = 8)
valid_loader = torch.utils.data.DataLoader(dataset = valid_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = valid_dataset.collate_fn,
                                            num_workers = 8)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = test_dataset.collate_fn,
                                            num_workers = 8)

1178614
training word2vec model ...
saving word2vec model ...


  self.embedding_matrix = torch.tensor(self.embedding_matrix , requires_grad=True)


total words: 114547




In [7]:
# from torch.nn import Linear
import torch
import torch.nn as nn
class LSTM_Backbone(torch.nn.Module):
    def __init__(self, embedding, hidden_dim, num_layers, bidirectional, fix_embedding= False):
        super(LSTM_Backbone, self).__init__()
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False if fix_embedding else True

        self.lstm = torch.nn.LSTM(embedding.size(1), hidden_dim, num_layers=num_layers, \
                                  bidirectional=bidirectional, batch_first=True)

    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs)
        return x

class Header(torch.nn.Module):
    def __init__(self, dropout, hidden_dim):
        super(Header, self).__init__()
        # TODO: you should design your classifier module
        self.classifier = torch.nn.Sequential(nn.Dropout(0.2),
                          nn.BatchNorm1d(hidden_dim),
                          nn.Linear(hidden_dim, 48),
                          nn.LeakyReLU(0.2),
                          # nn.Dropout(0.2),
                          nn.BatchNorm1d(48),
                          nn.Linear(48,1),
                          torch.nn.Sigmoid())

    @ torch.no_grad()
    def _get_length_masks(self, lengths):
        # lengths: (batch_size, ) in cuda
        ascending = torch.arange(MAX_POSITIONS_LEN)[:lengths.max().item()].unsqueeze(
            0).expand(len(lengths), -1).to(lengths.device)
        length_masks = (ascending < lengths.unsqueeze(-1)).unsqueeze(-1)
        return length_masks

    def forward(self, inputs, lengths):
        # the input shape should be (N, L, D∗H)
        pad_mask = self._get_length_masks(lengths)
        inputs = inputs * pad_mask
        inputs = inputs.sum(dim=1)
        out = self.classifier(inputs).squeeze()
        return out

In [8]:

def train(train_loader, backbone, header, optimizer, criterion, device, epoch):

    total_loss = []
    total_acc = []


    for i, (idx_list, lengths, texts, labels) in enumerate(train_loader):
      print(i)

      lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)

      optimizer.zero_grad()
      if not backbone is None:
          inputs = backbone(inputs)
      soft_predicted = header(inputs, lengths)
      loss = criterion(soft_predicted, labels)
      total_loss.append(loss.item())
      loss.backward()
      optimizer.step()

      with torch.no_grad():
        hard_predicted = (soft_predicted >= 0.5).int()
        correct = sum(hard_predicted == labels).item()
        batch_size = len(labels)
        total_acc.append(float(correct/batch_size))


    total_acc = np.array(total_acc)
    total_loss = np.array(total_loss)
    print(total_acc)
    print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f}'.format(epoch+1, np.mean(total_loss), np.mean(total_acc)), end='\r')
    backbone.train()
    header.train()
    return np.mean(total_loss), np.mean(total_acc)




def valid(valid_loader, backbone, header, criterion, device, epoch):
    backbone.eval()
    header.eval()
    with torch.no_grad():
        total_loss = []
        total_acc = []


        for i, (idx_list, lengths, texts, labels) in enumerate(valid_loader):
            lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)

            if not backbone is None:
                inputs = backbone(inputs)
            soft_predicted = header(inputs, lengths)
            loss = criterion(soft_predicted, labels)
            total_loss.append(loss.item())

            hard_predicted = (soft_predicted >= 0.5).int()
            correct = sum(hard_predicted == labels).item()
            acc = correct * 100 / len(labels)
            total_acc.append(acc)

    total_acc = np.array(total_acc)
    total_loss = np.array(total_loss)
    print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f}'.format(epoch+1, np.mean(total_loss), np.mean(total_acc)), end='\r')
    backbone.train()
    header.train()
    return np.mean(total_loss), np.mean(total_acc)


def run_training(train_loader, valid_loader, backbone, header, epoch_num, lr, device, model_dir):
    def check_point(backbone, header, loss, acc, model_dir, max):
        # TODO
        return



    def is_stop(loss, acc):
        # TODO
        if(acc > 85):
          return True
        else:
          return False

    if backbone is None:
        trainable_paras = header.parameters()
    else:
        trainable_paras = list(backbone.parameters()) + list(header.parameters())

    optimizer = torch.optim.Adam(trainable_paras, lr=lr)
    print("!!")

    backbone.train()
    header.train()
    backbone = backbone.to(device)
    header = header.to(device)
    criterion = torch.nn.BCELoss()
    max_acc = 0;
    count = 0;
    for epoch in range(EPOCH_NUM):
        train(train_loader, backbone, header, optimizer, criterion, device, epoch)
        loss, acc = valid(valid_loader, backbone, header, criterion, device, epoch)
        print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f} '.format(epoch+1, loss, acc))
        # if ( epoch > 10):
        #   backbone.embedding.weight.requires_grad = False
        if( acc > max_acc):
          print(max_acc)
          torch.save({'backbone': backbone, 'header': header}, model_dir)
          max_acc = acc
          print("saved at", acc)
          count = 0
        else:
          count += 1
          print(count)


        if is_stop(loss, acc):
          break
        if (count >= 7):
          break
    return max_acc


In [9]:
print(len(train_loader))

1169


In [17]:
backbone = LSTM_Backbone(preprocessor.embedding_matrix, **lstm_config)
header = Header(**header_config)

max_acc = run_training(train_loader, valid_loader, backbone, header, EPOCH_NUM, lr, device, MODEL_DIR)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080


In [18]:
print(max_acc)

82.119140625


Loaded the saved model and test

In [19]:
saved_data = torch.load(MODEL_DIR)
loaded_backbone = saved_data['backbone']
loaded_header = saved_data['header']
print(loaded_backbone)
saved_backbone = loaded_backbone
saved_header = loaded_header

LSTM_Backbone(
  (embedding): Embedding(114547, 256)
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, bidirectional=True)
)


In [20]:
def run_testing(test_loader, backbone, header, device, output_path):
    with open(output_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'label'])
        header.eval()
        backbone.eval()
        with torch.no_grad():
            for i, (idx_list, lengths, texts) in enumerate(test_loader):
                lengths, inputs = lengths.to(device), texts.to(device)
                if not backbone is None:
                    inputs = backbone(inputs)
                soft_predicted = header(inputs, lengths)
                hard_predicted = (soft_predicted >= 0.5).int()
                for i, p in zip(idx_list, hard_predicted):
                    writer.writerow([str(i.item()), str(p.item())])

In [39]:
run_testing(test_loader, saved_backbone, saved_header, device, 'pred.csv')

### 4. come up with two sentences that have same words but result in different outcome，記得先把原本的model刪掉才能gdown最佳的model

In [26]:
import gdown
url = "https://drive.google.com/u/0/uc?id=1mwe0PmRFDft3bKcQu4_tL22wU4QwhWIo&export=download"
gdown.download(url)
!unzip 'model1.zip'


Downloading...
From: https://drive.google.com/u/0/uc?id=1mwe0PmRFDft3bKcQu4_tL22wU4QwhWIo&export=download
To: /content/model1.zip
100%|██████████| 338M/338M [00:01<00:00, 240MB/s]


Archive:  model1.zip
  inflating: model/model             
  inflating: model/model.pth         
  inflating: model/model.syn1neg.npy  
  inflating: model/model.wv.vectors.npy  


In [36]:
saved_data = torch.load("model/model.pth")
loaded_backbone = saved_data['backbone']
loaded_header = saved_data['header']
print(loaded_backbone)
saved_backbone = loaded_backbone
saved_header = loaded_header

LSTM_Backbone(
  (embedding): Embedding(114547, 256)
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, bidirectional=True)
)


In [37]:
new_df = pd.DataFrame({"id":[1,2],"text":["I am right, but you are wrong", "I am wrong, but you are right"]})
print(new_df)
check_idx = new_df['id'].tolist()
check_text = [parsing_text(s).split(' ') for s in new_df['text'].tolist()]
check_dataset = TwitterDataset(check_idx, check_text, None, preprocessor)
check_loader = torch.utils.data.DataLoader(dataset = check_dataset,
                      batch_size = BATCH_SIZE,
                      shuffle = False,
                      collate_fn = check_dataset.collate_fn,
                      num_workers = 8)


   id                           text
0   1  I am right, but you are wrong
1   2  I am wrong, but you are right




In [38]:
saved_header.eval()
saved_backbone.eval()
with torch.no_grad():
    for i, (idx_list, lengths, texts) in enumerate(check_loader):
        lengths, inputs = lengths.to(device), texts.to(device)
        if not saved_backbone is None:
            inputs = saved_backbone(inputs)
        soft_predicted = saved_header(inputs, lengths)
        hard_predicted = (soft_predicted >= 0.5).int()
        for i, p in zip(idx_list, hard_predicted):
          print(str(i.item()), str(p.item()))

1 0
2 1
