In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
import copy
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from tqdm.notebook import tqdm

In [2]:
# Select the CUDA device to use.
# NOTE(review): the PyTorch docs describe set_device as a no-op for a negative
# index, so this call presumably leaves the current device unchanged - confirm intent.
torch.cuda.set_device(-1)

# Seed the CUDA random number generator (the CPU generator is seeded later,
# right before the model is built).
torch.cuda.manual_seed(1)

In [3]:
class FeedNet(nn.Module):
    """Configurable feed-forward classifier wrapped in a single ``nn.Sequential``.

    For every entry of ``nhid`` the network contains an optional ``nn.Dropout``,
    an ``nn.Linear`` and an activation; a final ``nn.Linear`` maps to ``odim``.
    A one-line summary of the architecture is printed while it is built.

    Args:
        idim: Size of the input features.
        odim: Number of output units (classes).
        activation: Activation names, ``'Tanh'`` or ``'Relu'``. Either a single
            entry shared by all hidden layers or one entry per hidden layer.
            Unknown names fall back to Tanh after printing a warning.
        dropout: Dropout probabilities applied *before* each hidden linear
            layer. Either a single entry shared by all hidden layers or one
            entry per hidden layer; a value of 0 disables dropout for that layer.
        gpu: If truthy, the network is moved to CUDA.
        nhid: Sizes of the hidden layers; empty means a single linear layer.

    Raises:
        ValueError: If ``activation`` or ``dropout`` has a length other than 1,
            ``len(nhid)`` or ``len(nhid) + 1``. The last form is what the
            earlier validation demanded, so it is still accepted; the extra
            trailing entry is ignored.

    The list defaults are never mutated, so sharing them between calls is safe.
    """

    # Supported activation names -> (module class, tag used in the printed summary).
    _ACTIVATIONS = {'Tanh': (nn.Tanh, 't'), 'Relu': (nn.ReLU, 'r')}

    def __init__(self, idim = 300, odim = 4, activation = ['Tanh'], dropout = [0], gpu = 0, nhid = []):

        super(FeedNet, self).__init__()

        num_layers = len(nhid)
        # A multi-entry dropout list switches the summary to per-layer "drop" tags.
        mul_drop = len(dropout) > 1
        layer_acts = self._per_layer(activation, num_layers, 'activation')
        layer_drops = self._per_layer(dropout, num_layers, 'dropout')

        modules = []
        print('-Network {:d}'.format(idim), end='')
        n_prev = idim
        for n, act_name, p in zip(nhid, layer_acts, layer_drops):
            if p > 0:
                if mul_drop:
                    # Rates are floats, so they need a float format ({:d} raises).
                    print('-{:.1f}drop'.format(p), end='')
                modules.append(nn.Dropout(p=p))

            modules.append(nn.Linear(n_prev, n))
            n_prev = n

            if act_name in self._ACTIVATIONS:
                act_cls, tag = self._ACTIVATIONS[act_name]
            else:
                print('Unrecognizable activation Function, the defualt activation is replace.')
                act_cls, tag = nn.Tanh, 't'
            print('-{:d}{:s}'.format(n, tag), end='')
            modules.append(act_cls())

        # Output layer; with no hidden layers this is the only layer (idim -> odim).
        modules.append(nn.Linear(n_prev, odim))

        if mul_drop:
            print('-{:d}'.format(odim))
        else:
            print('-{:d}, dropout={:.1f}'.format(odim, dropout[0]))

        self.net = nn.Sequential(*modules)
        if gpu:
            self.net = self.net.cuda()

    @staticmethod
    def _per_layer(values, num_layers, what):
        """Expand a shared or per-layer setting to exactly one value per hidden layer."""
        values = list(values)
        if len(values) == 1:
            return values * num_layers
        if len(values) in (num_layers, num_layers + 1):
            return values[:num_layers]
        raise ValueError(
            "Number of {:s} entries should be 1 or equal to the number of hidden "
            "layers ({:d}), got {:d}".format(what, num_layers, len(values)))

    def forward(self, x):
        """Run ``x`` through the sequential network and return the raw logits."""
        return self.net(x)

    def evaluate(self, dataset, name= 'Dev', nlables = '4'):
        """Compute classification accuracy over an iterable of batches.

        Args:
            dataset: Iterable yielding ``(feature, label)`` batches, e.g. a
                ``DataLoader``. Labels are cast to ``long`` and flattened.
            name: Prefix used in the printed accuracy message.
            nlables: Unused; kept so existing callers keep working.

        Returns:
            ``(num_correct, acc)`` as 0-dim tensors: the number of correct
            predictions and the accuracy in percent.

        Raises:
            ValueError: If ``dataset`` yields no samples.

        The network is switched to evaluation mode and left there; callers
        that continue training must call ``train()`` again.
        """
        num_correct = 0
        num_sample = 0
        self.net.train(mode=False)
        with torch.no_grad():
            for feature, label in dataset:
                # reshape(-1) keeps the batch axis even for a batch of one;
                # squeeze() would collapse it to a 0-dim tensor.
                label = label.long().reshape(-1)
                output = self.net(feature)
                _, predictions = torch.max(output, 1)
                num_correct += (predictions == label).int().sum()
                num_sample += label.shape[0]

            if num_sample == 0:
                raise ValueError("evaluate() received a dataset without any samples")

            acc = 100.0 * (num_correct.float()/num_sample)
            print(' | {:s}-accuracy : {:4f}'.format(name, acc), end='')
        return num_correct, acc
            

Defining Dataset

In [4]:
class textData(Dataset):
    """Dataset of document embeddings and category labels read from a TSV file.

    The file must provide a ``label`` column (one of CCAT, ECAT, MCAT, GCAT)
    and a ``script`` column holding the raw document text. Each document is
    tokenized and turned into one fixed-size vector by averaging the
    L2-normalised vectors of its words.

    Args:
        docs_path: Path of the tab-separated input file.
        model: Word-vector lookup supporting ``key in model`` and ``model[key]``.
        dim: Dimensionality of the word vectors and of the document embedding.
        prefix: String prepended to every token before the lookup.
        language: Language name handed to NLTK's stop-word list.
        subwords: If true, build word vectors from sub-word units
            (``doc_emb_subword``) instead of whole words (``doc_emb``).

    Attributes:
        x: ``float32`` tensor of shape ``(n_docs, dim)`` with the embeddings.
        y: ``float32`` tensor of shape ``(n_docs, 1)`` with the category ids.
    """

    def __init__(self, docs_path, model, dim, prefix = 'en_', language = 'english', subwords = False):

        print("Loading the dataset")
        dic = {'CCAT':0, 'ECAT':1, 'MCAT':2, 'GCAT':3}
        df = pd.read_csv(docs_path, sep='\t')
        # A plain lookup (rather than Series.map) fails loudly on unknown labels.
        df['category'] = [dic[item] for item in df.label]
        df = self.tokenized(df)
        self.number_of_samples  = len(df)

        print("Generating Document Embeddings")
        embed = self.doc_emb_subword if subwords else self.doc_emb
        d_emb = embed(df, model, prefix, language, dim)

        self.x = torch.zeros([len(df), dim])
        for i, emb in enumerate(d_emb):
            # Each embedding is a (1, dim) float64 array; flatten it into row i.
            self.x[i] = torch.from_numpy(np.asarray(emb).reshape(-1))

        self.y = torch.zeros([len(df), 1])
        for i, cat in enumerate(df['category']):
            self.y[i] = cat

    def __len__(self):
        """Return the number of documents."""
        return self.number_of_samples

    def __getitem__(self, index):
        """Return the ``(embedding, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def tokenized(self, df):
        """Add a ``tokenized`` column with lower-cased, purely alphabetic tokens.

        The frame is modified in place and also returned.
        """
        df['tokenized'] = [
            [token for token in word_tokenize(text.lower()) if token.isalpha()]
            for text in df['script']
        ]
        return df

    @staticmethod
    def _unit_mean(total, count):
        """Average ``total`` over ``count`` vectors and scale it to unit length.

        Returns ``total`` unchanged (all zeros) when no vector contributed, and
        the un-normalised mean when its norm is zero, so the result never
        contains NaN.
        """
        if count == 0:
            return total
        mean = total / count
        norm = np.linalg.norm(mean)
        return mean / norm if norm > 0 else mean

    def doc_emb(self, df, model, lan,language, dim):
        """Embed each document as the normalised mean of its unit-length word vectors.

        Args:
            df: Frame with a ``tokenized`` column (lists of tokens).
            model: Word-vector lookup; keys are ``lan + token``.
            lan: Prefix prepended to each token before the lookup.
            language: Language name for the NLTK stop-word list.
            dim: Dimensionality of the vectors.

        Returns:
            A list with one ``(1, dim)`` array per document. Documents without
            any known, non-stop-word token get an all-zero vector.
        """
        # Build the stop-word set once; calling stopwords.words() per token
        # repeats the lookup and scans a list every time.
        stop_words = set(stopwords.words(language))
        document_emb = []
        progress_bar = tqdm(total=len(df), leave=False)
        progress_bar.set_description("Processing ")
        try:
            for script in df['tokenized']:
                final_emb = np.zeros((1, dim))
                counter = 0
                for word in script:
                    if word in stop_words:
                        continue
                    m_word = lan + word
                    if m_word not in model:
                        continue
                    vector = model[m_word]
                    norm = np.linalg.norm(vector)
                    if norm == 0:
                        # A zero vector has no direction; skipping avoids NaN.
                        continue
                    final_emb += vector / norm
                    counter += 1
                document_emb.append(self._unit_mean(final_emb, counter))
                progress_bar.update(1)
        finally:
            progress_bar.close()
        return document_emb

    def doc_emb_subword(self, df, model, lan,language, dim):
        """Embed each document from sub-word vectors.

        Every word vector is the mean of the vectors of its known sub-words,
        scaled to unit length; the document embedding is the normalised mean of
        those word vectors. Arguments and return value match ``doc_emb``.

        NOTE(review): relies on a ``GetSubWords1`` helper that is not defined in
        the visible part of the notebook; it must exist before this is called.
        """
        stop_words = set(stopwords.words(language))
        document_emb = []
        for script in df['tokenized']:
            final_emb = np.zeros((1, dim))
            counter = 0
            for word in script:
                if word in stop_words:
                    continue

                vector = np.zeros((1, dim), dtype='float')
                c = 0
                for subs in GetSubWords1(lan + word):
                    if subs in model:
                        c += 1
                        # Plain broadcasting add; summing a ragged
                        # [(1, dim), (dim,)] list fails on current NumPy.
                        vector = vector + model[subs]
                if np.all(vector == 0):
                    # No known sub-word (or they cancel out): skip the word.
                    continue
                vector = vector / c
                final_emb += vector / np.linalg.norm(vector)
                counter += 1

            document_emb.append(self._unit_mean(final_emb, counter))
        return document_emb

#### Here we initialize the network and run the training loop
### The train, dev, and test datasets should be prepared beforehand

In [5]:
# Load the pre-trained word vectors (word2vec format); the resulting lookup is
# passed as `model` to every textData instance built in the following cells.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
emb_model = KeyedVectors.load_word2vec_format("/Users/cons13411/xlingualembedding evaluation/Doung/en-it/en.it.combin.ruters.outputn")

In [6]:
# English training split, embedded with whole-word vectors (default 'en_' prefix
# and English stop words). NOTE(review): dim=200 presumably matches the vector
# size of emb_model - confirm.
train_set = textData('/Users/cons13411/Downloads/RCV!/mldoc/en.train.1000.txt',emb_model,
                    dim=200, subwords=False)
# NOTE(review): batches are served in file order (shuffle=False), also for training.
train_loader = DataLoader(train_set, batch_size=12, shuffle=False)
# English development split, used for model selection during training.
dev_set = textData('/Users/cons13411/Downloads/RCV!/mldoc/en.dev.txt', emb_model,
                    dim=200, subwords=False)
dev_loader = DataLoader(dev_set, batch_size=12, shuffle=False)

Loading the dataset
Generating Document Embeddings


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Loading the dataset
Generating Document Embeddings


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

In [7]:
# Seed NumPy and PyTorch before the model is created so that weight
# initialisation and dropout masks are repeatable.
np.random.seed(1)
torch.manual_seed(1)
model = FeedNet(idim=200,nhid=[10], dropout=[0.2])
writer = SummaryWriter("/Users/cons13411/runs/")
# Use the built-in next(): iterators have no .next() method on Python 3 and
# current DataLoader iterators do not provide one either.
writer.add_graph(model, next(iter(train_loader))[0])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001,
                       weight_decay=0.0,
                       betas=(0.9, 0.999),
                       eps=1e-8,
                       amsgrad=False)
num_epoch = 500
total_instace = len(train_set)
# Start below any reachable count so the first epoch always stores a checkpoint
# and `best_model` is guaranteed to exist after training.
best_cor = -1
for epoch in range(num_epoch):
    print('Ep {:4d}'.format(epoch), end='')
    epoch_loss = 0
    runnng_correct = 0
    # evaluate() leaves the network in eval mode, so training mode has to be
    # re-enabled at the start of every epoch (once is enough, not per batch).
    model.train(mode=True)
    for data in train_loader:
        featur, label = data
        # reshape(-1) keeps a 1-D target even when the last batch holds a
        # single sample; squeeze() would turn it into a 0-dim tensor.
        label = label.long().reshape(-1)
        optimizer.zero_grad()
        output = model(featur)
        loss = criterion(output, label)
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        _,predicted = torch.max(output,1)
        runnng_correct += (predicted == label).sum().item()

    # The reported loss is the sum of the batch losses, not their mean.
    print(' | loss {:4f}'.format(epoch_loss), end='')
    print(' | Train_Accuracy:{:2f}'.format(100*(runnng_correct/total_instace)), end='')
    num_correct, accu = model.evaluate(dev_loader)
    writer.add_scalars('Accuracy', {'Train_accuracy':100*(runnng_correct/total_instace), 'Dev_accuracy':accu }, epoch)
    # Keep the weights with the highest number of correct dev predictions.
    if num_correct > best_cor:
        best_cor = num_correct
        torch.save(model.state_dict(), "./model.pth")
        best_model = copy.deepcopy(model)
        print("| saved")
    else:
        print('')


# Flush pending events and release the event file handle.
writer.flush()
writer.close()

-Network 200-10t-4, dropout=0.2
Ep    0 | loss 110.188841 | Train_Accuracy:41.900000 | Dev-accuracy : 62.400002| saved
Ep    1 | loss 90.067120 | Train_Accuracy:76.400000 | Dev-accuracy : 79.699997| saved
Ep    2 | loss 71.720166 | Train_Accuracy:78.900000 | Dev-accuracy : 81.900002| saved
Ep    3 | loss 57.918761 | Train_Accuracy:83.400000 | Dev-accuracy : 86.500000| saved
Ep    4 | loss 47.809055 | Train_Accuracy:87.100000 | Dev-accuracy : 88.900002| saved
Ep    5 | loss 41.164240 | Train_Accuracy:86.200000 | Dev-accuracy : 90.300003| saved
Ep    6 | loss 36.033276 | Train_Accuracy:88.700000 | Dev-accuracy : 91.299995| saved
Ep    7 | loss 31.566147 | Train_Accuracy:89.600000 | Dev-accuracy : 91.699997| saved
Ep    8 | loss 30.092519 | Train_Accuracy:88.900000 | Dev-accuracy : 91.699997
Ep    9 | loss 27.481609 | Train_Accuracy:89.000000 | Dev-accuracy : 91.799995| saved
Ep   10 | loss 25.503176 | Train_Accuracy:90.500000 | Dev-accuracy : 91.900002| saved
Ep   11 | loss 24.693869 | T

Ep  103 | loss 10.186024 | Train_Accuracy:95.500000 | Dev-accuracy : 92.299995
Ep  104 | loss 10.259380 | Train_Accuracy:94.700000 | Dev-accuracy : 92.099998
Ep  105 | loss 10.451052 | Train_Accuracy:95.500000 | Dev-accuracy : 92.900002
Ep  106 | loss 11.093861 | Train_Accuracy:95.400000 | Dev-accuracy : 92.199997
Ep  107 | loss 11.434726 | Train_Accuracy:94.300000 | Dev-accuracy : 92.199997
Ep  108 | loss 8.500238 | Train_Accuracy:95.800000 | Dev-accuracy : 92.299995
Ep  109 | loss 10.810313 | Train_Accuracy:95.100000 | Dev-accuracy : 92.000000
Ep  110 | loss 9.934686 | Train_Accuracy:95.400000 | Dev-accuracy : 92.199997
Ep  111 | loss 8.786099 | Train_Accuracy:95.100000 | Dev-accuracy : 91.900002
Ep  112 | loss 9.551738 | Train_Accuracy:96.000000 | Dev-accuracy : 92.099998
Ep  113 | loss 9.838837 | Train_Accuracy:95.400000 | Dev-accuracy : 91.900002
Ep  114 | loss 9.781544 | Train_Accuracy:95.200000 | Dev-accuracy : 92.099998
Ep  115 | loss 7.599399 | Train_Accuracy:97.300000 | Dev-a

Ep  208 | loss 9.303218 | Train_Accuracy:95.800000 | Dev-accuracy : 91.900002
Ep  209 | loss 8.498889 | Train_Accuracy:95.900000 | Dev-accuracy : 92.099998
Ep  210 | loss 8.640466 | Train_Accuracy:96.200000 | Dev-accuracy : 92.199997
Ep  211 | loss 9.683076 | Train_Accuracy:95.300000 | Dev-accuracy : 91.799995
Ep  212 | loss 8.185401 | Train_Accuracy:96.400000 | Dev-accuracy : 92.000000
Ep  213 | loss 8.972072 | Train_Accuracy:95.800000 | Dev-accuracy : 92.199997
Ep  214 | loss 8.647054 | Train_Accuracy:95.500000 | Dev-accuracy : 91.900002
Ep  215 | loss 8.286928 | Train_Accuracy:96.100000 | Dev-accuracy : 91.900002
Ep  216 | loss 9.752429 | Train_Accuracy:95.300000 | Dev-accuracy : 92.599998
Ep  217 | loss 9.629373 | Train_Accuracy:96.000000 | Dev-accuracy : 92.400002
Ep  218 | loss 8.706508 | Train_Accuracy:95.700000 | Dev-accuracy : 91.900002
Ep  219 | loss 8.698143 | Train_Accuracy:96.300000 | Dev-accuracy : 92.599998
Ep  220 | loss 7.427241 | Train_Accuracy:96.900000 | Dev-accurac

Ep  420 | loss 7.226664 | Train_Accuracy:97.000000 | Dev-accuracy : 92.900002
Ep  421 | loss 4.835949 | Train_Accuracy:98.100000 | Dev-accuracy : 92.799995
Ep  422 | loss 5.832998 | Train_Accuracy:97.600000 | Dev-accuracy : 92.500000
Ep  423 | loss 6.818042 | Train_Accuracy:96.900000 | Dev-accuracy : 92.799995
Ep  424 | loss 6.366802 | Train_Accuracy:97.600000 | Dev-accuracy : 92.400002
Ep  425 | loss 6.882642 | Train_Accuracy:97.200000 | Dev-accuracy : 92.699997
Ep  426 | loss 7.091166 | Train_Accuracy:97.200000 | Dev-accuracy : 92.299995
Ep  427 | loss 7.572747 | Train_Accuracy:96.300000 | Dev-accuracy : 92.599998
Ep  428 | loss 6.572835 | Train_Accuracy:96.500000 | Dev-accuracy : 92.599998
Ep  429 | loss 5.794234 | Train_Accuracy:97.500000 | Dev-accuracy : 92.099998
Ep  430 | loss 5.966709 | Train_Accuracy:97.200000 | Dev-accuracy : 92.299995
Ep  431 | loss 6.182104 | Train_Accuracy:97.300000 | Dev-accuracy : 92.199997
Ep  432 | loss 6.648570 | Train_Accuracy:96.500000 | Dev-accurac

In [8]:
# Italian test split: tokens get the 'it_' prefix and Italian stop words are
# removed, while the same emb_model as for the English splits is used.
test_set = textData('/Users/cons13411/Downloads/RCV!/mldoc/italian.test',
                    emb_model, prefix='it_', language='italian',
                    dim=200, subwords=False)
test_loader = DataLoader(test_set, batch_size=12, shuffle=False)

Loading the dataset
Generating Document Embeddings


HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))

In [None]:
# Score the best dev checkpoint on the Italian test set. evaluate() requires
# the batches to score; calling it without arguments raises a TypeError.
best_model.evaluate(test_loader, name='Test')

In [38]:
# NOTE(review): appears to be leftover scratch work - `t` and `q` are not
# referenced by any other visible cell.
t ={'a':2,'b':1}
q = []

In [39]:
# Extend the scratch list in place, one element per statement.
q += [1]
q += [2]

In [40]:
# Display the scratch list.
q

[1, 2]