In [1]:
import datasets
from collections import Counter
import string
from gensim.utils import tokenize
from torch.utils.data import Dataset,DataLoader
from torch.nn import Module 
from torch import nn 
import torch
from tqdm import tqdm
# Use the GPU when one is available and fall back to the CPU otherwise,
# so the notebook still runs on machines without CUDA.
device='cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'ag_news' dataset through the `datasets` library.
newsdata = datasets.load_dataset('ag_news')

In [3]:
# Display the training split to check its features and number of rows.
newsdata['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})

In [4]:
# Count how often every word occurs in the training texts.
# The four special tokens are seeded with a count of one each.
words=Counter(['<unk>','<bos>','<eos>','<pad>'])
# str.maketrans('','',string.punctuation) returns the table that .translate uses
# to delete punctuation. It is identical for every sentence, so build it once.
punct_table=str.maketrans('','',string.punctuation)
for sent in newsdata['train']['text']:
    proced_sent=sent.lower().translate(punct_table)
    # Feed the counter sentence by sentence instead of first collecting
    # every token of the corpus in one large list.
    words.update(tokenize(proced_sent))


# Special tokens: unknown word, beginning and end of a sequence,
# and the padding token used to align sequences inside a batch.
vocab = set(['<unk>','<bos>','<eos>','<pad>'])


treshold=25 # A word enters the vocabulary only if it occurs more often than this

for word,cnt in words.items():
    if cnt>treshold:
        vocab.add(word)
print(f'Размер словаря {len(vocab)}')
        
   

Размер словаря 11505


In [5]:
# Number the vocabulary in sorted order. Iterating the set directly gives an
# order that depends on string hashing, which is randomised per interpreter
# start, so token ids would differ between kernel restarts.
word2ind={word:ind for ind,word in enumerate(sorted(vocab))}
# Inverse mapping: token id -> word.
ind2word={ind:word for word,ind in word2ind.items()}

In [6]:
class WordDataset(Dataset):
    """Dataset that turns raw text rows into lists of token ids.

    Each item is a dict with two keys:
      'text'  - list of ids: <bos>, one id per token of the lower-cased,
                punctuation-free text (unknown words map to <unk>), <eos>;
      'label' - the label stored in the underlying row, unchanged.

    `data` must support `len(data)` and integer indexing `data[idx]` that
    returns a mapping with 'text' and 'label' keys.
    """
    # Translation table that deletes ASCII punctuation; built once for the
    # whole class instead of once per item.
    _strip_punct=str.maketrans('','',string.punctuation)

    def __init__(self,data):
        super(WordDataset,self).__init__()
        self.data=data
        # Cache the ids of the special tokens from the module-level word2ind.
        self.unk=word2ind['<unk>']
        self.bos=word2ind['<bos>']
        self.eos=word2ind['<eos>']
        self.pad=word2ind['<pad>']
    def __getitem__(self,idx:int):
        # Fetch the single row first. Indexing as data['text'][idx] selects the
        # entire column before picking one element, which for a Hugging Face
        # dataset can mean materialising the whole column on every access.
        row=self.data[idx]
        proc_sent=row['text'].lower().translate(self._strip_punct)
        tokenized_sent=[self.bos]
        tokenized_sent.extend(
            word2ind.get(word,self.unk) for word in tokenize(proc_sent)
        )
        tokenized_sent.append(self.eos)

        return {
            'text':tokenized_sent,
            'label':row['label']
        }
    def __len__(self):
        return len(self.data)

In [7]:
# Wrap the train and test splits in WordDataset.
train_dataset = WordDataset(data=newsdata['train'])
eval_dataset = WordDataset(data=newsdata['test'])

In [8]:
def make_batch(data,max_len=256,pad_id=None):
    """Collate a list of samples into padded tensors on `device`.

    Args:
        data: list of dicts, each with a 'text' list of token ids and a 'label'.
        max_len: upper bound on the sequence length; the batch is cut to
            min(max_len, longest sequence in the batch).
        pad_id: id used to fill shorter sequences. When None, the id of
            '<pad>' is looked up in word2ind at call time, so the default
            follows the vocabulary even if it is rebuilt after this
            function was defined.

    Returns:
        dict with 'text' (LongTensor, one row per sample, all rows of equal
        length) and 'label' (LongTensor with one entry per sample).
    """
    if pad_id is None:
        pad_id=word2ind['<pad>']
    longest=max(len(sample['text']) for sample in data)
    max_len=min(max_len,longest)

    padded=[]
    for sample in data:
        # Slicing copies, so the caller's sample dicts and lists stay untouched.
        tokens=sample['text'][:max_len]
        padded.append(tokens+[pad_id]*(max_len-len(tokens)))

    return {
        'text':torch.LongTensor(padded).to(device),
        'label':torch.LongTensor([sample['label'] for sample in data]).to(device)
    }

    

In [9]:
# Both loaders build their batches with make_batch; only the training one shuffles.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=make_batch,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=make_batch,
)

In [23]:
class rnn_net(Module):
    """Sequence classifier: embedding -> 3-layer nn.RNN -> tanh -> mean over dim 1 -> linear.

    Args:
        inp_size: accepted for signature compatibility; not used by the layers.
        hidden_size: width of the embeddings and of the recurrent layers.
        out_size: number of output scores per sample.
        vocab_size: number of rows in the embedding table.
    """
    def __init__(self,inp_size,hidden_size,out_size,vocab_size):
        super().__init__()

        self.emb=nn.Embedding(num_embeddings=vocab_size,embedding_dim=hidden_size)
        # batch_first=True: the recurrent layer reads its input as (batch, seq, feature).
        self.rnn=nn.RNN(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=3,
            batch_first=True,
        )
        self.fin_lin=nn.Linear(in_features=hidden_size,out_features=out_size)
        self.tahn=nn.Tanh()
    def forward(self,x):
        """Map a batch of token ids to one score vector per sample."""
        embedded=self.emb(x)
        states,_last_hidden=self.rnn(embedded)
        # Aggregate the per-position outputs by averaging over dim 1.
        pooled=self.tahn(states).mean(dim=1)
        return self.fin_lin(pooled)

In [45]:
class gru_net(Module):
    """Sequence classifier: embedding -> 3-layer nn.GRU -> tanh -> mean over dim 1 -> linear.

    Args:
        inp_size: accepted for signature compatibility; not used by the layers.
        hidden_size: width of the embeddings and of the recurrent layers.
        out_size: number of output scores per sample.
        vocab_size: number of rows in the embedding table.
    """
    def __init__(self,inp_size,hidden_size,out_size,vocab_size):
        super(gru_net,self).__init__()

        self.emb=nn.Embedding(vocab_size,hidden_size)
        # batch_first=True is required: forward averages over dim 1 as the
        # sequence axis, i.e. it treats its input as (batch, seq). Without the
        # flag nn.GRU reads dim 0 as time, so the recurrence would run across
        # the samples of a batch and every prediction would depend on the
        # other samples in that batch.
        self.gru=nn.GRU(hidden_size,hidden_size,num_layers=3,batch_first=True)
        self.fin_lin=nn.Linear(hidden_size,out_size)
        self.tahn=nn.Tanh()
    def forward(self,text):
        """Map a (batch, seq) tensor of token ids to (batch, out_size) scores."""
        x=self.emb(text)
        x,_=self.gru(x)
        x=self.tahn(x)

        # Aggregate the per-position outputs by averaging over the sequence axis.
        agregated_x=x.mean(dim=1)

        out=self.fin_lin(agregated_x)

        return out

In [46]:
rnn_model=rnn_net(256,256,4,len(vocab)).to(device)
# The targets of this loss are class labels, not token ids, so no target value
# should be ignored. Passing the <pad> token id as ignore_index would silently
# drop every sample whose label happens to equal that id.
rnn_loss_func=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(rnn_model.parameters())



In [47]:
gru_model=gru_net(256,256,4,len(vocab)).to(device)
# The targets of this loss are class labels, not token ids, so no target value
# should be ignored. Passing the <pad> token id as ignore_index would silently
# drop every sample whose label happens to equal that id.
gru_loss_func=nn.CrossEntropyLoss()
gru_optimizer=torch.optim.Adam(gru_model.parameters())

In [48]:
def training(model,dataloader,loss_func,optimizer):
    """Run one optimisation pass over `dataloader`.

    Every batch must be a mapping with 'text' (model input) and 'label'
    (loss target). The current loss is shown as the progress-bar description.

    Returns:
        list of per-batch loss values as Python floats, in iteration order.
    """
    history=[]
    progress=tqdm(dataloader)
    for batch in progress:
        optimizer.zero_grad()

        logits=model(batch['text'])
        batch_loss=loss_func(logits,batch['label'])

        # Read the scalar before the backward pass, then update the weights.
        current=batch_loss.item()
        history.append(current)
        batch_loss.backward()
        optimizer.step()

        progress.set_description(f'{current}')
    return history

        

        

In [49]:
# One training pass of the GRU model over the training loader; keeps the per-batch losses.
gru_losses = training(gru_model, train_dataloader, gru_loss_func, gru_optimizer)

1.3989189863204956:   1%|▎                            | 46/3750 [02:17<3:04:03,  2.98s/it]


KeyboardInterrupt: 

In [None]:
# Accuracy of the GRU model on the evaluation loader.
# NOTE(review): get_accuracy is defined in a later cell, so on a fresh kernel
# that cell has to be run before this one.
get_accuracy(model=gru_model, dataloader=eval_dataloader)

In [14]:
def get_accuracy(model,dataloader):
    """Return the fraction of samples in `dataloader` that `model` classifies correctly.

    Every batch must be a mapping with 'text' (model input) and 'label'
    (true class ids). The predicted class is the argmax over dim 1 of the
    model output.

    The loader is iterated exactly once, so predictions and labels always
    come from the same batches; this also holds for shuffling loaders and
    one-shot iterators. The model is switched to eval mode for the pass and
    its previous train/eval mode is restored afterwards.

    Returns:
        0-dim float tensor holding the accuracy.
    """
    was_training=model.training
    model.eval()
    pred_labels=[]
    real_labels=[]
    try:
        with torch.no_grad():
            for batch in dataloader:
                # Reduce to class ids per batch; only ids are kept, not full score matrices.
                pred_labels.append(model(batch['text']).argmax(dim=1))
                real_labels.append(batch['label'])
    finally:
        model.train(was_training)

    pred_labels=torch.cat(pred_labels)
    real_labels=torch.cat(real_labels)

    accuracy=(pred_labels==real_labels).float().mean()
    return accuracy
        
        

In [34]:
# Accuracy of the RNN model on the evaluation loader.
# NOTE(review): none of the visible cells trains rnn_model, so this presumably
# measures a freshly initialised network - confirm before interpreting the number.
get_accuracy(model=rnn_model, dataloader=eval_dataloader)

tensor(0.2500, device='cuda:0')