In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize, wordpunct_tokenize
from tqdm.auto import tqdm

import pandas as pd

import numpy as np

from torch.utils.data import Dataset, DataLoader

In [4]:
#ИЗМЕНИТЬ ПУТЬ В ЗАВИСИМОСТИ ОТ ТОГО ГДЕ ЗАПУСКАЕМ КОД
try:
    df = pd.read_csv("ctx_quest.csv")
except:
    df = pd.read_csv("/Users/lilyakhoang/input/question_generation/ctx_quest.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,context,question
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?
1,1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...
2,2,"Managed by her father, Mathew Knowles, the gro...",When did Beyonce leave Destiny's Child and bec...
3,3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?
4,4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?


In [10]:
# потом можете добавить свою предобработку

def process_text(text):
    
    words = wordpunct_tokenize(text.lower())
    
    return words

In [8]:
all_text_data = []
all_text_data.extend(list(df['context']))
all_text_data.extend(list(df['question']))
all_text_data = list(set(all_text_data))
cleaned_text_data = [x for x in all_text_data if 'float' not in str(type(x))]
cleaned_text_data[:2]

['When did the U.N. vote to adopt the Sustainable Development Goals?',
 "In what case can a referee's decision be overturned?"]

In [11]:
word2freq = {}
lengths = []

for text in tqdm(cleaned_text_data):
  
    words = process_text(text)
    
    lengths.append(len(words))
    
    for word in words:
        
        if word in word2freq:
            word2freq[word] += 1
        else:
            word2freq[word] = 1

HBox(children=(IntProgress(value=0, max=144855), HTML(value='')))

In [12]:
#ИЗМЕНИТЬ ПУТЬ В ЗАВИСИМОСТИ ОТ ТОГО ГДЕ ЗАПУСКАЕМ КОД
local_path = "/Users/lilyakhoang/input/glove.6B/glove.6B.50d.txt"
this_folder_path = "glove.6B.50d.txt"

word2index = {'PAD': 0}
vectors = []
   
try:
    lines = open(local_path)
except:
    lines = open(this_folder_path)
    
embedding_dim = 50
# Zero vector for PAD
vectors.append(np.zeros((1, embedding_dim)))
progress_bar = tqdm(desc='Read word2vec', total=400000)

for line in lines:
    current_word = line.split()[0]
    if current_word in word2freq:

        word2index[current_word] = len(word2index)

        # current_vectors = current_parts[-embedding_dim:]
        current_vectors = line.split()[1:]
        current_vectors = np.array(list(map(float, current_vectors)))
        current_vectors = np.expand_dims(current_vectors, 0)

        vectors.append(current_vectors)

    progress_bar.update(1)

progress_bar.close()

vectors = np.concatenate(vectors)

lines.close()

HBox(children=(IntProgress(value=0, description='Read word2vec', max=400000, style=ProgressStyle(description_w…

In [13]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, vectors=vectors):#input_size
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.vectors = vectors
        self.vocab_size, self.embedding_dim = vectors.shape
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(self.vectors))
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first = True)

    def forward(self, input, hidden, debug = False):
        if debug == True: 
            print("====ENCODING_FORWARD====")
            print("input.shape", input.shape)
#         embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.embedding(input)
        output = embedded
        if debug == True: 
            print("embedded/output.shape",embedded.shape,"hidden.shape", hidden.shape  )
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 64, self.hidden_size, device=device)

In [15]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, vectors):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.vectors = vectors
        self.vocab_size, self.embedding_dim = vectors.shape
        # self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(self.vectors))
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, self.vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, debug = False):
        if debug == True:
          print("===FORWARD_DECODER===")
          print("input.shape {}, hidden.shape {}".format(input.shape,hidden.shape ))
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [16]:
class WordData(Dataset):
    def __init__(self, context , word2index, context_sequence_length=80, 
                 pad_token='PAD', verbose=True):
        
        super().__init__()
        
        self.x_data = []
        
        self.word2index = word2index
        self.context_sequence_length = context_sequence_length
        
        self.pad_token = pad_token
        self.pad_index = self.word2index[self.pad_token]
        
        self.load(context, verbose=verbose)
        
    @staticmethod
    def process_text(text):
        
        # Место для вашей предобработки
    
        words = wordpunct_tokenize(text.lower())

        return words
        
    def load(self, data, verbose=True):
        
        data_iterator = tqdm(data, desc='Loading data', disable=not verbose)
        
        for ctx_sent in data_iterator:
            ctx = self.process_text(ctx_sent)
            indexed_ctx = self.indexing(ctx)
            self.x_data.append(indexed_ctx)
    
    def indexing(self, tokenized_text):
        indexes = []
        for word in tokenized_text:
          if word in self.word2index:
            indexes.append(self.word2index[word])
        return indexes
    
    def padding(self, sequence):
        count = 0 
        paded_seq = []
        for seq_el in sequence:
          paded_seq.append(seq_el)
          count += 1
          if count >= self.context_sequence_length: break
        if count < self.context_sequence_length:
          for ind in range(count, self.context_sequence_length):
            paded_seq.append(self.pad_index)
        return paded_seq
    
    def __len__(self):
        return len(self.x_data)
    
    def __getitem__(self, idx):
        
        x = self.x_data[idx]
        x = self.padding(x, "context")
        x = torch.Tensor(x).long()

        return x

In [18]:
context = ["""Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, 
Texas, she performed in various singing and dancing competitions as a child,
and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's
Child. Managed by her father, Mathew Knowles, the group became one of the world's 
best-selling girl groups of all time."""]
           
           

SyntaxError: EOL while scanning string literal (<ipython-input-18-11fbaeed52ea>, line 1)

In [17]:
dataset = WordData(context, word2index)

NameError: name 'context' is not defined