In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import os

In [2]:
class DecoderRNN(nn.Module):
    """Feed-forward scorer over a fixed-length window of token ids.

    Despite the name, no recurrent layer is active in this module: tokens are
    embedded, the embeddings of one sample are flattened into a single vector,
    and two linear layers map that vector to log-probabilities over the
    vocabulary.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1, seq_len=14):
        """Build the embedding table and the two linear layers.

        Args:
            embed_size: dimension of each token embedding.
            hidden_size: width of the hidden linear layer.
            vocab_size: number of distinct token ids; also the output width.
            num_layers: accepted for interface compatibility; currently unused.
            seq_len: number of tokens in every input sequence. ``linear1`` is
                sized for ``embed_size * seq_len`` inputs. Defaults to 14, the
                value that used to be hard-coded.
        """
        super(DecoderRNN, self).__init__()
        self.seq_len = seq_len
        self.embedding_layer = nn.Embedding(vocab_size, embed_size)
        self.linear1 = nn.Linear(embed_size * seq_len, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, captions):
        """Return log-probabilities over the vocabulary for each sample.

        Args:
            captions: integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-probabilities
            (each row exponentiates and sums to 1).

        Raises:
            ValueError: if ``captions`` is not 2-D or its second dimension
                differs from ``seq_len``.
        """
        if captions.dim() != 2 or captions.size(1) != self.seq_len:
            raise ValueError(
                "captions must have shape (batch, {}), got {}".format(
                    self.seq_len, tuple(captions.shape)))

        embed = self.embedding_layer(captions)

        # linear1 consumes one vector per sample, so the (seq_len, embed_size)
        # embeddings have to be flattened before the projection; feeding the
        # 3-D tensor directly mismatches linear1's in_features.
        flat = embed.reshape(embed.size(0), -1)

        out = F.relu(self.linear1(flat))
        out = self.linear2(out)
        return F.log_softmax(out, dim=1)

In [3]:
def read_paths(datadir='sentence', max_subdirs=2):
    """Collect the files stored in the first sub-folders of ``datadir``.

    Args:
        datadir: root folder to scan. Defaults to ``'sentence'``.
        max_subdirs: how many directories after the root (in ``os.walk``
            order) to read files from. Defaults to 2, matching the previous
            hard-coded ``[1:3]`` slice. Pass ``None`` to use every directory.

    Returns:
        A tuple ``(dir_, file_name)`` of two parallel lists: ``dir_[k]`` is the
        path of the k-th file (directory + ``'/'`` + name) and ``file_name[k]``
        is its bare name.

    Note:
        Directory and file order follow ``os.walk`` and are not sorted.
    """
    dir_, file_name = [], []

    stop = None if max_subdirs is None else 1 + max_subdirs
    # os.walk yields (dirpath, dirnames, filenames); entry 0 is datadir itself,
    # so the slice starts at 1. Taking the filenames from this single walk
    # keeps every file paired with its own directory, even when a selected
    # folder contains nested directories.
    selected = list(os.walk(datadir))[1:stop]

    for sub, _, names in selected:
        file_name += names
        dir_ += [sub + '/' + name for name in names]
    return (dir_, file_name)

In [4]:
def read_data(paths, files):
    """Tokenise sentence files and build a word-to-index vocabulary.

    Every file is read as one sentence per line. All lines but the last form
    the context; the last line is the target. Sentences are split on
    whitespace.

    Args:
        paths: paths of the text files to read.
        files: names parallel to ``paths``; ``files[i]`` is recorded for
            ``paths[i]``.

    Returns:
        A tuple ``(data, file_name, word_to_idx)`` where ``data`` is a list of
        ``(context_tokens, target_tokens)`` pairs (each a list of token lists),
        ``file_name`` is the list of recorded names, and ``word_to_idx`` maps
        each distinct word to a unique id in ``range(len(word_to_idx))``,
        including an ``'UNK'`` entry.

    Raises:
        IndexError: if a file is empty, or ``files`` is shorter than ``paths``.
    """
    tokens, data, file_name = [], [], []
    for i, path in enumerate(paths):
        # Read the file once and close it promptly.
        with open(path) as handle:
            sentences = [line.replace('\n', '') for line in handle.readlines()]

        context = sentences[:-1]
        target = [sentences[-1]]

        c_tokens = [sent.split() for sent in context]
        t_tokens = [sent.split() for sent in target]

        tokens += c_tokens + t_tokens

        data.append((c_tokens, t_tokens))
        file_name.append(files[i])

    # Give every distinct word the next free id, in order of first appearance.
    # Enumerating the raw token stream would produce ids with gaps that can
    # exceed the vocabulary size and collide with the 'UNK' id.
    word_to_idx = {}
    for sent in tokens:
        for word in sent:
            word_to_idx.setdefault(word, len(word_to_idx))
    word_to_idx.setdefault('UNK', len(word_to_idx))
    return (data, file_name, word_to_idx)

In [13]:
def view_data(data, file_name):
    """Print the file name, context tokens and target tokens of the first five samples.

    Args:
        data: sequence of ``(context, target)`` pairs.
        file_name: sequence of names parallel to ``data``.
    """
    preview = data[:5]
    for position in range(len(preview)):
        context_tokens, target_tokens = preview[position]
        print("The file is : {}".format(file_name[position]))
        print("The context data is : {}".format(context_tokens))
        print("The target data is : {}".format(target_tokens))

In [6]:
def create_data(data, word_to_idx, max_context_len=27, max_target_len=18, pad_token='UNK'):
    """Convert tokenised samples into padded index sequences for training.

    Every context sentence of a sample is paired with that sample's target, so
    a sample with k context sentences contributes k training pairs.

    Args:
        data: iterable of ``(context_sentences, target_sentences)`` pairs,
            where each element is a list of token lists.
        word_to_idx: mapping from token to integer id; must contain
            ``pad_token`` and every token in ``data``.
        max_context_len: context sentences shorter than this are padded up to
            it (default 27). Longer sentences are kept as they are.
        max_target_len: same, for each target sentence (default 18).
        pad_token: token used for padding (default ``'UNK'``).

    Returns:
        A tuple ``(train_data, max_c, max_t)``: the list of
        ``(context_ids, target_ids)`` pairs, the longest context length and
        the longest target length. Both maxima are 0 when there is nothing to
        measure.

    Raises:
        KeyError: if a token is missing from ``word_to_idx``.
    """
    train_data, len_c, len_t = [], [], []
    for context_sents, target_sents in data:
        context_data = []
        for sent in context_sents:
            # Build a new list so the caller's token lists are not padded in place.
            padded = sent + [pad_token] * (max_context_len - len(sent))
            context = [word_to_idx[word] for word in padded]
            context_data.append(context)
            len_c.append(len(context))

        target = []
        for sent in target_sents:
            # Pad by the sentence's own length; the number of target sentences
            # says nothing about how many tokens are missing.
            padded = sent + [pad_token] * (max_target_len - len(sent))
            target.extend(word_to_idx[word] for word in padded)
        len_t.append(len(target))

        # The target (last sentence of the file) is shared by all contexts.
        for context in context_data:
            train_data.append((context, target))

    max_c = max(len_c) if len_c else 0
    max_t = max(len_t) if len_t else 0
    return (train_data, max_c, max_t)

In [7]:
def train(model, data, num_epochs=10, lr=0.01, verbose=False):
    """Train ``model`` with SGD and cross-entropy loss, one sample per step.

    Args:
        model: module called as ``model(tensor)`` on a ``(1, len(context))``
            long tensor; its output must be a valid ``CrossEntropyLoss`` input
            for a ``(1, len(target))`` long target.
        data: re-iterable collection of ``(context, target)`` pairs of integer
            id lists.
        num_epochs: number of passes over ``data`` (default 10).
        lr: SGD learning rate (default 0.01).
        verbose: when true, print every context/target pair before its step.
            Off by default because it prints two lines per sample per epoch.

    Returns:
        List with the summed loss of each epoch, in order.
    """
    optimizer = optim.SGD(model.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()

    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        for context, target in data:
            if verbose:
                print("The context is : {}".format(context))
                print("The target is : {}".format(target))

            inputs = torch.tensor([context], dtype=torch.long)
            labels = torch.tensor([target], dtype=torch.long)

            # Gradients accumulate across backward() calls, so clear them
            # before computing this sample's gradient.
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_func(outputs, labels)
            loss.backward()
            optimizer.step()

            # .item() keeps a plain float instead of retaining the autograd graph.
            total_loss += loss.item()

        print(total_loss)
        epoch_losses.append(total_loss)
    return epoch_losses

In [11]:
def main():
    """Load the sentence files, preview them, vectorise them and print the vocabulary size."""
    # Hyper-parameters for the model; unused while the model lines below stay disabled.
    EMBEDDING_DIM = 128
    HIDDEN_DIM = 2

    sentence_paths, sentence_files = read_paths()
    corpus, sentence_files, vocab = read_data(sentence_paths, sentence_files)
    view_data(corpus, sentence_files)

    vec_data, _max_c, _max_t = create_data(corpus, vocab)
    print(len(vocab))

    # Model construction and training are currently disabled:
    #model = DecoderRNN(EMBEDDING_DIM, HIDDEN_DIM, len(vocab), num_layers = 1)
    #train(model, vec_data)

In [14]:
# Run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

The file is : 2008_000912txt
The context data is : [['A', 'close-up', 'of', 'a', 'brown', 'horse.'], ['A', 'horse', 'facing', 'the', 'camera', 'with', 'a', 'horse', 'in', 'the.background'], ['A', 'horse', 'standing', 'near', 'a', 'gate', 'with', 'another', 'horse', 'in', 'the', 'background.'], ['A', 'reddish', 'brown', 'horse', 'looking', 'over', 'a', 'fence.']]
The target data is : [['A', 'young', 'horse', 'looks', 'into', 'the', 'camera', 'from', 'behind', 'a', 'fence.']]
The file is : 2008_008262txt
The context data is : [['A', 'black', 'and', 'white', 'photo', 'of', 'three', 'horses,', 'their', 'handlers,', 'and', 'three', 'onlookers.'], ['Black', 'and', 'white', 'scene', 'of', 'three', 'people', 'looking', 'at', 'three', 'horses.'], ['People', 'stand', 'in', 'the', 'arena', 'with', 'horses.'], ['The', 'horses', 'prepare', 'for', 'the', 'show.']]
The target data is : [['Three', 'horses', 'are', 'shown', 'using', 'halters.']]
The file is : 2008_003447txt
The context data is : [['A',