In [83]:
import pandas as pd
import string
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import os
import torch
from decoder import LSTMGenerator
import torch.nn as nn
from torchmetrics.text.bert import BERTScore


In [3]:
def load_data(filename):
    """Read the CSV file at ``filename`` (decoded as UTF-8) into a pandas DataFrame.

    Parameters
    ----------
    filename : path or file-like accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the parsed contents of the file.
    """
    frame = pd.read_csv(filename, encoding='utf-8')
    return frame

In [4]:
# Load a single chunk (chunk_1.csv) of the dataset for the small-scale experiments below.
df = load_data("data_chunks/chunk_1.csv")

In [5]:
# Preview the first rows of the loaded chunk.
df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,When Did Cosmic Acceleration Start ?,"A precise determination, and comparison, of ..."
1,1,Constructing a maximum utility slate of on-lin...,We present an algorithm for constructing an ...
2,2,Dealing with delicate issues in waveforms calc...,We revisit the calculation of gravitational ...
3,3,Accretion vs colliding wind models for the gam...,LS I +61 303 is a puzzling Be/X-ray binary w...
4,4,Detailed study of the GRB 030329 radio aftergl...,We explore the physics behind one of the bri...


# Data Cleaning #

In [6]:
# Cache for the stop-word set so the NLTK word list is not reloaded for every document.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use only."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE["english"]


def clean_data(doc):
    """Normalise one document into a space-separated string of content words.

    The text is lowercased, every character in ``string.punctuation`` is
    replaced by a space, and the result is split on whitespace. Tokens are kept
    only if they are purely alphabetic, longer than one character, and not an
    English stop word.

    Parameters
    ----------
    doc : str
        Raw text. Any non-string value (for example the NaN that pandas uses
        for an empty CSV field, or ``None``) is treated as an empty document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    # Non-string input has no ``.lower()``; return an empty document instead of crashing.
    if not isinstance(doc, str):
        return ""

    # One translate() pass replaces the per-character replace() loop.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    tokens = doc.lower().translate(punctuation_to_space).split()

    stop_words = _english_stop_words()
    kept = [
        word for word in tokens
        if word.isalpha() and len(word) > 1 and word not in stop_words
    ]
    return " ".join(kept)

In [7]:
def clean_text(text):
    """Run ``clean_data`` over every document in ``text``.

    ``text`` is any iterable of documents; the cleaned documents are returned
    as a list in the same order.
    """
    return [clean_data(document) for document in text]

In [8]:
# Add a column holding the cleaned version of every abstract.
df["cleaned_abstract"] = clean_text(df["abstract"])

# Doc2Vec Model #

In [9]:
def tag_data(df, dataset_counter):
    """Wrap every cleaned abstract of ``df`` in a ``TaggedDocument``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cleaned_abstract`` column of strings.
    dataset_counter : any value usable in an f-string
        Prefix that identifies the dataset/chunk the rows come from.

    Returns
    -------
    list of TaggedDocument
        One per row, in row order. ``words`` is the abstract tokenised with
        ``word_tokenize``; ``tags`` is the one-element list
        ``["<dataset_counter>_<row index>"]``, so a tag is unique as long as
        each (dataset_counter, index label) pair is unique.
    """
    # Iterate only the column that is needed; df.iterrows() would build a
    # complete row Series for every row just to read one field.
    return [
        TaggedDocument(
            words=word_tokenize(paragraph),
            tags=[f"{dataset_counter}_{index}"],
        )
        for index, paragraph in df['cleaned_abstract'].items()
    ]

In [10]:
# Tag every cleaned abstract of the loaded chunk, using 1 as the dataset prefix.
tagged_data = tag_data(df, 1)
# Small Doc2Vec model for local experiments; vector_size=300 is the same number
# that is later used as input_size of the LSTM generator.
test_model = Doc2Vec(vector_size=300, window=5, min_count=1, epochs=10)
test_model.build_vocab(tagged_data)
# The example count and epoch count are read back from the model object itself.
test_model.train(tagged_data, total_examples=test_model.corpus_count, epochs=test_model.epochs)

In [11]:
def test_retrieve_embeddings(df, dataset_counter):
    paragraph_embeddings = []
    for index, _ in df.iterrows():
        tag = [f"{dataset_counter}_{index}"]
        vector = test_model.dv[tag]
        paragraph_embeddings.append(vector)
    return paragraph_embeddings

In [12]:
# Fetch the vectors for the rows of df; the counter 1 matches the prefix passed to tag_data above.
embeddings = test_retrieve_embeddings(df, 1)

In [127]:
# Sanity check: number of embeddings retrieved.
len(embeddings)

10

In [49]:
def get_string(vocab, array, unknown="<UNK>"):
    """Decode a sequence of vocabulary indices back into a space-separated string.

    Parameters
    ----------
    vocab : dict
        Maps word (str) -> index. If several words share an index, the first
        one in dict order is used.
    array : iterable
        Index values; each is converted with ``int()`` before lookup, so
        Python numbers and 0-d tensors/arrays are accepted.
    unknown : str, optional
        Placeholder emitted for an index that has no word in ``vocab``.
        Previously such an index produced ``None`` and made ``" ".join`` raise
        ``TypeError``.

    Returns
    -------
    str
        The decoded words joined by single spaces (``""`` for empty input).
    """
    # Build the reverse mapping once instead of scanning the vocabulary for every
    # index; setdefault keeps the first word seen for a duplicated index.
    index_to_word = {}
    for word, index in vocab.items():
        index_to_word.setdefault(index, word)
    return " ".join(index_to_word.get(int(number), unknown) for number in array)



In [139]:
import numpy

# Size of this small experiment: number of (abstract embedding, title) pairs and
# number of passes over them.
NUM_EXAMPLES = 10
NUM_EPOCHS = 10000

# Convert through a single ndarray; building a tensor directly from a Python list
# of ndarrays is much slower.
input_encodings = torch.tensor(numpy.asarray(embeddings[:NUM_EXAMPLES]), dtype=torch.float32)
target_titles = df["title"][:NUM_EXAMPLES]

# Build a word -> index vocabulary from the target titles.
# Only start and end markers are reserved; no padding token is defined.
vocab = {"<START>": 0, "<END>": 1}
for title in target_titles:
    for word in title.split():
        if word not in vocab:
            vocab[word] = len(vocab)

# Encode every title as <START> w1 ... wn <END> index tensors.
encoded_titles = []
for title in target_titles:
    encoded_title = [vocab["<START>"]] + [vocab[word] for word in title.split()] + [vocab["<END>"]]
    encoded_titles.append(torch.tensor(encoded_title))

# Instantiate the LSTMGenerator model
input_size = 300  # same number as the Doc2Vec vector_size used above
hidden_size = 128
output_size = len(vocab)  # Size of the vocabulary
model = LSTMGenerator(input_size, hidden_size, output_size)

loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(NUM_EPOCHS):
    # Accumulate plain floats so the running total does not hold autograd state.
    epoch_loss = 0.0
    for input_encoding, target_title in zip(input_encodings, encoded_titles):
        # Forward pass; unsqueeze adds a leading (batch) dimension.
        output = model(input_encoding.unsqueeze(0))
        output = output.squeeze(0)
        output = output[:len(target_title)]

        # NOTE(review): CrossEntropyLoss interprets a floating-point target as
        # per-class probabilities, not as class indices. Feeding the word indices
        # as floats is therefore almost certainly not the intended objective and
        # would explain the very large loss values and the TODO below. A proper
        # fix presumably needs per-token logits of shape (len(title), vocab_size)
        # from LSTMGenerator together with integer (long) targets; the decoder
        # module is not visible here, so the call is left unchanged — confirm.
        loss = loss_function(output, target_title.float())
        epoch_loss += loss.item()

        # Backpropagation and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Average over the pairs actually trained on rather than a hard-coded 10.
    print(f'Average loss: {epoch_loss / max(len(encoded_titles), 1)}')

# Generation (Inference)
# TODO: First make sure network works, cannot generate valid titles so far
# generated_titles = []
# for input_encoding in input_encodings:
#     output = model(input_encoding.unsqueeze(0))
#     print(output)
#     _, predicted_indices = torch.max(output, dim=2)
#     predicted_indices = predicted_indices.squeeze()
#
#     generated_title = ""
#     for index in predicted_indices:
#         if index.item() == vocab["<END>"]:
#             break
#         generated_title += list(vocab.keys())[list(vocab.values()).index(index.item())] + " "
#
#     generated_titles.append(generated_title.strip())
#
# # Print the generated titles
# for generated_title in generated_titles:
#     print(generated_title)


Average loss: 1441.745849609375
Average loss: 1441.6705322265625
Average loss: 1441.575927734375
Average loss: 1441.426513671875
Average loss: 1441.171630859375
Average loss: 1440.7391357421875
Average loss: 1440.142333984375
Average loss: 1439.6226806640625
Average loss: 1439.18994140625
Average loss: 1438.8453369140625
Average loss: 1438.5439453125
Average loss: 1438.277587890625
Average loss: 1438.033447265625
Average loss: 1437.7965087890625
Average loss: 1437.5540771484375
Average loss: 1437.298095703125
Average loss: 1437.024658203125
Average loss: 1436.732177734375
Average loss: 1436.422119140625
Average loss: 1436.0989990234375
Average loss: 1435.772705078125
Average loss: 1435.4569091796875
Average loss: 1435.15966796875
Average loss: 1434.878662109375
Average loss: 1434.6151123046875
Average loss: 1434.3720703125
Average loss: 1434.14892578125
Average loss: 1433.945068359375
Average loss: 1433.75927734375
Average loss: 1433.5908203125
Average loss: 1433.4388427734375
Average 

KeyboardInterrupt: 

# Training on Full Dataset

In [None]:
# Train one Doc2Vec model incrementally over every CSV chunk in the data_chunks folder.
# This needs to be changed to the right folder once the data has been split into train, valid, test.
# This takes a LONG time.
# NOTE: this rebinds the name `model`, which the LSTM training cell also uses.
# NOTE(review): extending a Doc2Vec vocabulary with build_vocab(update=True) and then
# training chunk by chunk may not be equivalent to one pass over the full corpus —
# confirm against the gensim documentation.
folder_path = 'data_chunks'
model = Doc2Vec(vector_size=300, window=5, min_count=1, epochs=10)
dataset_counter = 0

# os.listdir returns entries in arbitrary order and can include non-CSV entries
# (e.g. notebook checkpoint folders). Sorting makes the chunk -> dataset_counter
# mapping, and therefore the document tags, reproducible. The sort is
# lexicographic, so "chunk_10.csv" comes before "chunk_2.csv".
chunk_files = sorted(
    entry for entry in os.listdir(folder_path) if entry.lower().endswith('.csv')
)

for dataset in chunk_files:
    print(dataset)
    df = load_data(os.path.join(folder_path, dataset))
    df["cleaned_abstract"] = clean_text(df["abstract"])
    tagged_data = tag_data(df, dataset_counter)
    # The first chunk creates the vocabulary; every later chunk extends it.
    model.build_vocab(tagged_data, update=dataset_counter > 0)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    dataset_counter += 1

chunk_0.csv
chunk_1.csv


In [None]:
def retrieve_embeddings(df, dataset_counter):
    """Collect the document vector of every row of ``df`` from the global ``model``.

    For each index label of ``df`` the tag ``"<dataset_counter>_<label>"`` is
    looked up in ``model.dv``. The tag is passed as a one-element list, so each
    entry of the returned list is whatever ``dv`` returns for a list key
    (presumably a 2-D array with a single row — confirm against gensim).

    Returns a list with one entry per row, in row order.
    """
    return [
        model.dv[[f"{dataset_counter}_{row_label}"]]
        for row_label in df.index
    ]

In [None]:
# Look up the Doc2Vec vectors for the rows of the current `df`, using tag prefix 1.
# NOTE(review): the prefix is hard-coded; it only matches `df` if the chunk loaded
# last by the training loop was tagged with dataset_counter 1 — confirm.
embeddings = retrieve_embeddings(df, 1)
# Display the first embedding as the cell output.
embeddings[0]