In [45]:
import pandas as pd
import string
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import os

In [54]:
def load_data(filename):
    """Read a UTF-8 encoded CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or file-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(filename, encoding='utf-8')
    return frame

In [76]:
# Load one chunk file into df; the cleaning and test-model cells below work on this frame.
df = load_data("data_chunks/chunk_1.csv")

In [56]:
# Preview the first rows of the loaded chunk.
df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...
1,1,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...
2,2,Polymer Quantum Mechanics and its Continuum Limit,A rather non-standard quantum representation...
3,3,"The Spitzer c2d Survey of Large, Nearby, Inste...",We discuss the results from the combined IRA...
4,4,Computing genus 2 Hilbert-Siegel modular forms...,In this paper we present an algorithm for co...


# Data Cleaning #

In [20]:
# Translation table that turns every ASCII punctuation character into a space.
_PUNCTUATION_TO_SPACE = str.maketrans({char: ' ' for char in string.punctuation})

# Stop-word sets keyed by language, filled on first use so the NLTK word list
# is not rebuilt for every single document.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_data(doc):
    """Normalize one document into a space-separated string of content words.

    The text is lowercased, punctuation is replaced by spaces, and the result
    is split on whitespace. Tokens are dropped when they are not purely
    alphabetic, are a single character long, or are English stop words.

    Parameters
    ----------
    doc : str
        Raw text. Any non-string value (for example the NaN pandas uses for a
        missing cell, or None) is treated as an empty document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; "" when nothing survives.
    """
    # Missing values reach this function as float NaN / None, which have no
    # .lower(); treat them as empty text instead of raising AttributeError.
    if not isinstance(doc, str):
        return ""

    text = doc.lower().translate(_PUNCTUATION_TO_SPACE)
    # The alphabetic and length filters are independent of the stop-word filter,
    # so applying them first does not change which tokens survive.
    tokens = [word for word in text.split() if word.isalpha() and len(word) > 1]
    if not tokens:
        return ""

    stop_words = _get_stop_words()
    return " ".join(word for word in tokens if word not in stop_words)

In [21]:
def clean_text(text):
    """Apply ``clean_data`` to every document of an iterable.

    Parameters
    ----------
    text : iterable
        The documents to clean, e.g. a DataFrame column.

    Returns
    -------
    list
        One cleaned document per input document, in the original order.
    """
    return [clean_data(document) for document in text]

In [77]:
# Add a cleaned_abstract column holding the cleaned version of every abstract.
df["cleaned_abstract"] = clean_text(df["abstract"])

# Doc2Vec Model #

In [74]:
def tag_data(df, dataset_counter):
    """Turn the cleaned abstracts of a frame into gensim ``TaggedDocument`` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cleaned_abstract`` column.
    dataset_counter : int or str
        Identifier of the chunk ``df`` came from. It is combined with each row's
        index label so that tags stay unique across chunks.

    Returns
    -------
    list
        One ``TaggedDocument`` per row, in row order. Its words are the tokens
        of the cleaned abstract and its single tag is
        ``"{dataset_counter}_{index}"``.
    """
    # Walk the index and the column directly: DataFrame.iterrows() would build a
    # whole Series per row only to read this one column.
    return [
        TaggedDocument(
            words=word_tokenize(paragraph),
            tags=[f"{dataset_counter}_{index}"],
        )
        for index, paragraph in zip(df.index, df['cleaned_abstract'])
    ]

In [78]:
# Tag every cleaned abstract of the loaded chunk, using 1 as the dataset counter.
tagged_data = tag_data(df, 1)
# Trial model: 300-dimensional vectors, context window of 5, min_count=1, 10 epochs.
test_model = Doc2Vec(vector_size=300, window=5, min_count=1, epochs=10)
# The vocabulary has to be built before training can start.
test_model.build_vocab(tagged_data)
# Train on the tagged abstracts, reusing the corpus size and epoch count stored on the model.
test_model.train(tagged_data, total_examples=test_model.corpus_count, epochs=test_model.epochs)

In [41]:
def test_retrieve_embeddings(df, dataset_counter):
    paragraph_embeddings = []
    for index, _ in df.iterrows():
        tag = [f"{dataset_counter}_{index}"]
        vector = test_model.dv[tag]
        paragraph_embeddings.append(vector)
    return paragraph_embeddings

In [67]:
# Look up the test model's vector for every row of df; the counter 1 matches tag_data(df, 1).
embeddings = test_retrieve_embeddings(df, 1)

In [68]:
# Show the embedding retrieved for the first row.
embeddings[0]

array([-0.05853846,  0.12990786,  0.09955842,  0.01129305,  0.01371073,
       -0.10407504,  0.00695676,  0.18072245,  0.04993944, -0.03912092,
        0.0019877 , -0.00741177, -0.0160102 , -0.01287496, -0.06809775,
       -0.04715111,  0.13039264, -0.11001746, -0.01203263, -0.00446862,
       -0.1425551 , -0.06038997,  0.08392388,  0.05226517,  0.13549608,
       -0.0167216 , -0.01507288, -0.04386185, -0.14844136, -0.13457423,
       -0.04697548,  0.03975921, -0.05266241, -0.00127416, -0.04520372,
        0.01356521, -0.02390807, -0.17189   , -0.01763073, -0.00558726,
       -0.01078703, -0.07455625, -0.06139837, -0.15039201,  0.07285127,
       -0.02163354,  0.00432912, -0.01121517, -0.05049729,  0.17521936,
       -0.0224429 ,  0.08021046, -0.07906831,  0.00533044, -0.14926799,
        0.10100345,  0.03749479, -0.02722978, -0.05376054, -0.03557311,
       -0.09521517, -0.02923338, -0.08365134, -0.04411107,  0.12877952,
       -0.04005622,  0.03942119, -0.03837168, -0.04304772, -0.04

# Training on Full Dataset

In [None]:
# Train a single Doc2Vec model incrementally on every CSV chunk in the data_chunks folder.
# TODO: point folder_path at the training split once the data has been split
# into train / validation / test.
# NOTE: this cell takes a long time to run.
folder_path = 'data_chunks'
model = Doc2Vec(vector_size=300, window=5, min_count=1, epochs=10)

# os.listdir returns entries in arbitrary order and also lists anything else in
# the folder (e.g. a .ipynb_checkpoints directory). Keep only CSV files and sort
# them so a given dataset_counter always refers to the same chunk file.
# The sort is lexicographic, so chunk_10.csv comes before chunk_2.csv.
chunk_files = sorted(
    name for name in os.listdir(folder_path) if name.lower().endswith('.csv')
)

dataset_counter = 0
for dataset in chunk_files:
    print(dataset)
    df = load_data(os.path.join(folder_path, dataset))
    df["cleaned_abstract"] = clean_text(df["abstract"])
    tagged_data = tag_data(df, dataset_counter)
    # The first chunk creates the vocabulary; every later chunk extends it.
    # NOTE(review): confirm in the gensim documentation that extending a Doc2Vec
    # vocabulary with update=True is supported for new document tags; if not,
    # build the vocabulary once over all chunks and train in a single call.
    model.build_vocab(tagged_data, update=dataset_counter > 0)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    dataset_counter += 1

chunk_0.csv
chunk_1.csv


In [None]:
def retrieve_embeddings(df, dataset_counter, doc2vec_model=None):
    """Look up the learned document vector for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels were used to build the document tags.
    dataset_counter : int or str
        Chunk identifier that was passed to ``tag_data`` for this frame.
    doc2vec_model : optional
        Object exposing the document vectors as ``.dv``. Defaults to the
        notebook-level ``model`` so existing two-argument calls keep working.

    Returns
    -------
    list
        One entry per row, in row order: the result of indexing
        ``doc2vec_model.dv`` with the one-element list
        ``["{dataset_counter}_{index}"]``.
        NOTE(review): with gensim's KeyedVectors a list key presumably yields a
        2-D array with a single row rather than a flat vector - confirm, and
        index with the bare tag string instead if flat vectors are wanted.
    """
    if doc2vec_model is None:
        # Fall back to the model trained on the full dataset in the cell above.
        doc2vec_model = model
    # Only the index labels are needed, so iterate them directly instead of
    # materializing every row with iterrows().
    return [doc2vec_model.dv[[f"{dataset_counter}_{index}"]] for index in df.index]

In [None]:
# df still holds the last chunk loaded by the training loop, and that chunk was
# tagged with the counter value before the loop's final increment. Derive the
# counter from dataset_counter instead of hard-coding 1, which is only right
# when exactly two chunks were processed.
embeddings = retrieve_embeddings(df, dataset_counter - 1)
# Show the embedding retrieved for the first row.
embeddings[0]