In [1]:
import os
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np


In [2]:
# Build the path from components instead of a backslash literal: '\D' and '\d'
# are invalid escape sequences in a normal string, and a hard-coded backslash
# separator only resolves on Windows.
df = pd.read_csv(os.path.join('..', 'Data', 'data.csv'))
df.head()


Unnamed: 0,text,label,split
0,"Dr. B.S. CHAUHAN, J. This reference before us...",1,val
1,This special leave petition has been filed ag...,0,val
2,Leave granted. This appeal has been preferred...,1,train
3,"S. RADHAKRISHNAN, J. We are, in this case, co...",0,train
4,"Dr. B. S. CHAUHAN, J. These appeals have been...",0,test


In [3]:
# Sanity check: collect the rows whose 'text' value is missing and print them;
# an empty frame means every row has text.
null_text = df.loc[df['text'].isnull()]
print(null_text)


Empty DataFrame
Columns: [text, label, split]
Index: []


In [5]:
from sklearn.model_selection import train_test_split

# Two-stage random split: hold out 20% of the rows, then cut that hold-out in
# half to obtain the validation and test sets.
# NOTE(review): the loaded CSV preview shows a ready-made `split` column that
# this random split does not use -- confirm that ignoring it is intended.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

# Label arrays, one per split, in the same row order as the frames above.
train_labels, val_labels, test_labels = (
    part['label'].values for part in (train_df, val_df, test_df)
)


In [6]:
def create_tagged_documents(data):
    """Wrap every entry of ``data['text']`` in a ``TaggedDocument``.

    Each text is split on whitespace to form the document's word list, and the
    document is tagged with its zero-based position in ``data['text']``
    rendered as a string.

    Returns:
        A list of ``TaggedDocument`` objects in the same order as the texts.
    """
    documents = []
    for position, raw_text in enumerate(data['text']):
        tokens = raw_text.split()
        documents.append(TaggedDocument(words=tokens, tags=[str(position)]))
    return documents


In [7]:
# Convert each split into a list of tagged documents.
train_tagged, val_tagged, test_tagged = map(
    create_tagged_documents, (train_df, val_df, test_df)
)

# Make sure the output folders for both embedding sizes exist.
for _embed_dir in ('doc2vec_embeddings_500', 'doc2vec_embeddings_1000'):
    os.makedirs(_embed_dir, exist_ok=True)


In [8]:
# Train one Doc2Vec model per embedding size, using only the training split.
embedding_sizes = [500, 1000]
num_epochs = 50
SEED = 42  # fixed seed so repeated runs start from the same initialisation

models = {}
for size in embedding_sizes:
    # NOTE: the seed pins the random initialisation; with workers > 1, thread
    # scheduling can still make runs differ slightly (see the gensim Doc2Vec
    # documentation on reproducibility).
    model = Doc2Vec(
        vector_size=size,
        window=5,
        min_count=1,
        workers=4,
        epochs=num_epochs,
        seed=SEED,
    )
    model.build_vocab(train_tagged)
    model.train(train_tagged, total_examples=model.corpus_count, epochs=model.epochs)
    models[size] = model


In [None]:
# Infer a vector for every document of each split with each trained model and
# save the embeddings next to the matching label array.
# The splits are listed explicitly rather than looked up by name via locals(),
# which only works at module level and hides what this cell depends on.
split_data = {
    'train': (train_tagged, train_labels),
    'val': (val_tagged, val_labels),
    'test': (test_tagged, test_labels),
}
datasets = list(split_data)

for size in embedding_sizes:
    out_dir = f'doc2vec_embeddings_{size}'
    # Create the folder here as well, so the cell works for any embedding size.
    os.makedirs(out_dir, exist_ok=True)
    for dataset in datasets:
        tagged_docs, labels = split_data[dataset]
        embeddings = [models[size].infer_vector(doc.words) for doc in tagged_docs]
        np.save(f'{out_dir}/{dataset}_embed.npy', np.array(embeddings))
        np.save(f'{out_dir}/{dataset}_labels.npy', labels)
