In [None]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import nltk
import lxml
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt     
from tqdm import tqdm
from gensim.models import doc2vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re
     

In [None]:
def label_sentences(corpus, label_type):
    """
    Wrap every document of ``corpus`` in a gensim ``TaggedDocument``.

    Each document is converted with ``str()`` and split on whitespace to form
    its word list. Its single tag is ``"<label_type>_<i>"``, where ``i`` is the
    document's position in ``corpus`` (e.g. ``"Train_0"`` for prefix ``"Train"``).

    :param corpus: Iterable of documents
    :param label_type: Prefix used to build each tag
    :return: list of TaggedDocument objects, in the order of ``corpus``
    """
    return [
        doc2vec.TaggedDocument(str(text).split(), [label_type + '_' + str(position)])
        for position, text in enumerate(corpus)
    ]


In [None]:
def get_unsplit_data_d2v():
    """
    Fetch the lyrics and target columns from the module-level ``df``.

    ``df`` is not defined in this cell; it is presumably a pandas DataFrame
    created earlier in the notebook.

    :return: tuple ``(df["Lyrics"], df["Target"])``
    """
    targets, lyrics = df["Target"], df["Lyrics"]
    return lyrics, targets


In [None]:
def get_split_data_d2v(x, y):
    """
    Split the data, merge train and validation, and tag every document for Doc2Vec.

    ``split_data`` is defined outside this cell. Its result is indexed as
    ``[part][0]`` for documents and ``[part][1]`` for targets, with parts
    0, 1 and 2 used here as train, validation and test respectively.

    :param x: Documents passed on to ``split_data``
    :param y: Targets passed on to ``split_data``
    :return: dict with keys ``all_data`` (tagged train+validation documents followed
             by tagged test documents), ``y_train`` (train+validation targets),
             ``y_test``, ``len_x_train`` and ``len_x_test``
    """
    parts = split_data(x, y)
    train_part, val_part, test_part = parts[0], parts[1], parts[2]

    # Train and validation are merged into a single fitting set.
    docs_fit = np.concatenate((train_part[0], val_part[0]))
    targets_fit = np.concatenate((train_part[1], val_part[1]))
    docs_test = test_part[0]
    targets_test = test_part[1]

    # Tags are "Train_<i>" / "Test_<i>", numbered independently per set.
    tagged_docs = label_sentences(docs_fit, 'Train') + label_sentences(docs_test, 'Test')

    return {
        "all_data": tagged_docs,
        "y_train": targets_fit,
        "y_test": targets_test,
        "len_x_train": len(docs_fit),
        "len_x_test": len(docs_test),
    }
     


In [None]:
def model_dbow_func(vector_size, all_data):
    """
    Create an untrained Doc2Vec model (``dm=0``) and build its vocabulary.

    :param vector_size: Size of the embedding vectors
    :param all_data: Tagged documents used to build the vocabulary
    :return: the Doc2Vec model after ``build_vocab``; ``train`` is not called here
    """
    dbow_params = dict(
        dm=0,
        vector_size=vector_size,
        negative=5,
        min_count=1,
        # alpha == min_alpha: the learning rate is constant within one train() call
        alpha=0.065,
        min_alpha=0.065,
    )
    dbow = doc2vec.Doc2Vec(**dbow_params)
    # Iterating through tqdm shows a progress bar while the documents are collected.
    dbow.build_vocab(list(tqdm(all_data)))
    return dbow

In [None]:
def train_d2v_model(model_dbow, all_data, epochs=30, alpha_step=0.002):
    """
    Train a Doc2Vec model one pass at a time with a manually decayed learning rate.

    For each pass the tagged documents are reshuffled, the model is trained for
    a single epoch, and ``alpha`` is then lowered by ``alpha_step`` with
    ``min_alpha`` set to the same value, so the next pass runs at a constant rate.

    :param model_dbow: Doc2Vec model to train; it is modified in place
    :param all_data: Sequence of tagged documents to train on
    :param epochs: Number of passes (default 30, the previously hard-coded value)
    :param alpha_step: Amount subtracted from ``alpha`` after every pass
                       (default 0.002, the previously hard-coded value)
    :return: the same model object, after training
    :raises ValueError: if the schedule would train a pass with a learning rate <= 0
    """
    # Fail fast rather than silently training with a zero or negative learning
    # rate, which can happen when the model starts from a smaller alpha or when
    # epochs / alpha_step are increased.
    if epochs > 0:
        last_pass_alpha = model_dbow.alpha - alpha_step * (epochs - 1)
        if last_pass_alpha <= 0:
            raise ValueError(
                "learning rate would drop to %r during training; "
                "reduce epochs or alpha_step" % (last_pass_alpha,)
            )

    for _ in range(epochs):
        # A fresh shuffle every pass; tqdm shows progress while collecting the docs.
        shuffled_docs = utils.shuffle(list(tqdm(all_data)))
        model_dbow.train(shuffled_docs,
                         total_examples=len(all_data),
                         epochs=1)
        model_dbow.alpha -= alpha_step
        model_dbow.min_alpha = model_dbow.alpha

    return model_dbow

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Collect the learned document vectors for one tag prefix into a 2-D array.

    Row ``i`` of the result is the vector stored under the tag
    ``"<vectors_type>_<i>"``.

    :param model: Trained Doc2Vec model
    :param corpus_size: Number of documents (rows) to fetch
    :param vectors_size: Size of the embedding vectors (columns)
    :param vectors_type: Tag prefix of the documents, e.g. ``"Train"`` or ``"Test"``
    :return: numpy array of shape ``(corpus_size, vectors_size)``
    """
    # gensim 4 renamed ``docvecs`` to ``dv`` and deprecated the old name;
    # prefer the new attribute and fall back for models that only have the old one.
    doc_vectors = getattr(model, "dv", None)
    if doc_vectors is None:
        doc_vectors = model.docvecs

    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(corpus_size):
        vectors[i] = doc_vectors[f"{vectors_type}_{i}"]
    return vectors


In [None]:
def vectorise(vector_size, model_dbow, len_x_train, len_x_test):
    """
    Fetch the 'Train' and 'Test' document vectors from a Doc2Vec model.

    :param vector_size: Size of the embedding vectors
    :param model_dbow: Model passed on to ``get_vectors``
    :param len_x_train: Number of documents tagged with the 'Train' prefix
    :param len_x_test: Number of documents tagged with the 'Test' prefix
    :return: tuple ``(train_vectors, test_vectors)`` as returned by ``get_vectors``
    """
    requests = (('Train', len_x_train), ('Test', len_x_test))
    train_matrix, test_matrix = (
        get_vectors(model_dbow, count, vector_size, prefix)
        for prefix, count in requests
    )
    return train_matrix, test_matrix

In [None]:
# Embedding dimensions compared by optimal_vector_size_d2v().
vector_sizes = [10, 20, 50, 100, 200, 500]


def optimal_vector_size_d2v():
    """
    Train and evaluate one Doc2Vec model per entry of ``vector_sizes``.

    The data is loaded and split once. For every size a model is built and
    trained, the train/test document vectors are extracted, and they are
    passed to ``run_NN_vectors`` (defined outside this cell), whose two return
    values are stored as ``(accuracy, pred_prob)``. The accuracy for each size
    is printed as it becomes available.

    :return: dict mapping each vector size to its ``(accuracy, pred_prob)`` tuple
    """
    lyrics, targets = get_unsplit_data_d2v()
    prepared = get_split_data_d2v(lyrics, targets)

    tagged_docs = prepared["all_data"]
    targets_train = prepared["y_train"]
    targets_test = prepared["y_test"]
    n_train = prepared["len_x_train"]
    n_test = prepared["len_x_test"]

    scores_by_size = {}
    for size in vector_sizes:
        # A fresh model per size: build the vocabulary, then train.
        fitted_model = train_d2v_model(model_dbow_func(size, tagged_docs), tagged_docs)
        vectors_train, vectors_test = vectorise(size, fitted_model, n_train, n_test)

        accuracy, pred_prob = run_NN_vectors(vectors_train, vectors_test, targets_train, targets_test)
        scores_by_size[size] = (accuracy, pred_prob)

        print(f"Size: {size}, Accuracy: {accuracy}")

    return scores_by_size


optimal_vector_size_d2v()