# Code Example

In [None]:
# Import the classes from the "utils" folder

from utils.text_preprocessing import TextPreprocessing
from utils.neural_embedding_models import NeuralEmbeddingModels
from utils.embedding_validation import EmbeddingValidation

## Text Preprocessing

In [None]:
# Clean the raw training corpus and write the result to disk

# Preprocessing settings
corpus_path = r'path/to/corpus/file'
output_format = 'text'
digits_preprocess = 'removal'
lemmatize_method = 'TextBlob'

# Build the "cleaner" object with the chosen preprocessing options
cleaner = TextPreprocessing(
    digits=digits_preprocess,
    lemma=lemmatize_method,
    POS_noun=False,
    punctuation=True,
    lowercase=True,
    token=False,
)

# Fit the cleaner on the corpus file
cleaner.FitText(corpus=corpus_path)

# Save the cleaned corpus locally in the requested output format
cleaner.SaveText(file_format=output_format)


## Model Training

In [None]:
# Train a language model with a custom hyperparameter configuration

# Set the class arguments
model_architecture = 'FastText'
clean_corpus_path = r'path/to/cleaned/corpus/file'
hyp_configuration = {'vector_size': 300,
                    'window': 10,
                    'min_count': 75,
                    'epochs': 5}
# Single source of truth for the saved model's file name
output_model_name = 'MyTrainedModel'

# Initialize the "trainer" object
trainer = NeuralEmbeddingModels(corpus=clean_corpus_path, model=model_architecture, hyperparameters=hyp_configuration)

# Train the model
trainer.Train()

# Store the trained model locally
trainer.SaveModel(filename=output_model_name)

# Store the word vectors file locally
trainer.WordVectors()

## Embedding Validation

In [None]:
# Evaluate the trained model on a set of embedding validation tasks

# Reuse the vector size from the training hyperparameters so the two stay in sync
vector_dimension = hyp_configuration['vector_size']
word_vectors_path = r'path/to/word/vectors/file'

# Create the "validator" object
validator = EmbeddingValidation(
    vector_file=word_vectors_path,
    vector_size=vector_dimension,
)

In [None]:
# Word Relatedness task

# Set the function arguments
relatedness_data_path = r'path/to/relatedness/file'
metric = 'cosine'
correlation = 'Spearman'

# Pass the configured correlation instead of a hard-coded literal, so that
# editing `correlation` above actually changes the evaluation.
validator.Relatedness(word_pairs=relatedness_data_path, eval_metric=metric, corr_metric=correlation)

In [None]:
# Word Categorization task

# Task settings
seed = 420
category_number = 10
categorization_data_path = r'path/to/categorization/file'

# Run the task with the settings above
validator.Categorization(
    category_file=categorization_data_path,
    n_categories=category_number,
    seed=seed,
)

In [None]:
# Word Analogy task

# Task settings
eval_metric = '3CosMUL'
seed = 420
sample_dimension = 150

# Run the task with the settings above
validator.Analogy(
    words=True,
    sample=sample_dimension,
    seed=seed,
    metric=eval_metric,
)

In [None]:
# Sentiment Analysis task

# Task settings
eval_metric = 'accuracy'
seed = 420
sample_dimension = 25000
sentiment_data_path = r'path/to/sentiment/analysis/file'

# Run the task on the labeled dataset with the settings above
validator.SentimentAnalysis(
    labeled_dataset=sentiment_data_path,
    sample=sample_dimension,
    seed=seed,
    metrics=eval_metric,
    preprocessing=True,
)

In [None]:
# Frequency Category task

# Task settings
eval_metric = 'accuracy'
freq_threshold_values = [250, 500, 750, 1000]
sample_dimension = 15000

# Run the task with the settings above
validator.FrequencyCategory(
    sample_size=sample_dimension,
    threshold_list=freq_threshold_values,
    metrics=eval_metric,
)