In [1]:
import os
import sys

# Make the project-local modules under ./src importable (e.g. `preprocess`, `plot`).
# The path is resolved against the current working directory, so the notebook must be
# started from the directory that contains `src`.
directory_to_prepend = os.path.abspath("src")
if directory_to_prepend not in sys.path:
    # Mutate the existing list in place (instead of rebinding sys.path to a new list)
    # so every holder of a reference to sys.path sees the new entry; re-running the
    # cell is a no-op thanks to the membership guard.
    sys.path.insert(0, directory_to_prepend)

from preprocess import preprocess, get_combined_sentence_embedding, get_weighted_embedding, split_with_without_text

import fasttext as ft 
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Shared 2-component PCA; the plotting loop re-fits it on each embedding column
# to obtain 2D coordinates.
pca_2d = PCA(n_components=2)
from plot import plot_2d

# Input locations, relative to the notebook's working directory.
path_to_data = './data/english_words.csv'
path_to_fasttext_model = './models/cc.en.300.bin'

# Fail fast, naming the missing input, before any expensive work starts.
assert os.path.exists(path_to_data), f"dataset not found: {path_to_data}"
assert os.path.exists(path_to_fasttext_model), f"fastText model not found: {path_to_fasttext_model}"

#random_seed = 12345

# Create Embeddings using fasttext

In [2]:
# Read only the two columns used below, then run the project's `preprocess` step.
# Per the original note on this cell, preprocessing fills NA values with an empty string
# and replaces newline characters with an empty string (implemented in `preprocess`,
# not visible here).
df = pd.read_csv(path_to_data, usecols=['Word', 'Meaning'])
df = preprocess(df, remove_duplicates=False)

print(f'-> dataset is read with shape: {df.shape}')

# Independently count the rows whose 'Meaning' is a non-empty string so the result of
# split_with_without_text can be cross-checked below. A single pass over the column
# replaces the row-by-row iterrows() loop.
# TODO double check
num_words_with_text = sum(
    isinstance(meaning, str) and len(meaning) > 0 for meaning in df['Meaning']
)
num_words_without_text = len(df) - num_words_with_text

# split dataset into rows with text and without text
df_with_text, df_without_text = split_with_without_text(df)

assert num_words_with_text == df_with_text.shape[0], (
    f'expected {num_words_with_text} rows with text, got {df_with_text.shape[0]}'
)
assert num_words_without_text == df_without_text.shape[0], (
    f'expected {num_words_without_text} rows without text, got {df_without_text.shape[0]}'
)

print(f'-> #words with text: {df_with_text.shape[0]}')
print(f'-> #words without text: {df_without_text.shape[0]}')

# Only the rows with text are used from here on; drop the other references.
del df, df_without_text

-> dataset is read with shape: (13161, 2)
-> #words with text: 13145
-> #words without text: 16


In [3]:
# Load the pretrained fastText model from the path checked in the configuration cell.
ft_model = ft.load_model(path_to_fasttext_model)

# combining word and text we can get different representations of the input
# Each entry maps a method name (also used as the DataFrame column name and in the plot
# file name) to a callable that takes one row and returns its embedding.
embeddings = {
    # appends word to the text and uses sentence embedding
    'combined_sentence': (lambda row: get_combined_sentence_embedding(ft_model, row)),
    # uses word embedding for word and sentence embedding for the text (weighted)
    'text_vec': (lambda row: get_weighted_embedding(ft_model, row, word_weight=0)),
}
# 'weighted01' .. 'weighted09' use word_weight 0.1 .. 0.9. The weight is bound through a
# default argument so each lambda keeps its own value (a plain closure over the loop
# variable would make every entry use the last weight).
for tenths in range(1, 10):
    embeddings[f'weighted0{tenths}'] = (
        lambda row, word_weight=tenths / 10: get_weighted_embedding(ft_model, row, word_weight=word_weight)
    )
embeddings['word_vec'] = (lambda row: get_weighted_embedding(ft_model, row, word_weight=1))

# Work on an independent copy before adding columns: if df_with_text is a slice of the
# original frame (not verifiable from this cell), column assignment would otherwise
# trigger pandas' SettingWithCopyWarning.
df_with_text = df_with_text.copy()

# create embeddings for the words with text
print("-> Creating embeddings...")
for key, embed in embeddings.items():
    # NOTE(review): the helpers receive the row's `.loc` indexer rather than the row Series
    # itself; presumably they only do label lookups, in which case passing `row` directly
    # would be equivalent and clearer - confirm against src/preprocess.py.
    df_with_text[key] = df_with_text.apply(lambda row: embed(row.loc), axis=1)
    print(f"-> Embedding using method '{key}' is created.")

# save the vectors
# NOTE(review): CSV stores each cell as text, so vector-valued columns are presumably
# written as their string representation and need parsing when read back - confirm this
# is what downstream consumers expect.
print(f"-> Embeddings are created, processed dataset has the following columns:\n{df_with_text.columns}")
df_with_text.to_csv('./data/english_words_with_embeddings.csv')

# plot the results
for key in embeddings:
    # save the 2d plot to visualize
    # rows with text should be clustered in the end
    embeddings_2d = pca_2d.fit_transform(df_with_text[key].values.tolist())
    plot_2d(embeddings_2d[:,0], embeddings_2d[:,1], save='fasttext_preprocessed_words_data_with_text_' + key + '_vectors_2d.pdf')

# NOTE(review): only the rows with text are plotted in this cell, although the message
# below also mentions rows without text.
print('-> Processed dataset vectors (with text and without text) 2D plot added under plots.')

-> Creating embeddings...
-> Embedding using method 'combined_sentence' is created.




-> Embedding using method 'text_vec' is created.
-> Embedding using method 'weighted01' is created.
-> Embedding using method 'weighted02' is created.
-> Embedding using method 'weighted03' is created.
-> Embedding using method 'weighted04' is created.
-> Embedding using method 'weighted05' is created.
-> Embedding using method 'weighted06' is created.
-> Embedding using method 'weighted07' is created.
-> Embedding using method 'weighted08' is created.
-> Embedding using method 'weighted09' is created.
-> Embedding using method 'word_vec' is created.
-> Embeddings are created, processed dataset has the following columns:
Index(['Word', 'Meaning', 'combined_sentence', 'text_vec', 'weighted01',
       'weighted02', 'weighted03', 'weighted04', 'weighted05', 'weighted06',
       'weighted07', 'weighted08', 'weighted09', 'word_vec'],
      dtype='object')
-> Processed dataset vectors (with text and without text) 2D plot added under plots.


<Figure size 640x480 with 0 Axes>