## Creating an 8-d Character Embedding

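Set `file_path` in the following cell to the path of a GloVe embedding file (or any text file in the same format: one token per line followed by its space-separated vector components), and then **run all cells**.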

In [24]:
# Path to the pretrained word-embedding text file. The cells below read it
# line by line as a token followed by space-separated float components.
file_path = "../glove.6B/glove.6B.50d.txt"

In [25]:
import numpy as np
import os
from sklearn.decomposition import PCA

# Build character-level vectors from the word embedding: for every ASCII
# character, accumulate the sum of the vectors of all tokens containing it
# (once per occurrence) together with the occurrence count.
# Layout: vectors[char] == (vector_sum, occurrence_count)
vectors = {}

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the ASCII filter below shows that tokens may contain non-ASCII
# characters, and a locale-dependent codec can fail on them.
# NOTE(review): assumes the embedding file is UTF-8 encoded -- confirm for
# embeddings other than the distributed GloVe text files.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line_split = line.strip().split(" ")
        word = line_split[0]
        vec = np.array(line_split[1:], dtype=float)

        for char in word:
            # Only ASCII characters receive an embedding; skip everything else.
            if ord(char) >= 128:
                continue
            if char in vectors:
                vec_sum, count = vectors[char]
                # `vec_sum + vec` allocates a new array, so the stored sums
                # never alias the per-line `vec`.
                vectors[char] = (vec_sum + vec, count + 1)
            else:
                vectors[char] = (vec, 1)

# Write the per-character average vectors to an intermediate file in the
# current working directory, named after the input file. The next cell reads
# it back through `base_name` and deletes it afterwards.
base_name = os.path.splitext(os.path.basename(file_path))[0] + '-char.txt'
with open(base_name, 'w', encoding='utf-8') as f2:
    for char, (vec_sum, count) in vectors.items():
        # Average over all occurrences, rounded to 6 decimal places.
        avg_vector = np.round(vec_sum / count, 6).tolist()
        f2.write(char + " " + " ".join(str(x) for x in avg_vector) + "\n")

In [26]:
from sklearn.decomposition import PCA

# Number of components kept by the PCA projection of the character vectors.
target_dimension = 8

# Load the intermediate per-character averages written by the previous cell
# (`base_name`, `np` and `os` come from that cell, so run it first).
matrix, letters = [], []
with open(base_name, 'r', encoding='utf-8') as f3:
    for line in f3:
        line_split = line.strip().split(" ")
        letters.append(line_split[0])
        matrix.append(np.array(line_split[1:], dtype=float))

# Rows follow the order of `letters`; columns are the original embedding
# dimensions.
matrix = np.array(matrix, dtype=float)

# Project every character vector down to `target_dimension` components.
pca = PCA(n_components=target_dimension)
new_matrix = pca.fit_transform(matrix)

embedding_path = os.path.join(os.getcwd(), 'models', 'char-embeds.txt')
# Create the output directory if needed; otherwise opening the file for
# writing raises FileNotFoundError on a checkout without a `models/` folder.
os.makedirs(os.path.dirname(embedding_path), exist_ok=True)
with open(embedding_path, 'w', encoding='utf-8') as f4:
    for letter, vec in zip(letters, new_matrix):
        f4.write(letter + " " + " ".join(str(x) for x in vec) + "\n")

# The intermediate file is no longer needed once the final embedding exists.
os.remove(base_name)

In [27]:
# Report the dimensionality that was actually used (`target_dimension`)
# instead of a hard-coded "8-d", so the message stays correct if it changes.
print("{0}-d Character Embedding saved in: {1}".format(target_dimension, embedding_path))

8-d Character Embedding saved in: /Users/georgymh/Documents/Cal/294/DecentML/models/char-embeds.txt
