# Word Embeddings

In [None]:
# :author: AnnieKLamar

In [None]:
import numpy as np
import string
import os
import glob
import pandas as pd
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from gensim.models.callbacks import CallbackAny2Vec
import tempfile
import csv

def clean_files_to_df(quiet=True):
    """Read every csv file in the ``data`` directory into a one-column DataFrame.

    The ``data`` directory is looked up relative to the current working
    directory. Each file is read as plain text, line by line. Lines that
    split into three or fewer space-separated tokens are skipped; every
    other line has a fixed set of punctuation characters removed and its
    surrounding whitespace stripped.

    :param quiet: when False, print the name of each file as it is read.
    :return: DataFrame with a single column ``string``, one row per kept line.
    """
    # Characters deleted from every kept line; a single translate() pass
    # replaces the former chain of str.replace() calls.
    unwanted = str.maketrans('', '', '†"…\'”“—῾!*->')

    # os.path.join keeps the lookup portable: the previous '\data' suffix
    # relied on an invalid escape sequence and only formed a usable path
    # on Windows.
    data_dir = os.path.join(os.getcwd(), 'data')

    all_lines = []
    for file in glob.glob(os.path.join(data_dir, "*.csv")):
        if not quiet:
            print('File Name:', os.path.basename(file))

        # The context manager closes the handle even if decoding fails.
        with open(file, 'r', encoding='utf8') as open_file:
            for line in open_file:
                # The token count is taken on the raw line, before cleaning.
                if len(line.split(' ')) > 3:
                    all_lines.append([line.translate(unwanted).strip()])

    return pd.DataFrame(all_lines, columns=['string'])

class callback(CallbackAny2Vec):
    '''Training callback that prints a loss figure at the end of every epoch.

    The first epoch prints the reading from ``get_latest_training_loss``
    unchanged; each later epoch prints the difference between the current
    reading and the one stored from the previous epoch.
    '''

    def __init__(self):
        self.epoch = 0          # number used in the next printed message
        self.last = 0           # loss reading stored at the previous epoch end
        self.first_run = True   # True until the first epoch has been reported

    def on_epoch_end(self, model):
        '''Print the loss for the epoch that just finished and update state.'''
        current = model.get_latest_training_loss()
        if self.first_run:
            # Nothing to subtract yet: report the reading as it is.
            self.first_run = False
            shown = current
        else:
            shown = current - self.last
        print('Loss after epoch {}: {}'.format(self.epoch, shown))
        self.last = current
        self.epoch += 1
            
def preprocessing(corpus_df):
    """Split every entry of the ``string`` column into tokens.

    Tokens are produced by splitting on a single space character, so
    consecutive spaces yield empty-string tokens.

    :param corpus_df: DataFrame with a ``string`` column of text values.
    :return: list with one list of tokens per row, in row order.
    """
    # Iterating the column directly avoids the per-row Series objects that
    # iterrows() creates, and does the work in one pass instead of two.
    return [text.split(" ") for text in corpus_df['string']]

def train_skipgram(training_data):
    """Train a Word2Vec model with ``sg=1`` and save it to a temporary file.

    Training runs for 100 epochs with ``min_count=5``; loss computation is
    switched on so the attached ``callback`` can print it after each epoch.
    The path of the saved model is printed.

    :param training_data: tokenized sentences passed to Word2Vec as ``sentences``.
    :return: the trained Word2Vec model.
    """
    sg_model = gensim.models.Word2Vec(
        sentences=training_data,
        sg=1,
        epochs=100,
        min_count=5,
        compute_loss=True,
        callbacks=[callback()],
    )
    # delete=False leaves the file on disk once the handle is closed.
    with tempfile.NamedTemporaryFile(prefix='gensim-sg-model-', delete=False) as handle:
        saved_path = handle.name
        sg_model.save(saved_path)
        print("Model saved: " + saved_path)
    return sg_model

def train_cbow(training_data):
    """Train a Word2Vec model with ``sg=0`` and save it to a temporary file.

    Training runs for 175 epochs with ``min_count=5``; loss computation is
    switched on so the attached ``callback`` can print it after each epoch.
    The path of the saved model is printed.

    :param training_data: tokenized sentences passed to Word2Vec as ``sentences``.
    :return: the trained Word2Vec model.
    """
    cbow_model = gensim.models.Word2Vec(
        sentences=training_data,
        sg=0,
        epochs=175,
        min_count=5,
        compute_loss=True,
        callbacks=[callback()],
    )
    # delete=False leaves the file on disk once the handle is closed.
    with tempfile.NamedTemporaryFile(prefix='gensim-cbow-model-', delete=False) as handle:
        saved_path = handle.name
        cbow_model.save(saved_path)
        print("Model saved: " + saved_path)
    return cbow_model

def get_ten_closest(model, output_file):
    """Write each vocabulary word with its ``most_similar`` neighbours to a csv.

    The header row is ``word`` followed by ``simNNword``/``simNNscore``
    column pairs for NN = 01..10. Every following row holds one key of
    ``model.wv.key_to_index`` and the (word, score) pairs returned by
    ``model.wv.most_similar(positive=[key], topn=10)``.

    :param model: object exposing ``wv.key_to_index`` and ``wv.most_similar``.
    :param output_file: path of the csv file to create or overwrite.
    """
    options_fields = ['word']
    for rank in range(1, 11):
        options_fields.append('sim{:02d}word'.format(rank))
        options_fields.append('sim{:02d}score'.format(rank))

    # Create the destination folder up front so a missing directory does
    # not abort the export after training has already finished.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The context manager flushes and closes the file; the previous version
    # left the handle open.
    with open(output_file, 'w', encoding='utf8', newline='') as closest_file:
        options_csv_writer = csv.writer(closest_file)
        options_csv_writer.writerow(options_fields)
        for key in model.wv.key_to_index:
            row = [key]
            for word, score in model.wv.most_similar(positive=[key], topn=10):
                row.extend((word, score))
            options_csv_writer.writerow(row)
        
def get_ten_farthest(model, output_file):
    """Write each vocabulary word with its negated-query neighbours to a csv.

    The header row is ``word`` followed by ``negNNword``/``negNNscore``
    column pairs for NN = 01..10. Every following row holds one key of
    ``model.wv.key_to_index`` and the (word, score) pairs returned by
    ``model.wv.most_similar(negative=[key], topn=10)``.

    :param model: object exposing ``wv.key_to_index`` and ``wv.most_similar``.
    :param output_file: path of the csv file to create or overwrite.
    """
    options_fields = ['word']
    for rank in range(1, 11):
        options_fields.append('neg{:02d}word'.format(rank))
        options_fields.append('neg{:02d}score'.format(rank))

    # Create the destination folder up front so a missing directory does
    # not abort the export after training has already finished.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The context manager flushes and closes the file; the previous version
    # left the handle open.
    with open(output_file, 'w', encoding='utf8', newline='') as farthest_file:
        options_csv_writer = csv.writer(farthest_file)
        options_csv_writer.writerow(options_fields)
        for key in model.wv.key_to_index:
            row = [key]
            for word, score in model.wv.most_similar(negative=[key], topn=10):
                row.extend((word, score))
            options_csv_writer.writerow(row)

def cbow_pipeline():
    """Run the CBOW workflow: load the corpus, tokenize, train, export csvs."""
    print("Training CBOW model...")
    corpus_df = clean_files_to_df()
    sentences = preprocessing(corpus_df)
    model = train_cbow(sentences)
    # Most-similar export first, then least-similar, both under results/.
    exports = (
        (get_ten_closest, 'results/cbow_most_similar.csv'),
        (get_ten_farthest, 'results/cbow_least_similar.csv'),
    )
    for export, target in exports:
        export(model, target)
    print("Top ten files saved as csv.")
    
def skipgram_pipeline():
    """Run the skip-gram workflow: load the corpus, tokenize, train, export csvs."""
    print("Training Skipgram model...")
    corpus_df = clean_files_to_df()
    sentences = preprocessing(corpus_df)
    model = train_skipgram(sentences)
    # Most-similar export first, then least-similar, both under results/.
    exports = (
        (get_ten_closest, 'results/sg_most_similar.csv'),
        (get_ten_farthest, 'results/sg_least_similar.csv'),
    )
    for export, target in exports:
        export(model, target)
    print("Top ten files saved as csv.")
    
# Run both workflows in order: CBOW first, then skip-gram.
for run_pipeline in (cbow_pipeline, skipgram_pipeline):
    run_pipeline()