In [2]:
import json
import os
import pandas as pd
import numpy as np

### Part 1: Basic operations with word embeddings

I selected two words, *current* and *classify*, and for each one created a table of its most similar words in the two embedding models.

In [13]:
# Similar-word results for the two models compared in Part 1; model 3 is shown
# under the "Wikipedia" columns and model 29 under the "Gigaword" columns.
with open('../results/similar_words/3.json') as wikipedia_file:
    similar_3 = json.load(wikipedia_file)

with open('../results/similar_words/29.json') as gigaword_file:
    similar_29 = json.load(gigaword_file)

def similarities_table(index_nr, filename):
    """Write a LaTeX table of the words most similar to one query word.

    Reads the notebook-level ``similar_3`` and ``similar_29`` results and puts
    their words and scores side by side, using the ``similar_idx`` values of
    ``similar_3`` as row labels.

    Parameters
    ----------
    index_nr : int
        Position of the query word's entry in both result lists.
    filename : str
        Path of the ``.tex`` file to (over)write.
    """
    wikipedia_entry = similar_3[index_nr]
    gigaword_entry = similar_29[index_nr]

    table = pd.DataFrame(
        data={
            'Wikipedia': wikipedia_entry['similar_words'],
            'Wikipedia scores': wikipedia_entry['similar_scores'],
            'Gigaword': gigaword_entry['similar_words'],
            'Gigaword scores': gigaword_entry['similar_scores'],
        },
        index=wikipedia_entry['similar_idx'],
    )

    with open(filename, "w") as out_file:
        out_file.write(table.to_latex())
    
# One table per query word. Going by the output names, entry 1 is presumably
# *current* and entry 4 *classify* -- verify against the JSON files.
similarities_table(index_nr=1, filename='../report/similarities_current.tex')
similarities_table(index_nr=4, filename='../report/similarities_classify.tex')

### Part 2: Intrinsic evaluation of pre-trained word embeddings

In [133]:
# Intrinsic-evaluation results, one JSON file per pre-trained model.
with open('../results/intrinsic_eval/40.json') as conll17_file:
    intrinsic_40 = json.load(conll17_file)

with open('../results/intrinsic_eval/75.json') as oil_gas_file:
    intrinsic_75 = json.load(oil_gas_file)

with open('../results/intrinsic_eval/82.json') as common_crawl_file:
    intrinsic_82 = json.load(common_crawl_file)

# (column title, results) pairs consumed by the table helpers below.
models = [
    ('40 CoNLL-17', intrinsic_40),
    ('75 Oil and Gas', intrinsic_75),
    ('82 Common Crawl', intrinsic_82),
]

Results from evaluation on Simlex:

In [149]:
def simlex_table(models, filename, filename2):
    """Write two LaTeX tables summarising the Simlex results of several models.

    Both tables have one row per section (``adj``, ``noun``, ``verb``,
    ``total``) and one column per model.

    Parameters
    ----------
    models : iterable of (name, results) pairs
        ``results['simlex'][section]`` must provide ``'pearson'`` (only its
        first element is used -- presumably the correlation coefficient) and
        ``'oov_ratio'``.
    filename : str
        Output path for the table of Pearson values.
    filename2 : str
        Output path for the table of out-of-vocabulary ratios.
    """
    sections = ['adj', 'noun', 'verb', 'total']

    pearson_by_model = {}
    oov_by_model = {}
    for name, model in models:
        simlex = model['simlex']
        pearson_by_model[name] = [simlex[part]['pearson'][0] for part in sections]
        oov_by_model[name] = [simlex[part]['oov_ratio'] for part in sections]

    pearson_frame = pd.DataFrame(pearson_by_model, index=sections)
    oov_frame = pd.DataFrame(oov_by_model, index=sections)

    # Scores go to `filename`, OOV ratios to `filename2`, in that order.
    for frame, target in ((pearson_frame, filename), (oov_frame, filename2)):
        with open(target, "w") as out_file:
            out_file.write(frame.to_latex())
        
# Simlex score and OOV tables for the three pre-trained models.
simlex_table(
    models,
    filename='../report/simlex_table_1.tex',
    filename2='../report/simlex_oov_table_1.tex',
)

Results from evaluation on Google Analogies:

In [150]:
def analogies_table(models, filename):
    """Write a LaTeX table of Google Analogies scores, one column per model.

    Parameters
    ----------
    models : iterable of (name, results) pairs
        ``results['analogies']['sections']`` lists the section names and
        ``results['analogies']['scores']`` maps a section name to its score.
    filename : str
        Path of the ``.tex`` file to (over)write.

    Notes
    -----
    The rows are the sections of all models in first-seen order, and every
    model is looked up against that same list. A section without a score for
    some model is left empty (``None``) in that model's column.
    """
    models = list(models)  # iterated twice below

    # Build ONE row order shared by every column. Taking the row labels from
    # whichever model happened to be last, while ordering each column by its
    # own section list, mislabels rows as soon as two models differ.
    sections = []
    for _, model in models:
        for section in model['analogies']['sections']:
            if section not in sections:
                sections.append(section)

    analogies_results = {
        name: [model['analogies']['scores'].get(section) for section in sections]
        for name, model in models
    }

    df = pd.DataFrame(analogies_results, index=sections)
    with open(filename, "w") as f:
        f.write(df.to_latex())
        
# Google Analogies table for the three pre-trained models.
analogies_table(models, filename='../report/analogies_table_1.tex')

### Part 3: Training a word embedding model on in-domain data

In [123]:
%%bash
# Collect 20 leading entries from each embedding model into one tab-separated
# file, one column per model (signal_1, 40, 75, 82 -- in that order).
#   * `tail -n +2` / `tail -n +4` drop the first 1 / 3 lines before sampling;
#     presumably the embedding file's header line plus, for `unzip -c`, its
#     two banner lines -- TODO confirm against the actual output.
#   * `cut -f1 -d ' '` keeps only the first space-separated field of each line
#     (presumably the word itself).
# NOTE(review): signal_1 is read from ../vectors while the zip archives are
# read from ../../vectors -- confirm that both locations are intended.
paste -d "\t" \
<(zcat ../vectors/signal_1.txt.gz | tail -n +2 | head -n 20 | cut -f1 -d ' ') \
<(unzip -c ../../vectors/40.zip model.txt | tail -n +4 | head -n 20 | cut -f1 -d ' ') \
<(unzip -c ../../vectors/75.zip model.txt | tail -n +4 | head -n 20 | cut -f1 -d ' ') \
<(unzip -c ../../vectors/82.zip model.txt | tail -n +4 | head -n 20 | cut -f1 -d ' ') \
>../results/top_words.csv

In [151]:
# Load the tab-separated word table (one column per model) and export its
# first ten rows as a LaTeX table.
df = pd.DataFrame(
    np.genfromtxt(
        '../results/top_words.csv',
        delimiter='\t',
        # `np.str` was only an alias of the builtin `str`; it was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        dtype=str,
        # Every field is a token, so '#' must be read as data rather than as
        # the start of a comment (genfromtxt's default).
        comments=None,
    ),
    columns=['SignalMedia', 'CoNLL17', 'Oil and Gas', 'Common Crawl'],
)

with open('../report/top_words.tex', "w") as f:
    f.write(df.head(10).to_latex())

In [148]:
# Intrinsic-evaluation results of the two SignalMedia models.
with open('../results/intrinsic_eval/signal_1.json') as signal_1_file:
    intrinsic_signal_1 = json.load(signal_1_file)

with open('../results/intrinsic_eval/signal_2.json') as signal_2_file:
    intrinsic_signal_2 = json.load(signal_2_file)

models_2 = [
    ('SignalMedia1', intrinsic_signal_1),
    ('SignalMedia2', intrinsic_signal_2),
]

# Same tables as for the pre-trained models, written to the "_2" report files.
simlex_table(
    models_2,
    filename='../report/simlex_table_2.tex',
    filename2='../report/simlex_oov_table_2.tex',
)
analogies_table(models_2, filename='../report/analogies_table_2.tex')

### Part 4: Document classification with word embeddings 

In [24]:
dirs = sorted(os.listdir('../results/classifier'))

# One column per experiment: the F1 values from its metrics.json, rounded to
# three decimals, followed by the rounded average F1.
results = {}
for experiment in dirs:
    metrics_path = os.path.join('../results/classifier', experiment, 'metrics.json')
    with open(metrics_path) as metrics_file:
        metrics = json.load(metrics_file)

    f1 = [*np.round(metrics['f1'], 3).tolist(), np.round(metrics['avg_f1'], 3)]
    # Row labels are taken from the experiment read last, plus the extra row.
    labels = [*metrics['labels'], 'Average']
    results[experiment] = f1

results = pd.DataFrame(data=results, index=labels)

with open('../report/classification_results.tex', "w") as report_file:
    report_file.write(results.to_latex())

In [42]:
best_dirs = sorted(os.listdir('../results/best'))

# One row per run; each row holds the listed values followed by the run's average.
precision, recall, f1 = [], [], []

for run_dir in best_dirs:
    metrics_path = os.path.join('../results/best', run_dir, 'metrics.json')
    with open(metrics_path, "r") as metrics_file:
        run_metrics = json.load(metrics_file)

    precision.append([*run_metrics['precision'], run_metrics['avg_precision']])
    recall.append([*run_metrics['recall'], run_metrics['avg_recall']])
    f1.append([*run_metrics['f1'], run_metrics['avg_f1']])
    # Row labels are taken from the run read last, plus the extra row.
    labels = [*run_metrics['labels'], 'Average']

# Mean and standard deviation across runs (axis 0 = runs).
avg_precision, avg_recall, avg_f1 = (
    np.mean(rows, axis=0) for rows in (precision, recall, f1)
)
std_precision, std_recall, std_f1 = (
    np.std(rows, axis=0) for rows in (precision, recall, f1)
)

avg_df = pd.DataFrame(
    {'precision': avg_precision, 'recall': avg_recall, 'f1': avg_f1},
    index=labels,
)
std_df = pd.DataFrame(
    {'precision': std_precision, 'recall': std_recall, 'f1': std_f1},
    index=labels,
)

for frame, target in ((avg_df, '../report/best_avg.tex'), (std_df, '../report/best_std.tex')):
    with open(target, "w") as report_file:
        report_file.write(frame.to_latex())