# Imports:

In [1]:
import gensim.downloader
from gensim.models.word2vec import LineSentence
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.test.utils import datapath
from gensim.models import Word2Vec, KeyedVectors
from gensim.corpora.csvcorpus import CsvCorpus
import pandas as pd
import csv
import os
import gensim
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import matplotlib as plt



# Model details

In [2]:
def model_details(qw, cw, gw, res, model_name):
    """Append one evaluated question to the CSV file at ``model_name``.

    Parameters
    ----------
    qw : question word.
    cw : correct (expected) word.
    gw : word that was guessed.
    res : result label stored in the ``result`` column.
    model_name : path of the CSV file to append to; it is created if missing.

    The header row is emitted only when the file is empty at the time of the
    call, so repeated calls build up a single table.
    """
    columns = ['question-word', 'correct-word', 'guess-word', 'result']
    record = dict(zip(columns, (qw, cw, gw, res)))

    with open(model_name, 'a', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        # Nothing has been written through `handle` yet, so the on-disk size
        # tells us whether this is the very first row for this file.
        if os.path.getsize(model_name) == 0:
            writer.writeheader()
        writer.writerow(record)

# Model processing

In [3]:
def model_processing(model_name, model=None, questions=None):
    """Answer every synonym question with an embedding model and log each outcome.

    For each row of ``questions`` the first value is the question word, the
    second is the correct answer and the remaining values are the candidate
    options. The option with the highest ``model.similarity`` to the question
    word is taken as the guess, and one record per row is written through
    ``model_details``.

    Parameters
    ----------
    model_name : passed to ``model_details`` as its ``model_name`` argument,
        i.e. the path of the CSV file the per-question records are appended to.
    model : object exposing ``similarity(word_a, word_b)`` that raises
        ``KeyError`` for unknown words. Defaults to the notebook-level
        ``google_news`` variable.
    questions : DataFrame-like object supporting ``iterrows()``. Defaults to
        the notebook-level ``synonyms`` variable.

    Result labels written: ``'True'`` (guess equals the answer), ``'False'``
    (guess differs) and ``'Guess'`` (the model raised ``KeyError`` for a word
    in the question, so no similarity-based choice was possible).
    """
    # Resolve the defaults at call time so the notebook globals only need to
    # exist when the caller actually relies on them.
    if model is None:
        model = google_news
    if questions is None:
        questions = synonyms

    for index, question in questions.iterrows():
        # Positional access: the candidates are everything after the first two
        # columns. Selecting them by position (instead of `is not` identity
        # checks against the first two values) keeps `options[i]` aligned with
        # `similarities[i]` no matter how the strings are stored.
        question_word = question.iloc[0]
        answer_word = question.iloc[1]
        options = list(question.iloc[2:])

        try:
            similarities = [model.similarity(question_word, word) for word in options]
        except KeyError:
            # A word is missing from the model's vocabulary, so there is no
            # score to rank by. Fall back to the first candidate: `similarity`
            # was called, hence `options` is non-empty here.
            # NOTE(review): the fallback choice is arbitrary — confirm whether a
            # random pick is preferred for 'Guess' rows.
            model_details(question_word, answer_word, options[0], 'Guess', model_name)
            print('----- ' + str(index) + ' = Not Available------')
            continue

        best_index = similarities.index(max(similarities))
        guess_word = options[best_index]
        result = 'True' if guess_word == answer_word else 'False'
        model_details(question_word, answer_word, guess_word, result, model_name)
            print('----- ' + str(index) + ' = Not Available------')

# Available models and datasets

In [4]:
# Ask the gensim downloader (`api`) for its catalogue information and keep the
# result around for inspection; uncomment the print below to display it.
available_info = api.info()
# print(available_info)

# 2 models from different corpora, but same embedding size (Task 2.3.1)

In [None]:
# Load two resources by their gensim-downloader names; both names end in "300".
# NOTE(review): the variables are called *_corpus_* but hold whatever `api.load`
# returns for these names (presumably pretrained embedding models) — confirm.
wikinews_corpus_300 = api.load("fasttext-wiki-news-subwords-300")
conceptnet_corpus_300 = api.load("conceptnet-numberbatch-17-06-300")

# 2 models from same corpus, but different embedding sizes (Task 2.3.2)

In [None]:
# Load the "glove-twitter" resource in two variants (names ending in 25 and 50).
# NOTE(review): the variables are called *_corpus_* but hold whatever `api.load`
# returns for these names (presumably pretrained embedding models) — confirm.
glovetwitter_corpus_25 = api.load("glove-twitter-25")
glovetwitter_corpus_50 = api.load("glove-twitter-50")

# Calculating statistics and appending to analysis.csv

In [None]:
#TODO: Brandon, this is where you would practically copy and paste what you did for Task 1, but for every model above. I'm leaving placeholders for the time being.
# The numbers below are placeholders. The per-model variable names are kept
# because the plotting cell further down reads them.
# NOTE(review): accuracy is computed as correct / guess, as in the original
# placeholders — confirm this is the intended denominator.

wikinews_model_300_name = "fasttext-wiki-news-subwords-300"
wikinews_model_300_vocabulary_size = 25000
wikinews_model_300_vocabulary_correct = 130
wikinews_model_300_vocabulary_guess = 24870
wikinews_model_300_accuracy = wikinews_model_300_vocabulary_correct / wikinews_model_300_vocabulary_guess

conceptnet_model_300_name = "conceptnet-numberbatch-17-06-300"
conceptnet_model_300_vocabulary_size = 15000
conceptnet_model_300_vocabulary_correct = 30
conceptnet_model_300_vocabulary_guess = 14970
conceptnet_model_300_accuracy = conceptnet_model_300_vocabulary_correct / conceptnet_model_300_vocabulary_guess

glovetwitter_model_25_name = "glove-twitter-25"
glovetwitter_model_25_vocabulary_size = 5000
glovetwitter_model_25_vocabulary_correct = 10
glovetwitter_model_25_vocabulary_guess = 4990
glovetwitter_model_25_accuracy = glovetwitter_model_25_vocabulary_correct / glovetwitter_model_25_vocabulary_guess

glovetwitter_model_50_name = "glove-twitter-50"
glovetwitter_model_50_vocabulary_size = 5000
glovetwitter_model_50_vocabulary_correct = 10
glovetwitter_model_50_vocabulary_guess = 4990
# Use the 50-d counts here (the original reused the 25-d variables).
glovetwitter_model_50_accuracy = glovetwitter_model_50_vocabulary_correct / glovetwitter_model_50_vocabulary_guess

# One row per model: name, vocabulary size, correct, guess, accuracy.
analysis_rows = [
    (wikinews_model_300_name, wikinews_model_300_vocabulary_size, wikinews_model_300_vocabulary_correct, wikinews_model_300_vocabulary_guess, wikinews_model_300_accuracy),
    (conceptnet_model_300_name, conceptnet_model_300_vocabulary_size, conceptnet_model_300_vocabulary_correct, conceptnet_model_300_vocabulary_guess, conceptnet_model_300_accuracy),
    (glovetwitter_model_25_name, glovetwitter_model_25_vocabulary_size, glovetwitter_model_25_vocabulary_correct, glovetwitter_model_25_vocabulary_guess, glovetwitter_model_25_accuracy),
    (glovetwitter_model_50_name, glovetwitter_model_50_vocabulary_size, glovetwitter_model_50_vocabulary_correct, glovetwitter_model_50_vocabulary_guess, glovetwitter_model_50_accuracy),
]

# csv.writer puts a delimiter between every field and terminates every row, so
# columns can no longer fuse and each model gets its own line. The `with`
# block closes the file even if a write fails.
with open("analysis.csv", "a", newline="", encoding="utf-8") as file:
    csv.writer(file).writerows(analysis_rows)

# Drawing graphs for every statistic

In [None]:
# The notebook's import cell binds `plt` to the top-level `matplotlib` package,
# which has no `bar`/`savefig`; bind the pyplot interface before plotting.
import matplotlib.pyplot as plt


def _save_statistics_chart(labels, values, output_path):
    """Draw one bar chart on its own figure and save it to ``output_path``."""
    # A fresh figure per chart keeps earlier models' bars out of later files.
    fig, ax = plt.subplots()
    ax.bar(labels, values)
    fig.savefig(output_path)
    # Release the figure so repeated runs don't accumulate open figures.
    plt.close(fig)


_STATISTIC_SUFFIXES = ("vocabulary_size", "vocabulary_correct", "vocabulary_guess", "accuracy")

# (label prefix, values in the same order as _STATISTIC_SUFFIXES, output file)
_charts = [
    ("wikinews",
     [wikinews_model_300_vocabulary_size, wikinews_model_300_vocabulary_correct, wikinews_model_300_vocabulary_guess, wikinews_model_300_accuracy],
     "Wikinews_statistics.pdf"),
    ("conceptnet",
     [conceptnet_model_300_vocabulary_size, conceptnet_model_300_vocabulary_correct, conceptnet_model_300_vocabulary_guess, conceptnet_model_300_accuracy],
     "Conceptnet_statistics.pdf"),
    ("glovetwitter_25",
     [glovetwitter_model_25_vocabulary_size, glovetwitter_model_25_vocabulary_correct, glovetwitter_model_25_vocabulary_guess, glovetwitter_model_25_accuracy],
     "Glovetwitter_25_statistics.pdf"),
    ("glovetwitter_50",
     [glovetwitter_model_50_vocabulary_size, glovetwitter_model_50_vocabulary_correct, glovetwitter_model_50_vocabulary_guess, glovetwitter_model_50_accuracy],
     "Glovetwitter_50_statistics.pdf"),
]

for _prefix, _values, _output_path in _charts:
    _labels = [_prefix + "_" + _suffix for _suffix in _STATISTIC_SUFFIXES]
    _save_statistics_chart(_labels, _values, _output_path)
# Mostly done here, waiting on human gold standard