In [1]:
import json
import numpy as np
import pandas as pd
from datetime import datetime
from scipy.stats import spearmanr
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import nltk

In [2]:
class SemanticSpace(object):
    
    """
    This class creates distributional semantic space using a co-occurrence based approach.
    
    Any update you plan on doing (applying weighting schemes or dimensionality reduction, use a
    prediction-based DSM, ...) need to be implemented in this class.
    
    In its current implementation, this class gets the corpus as a list of lists, with inner lists 
    consisting of strings. You can change anything you want: use different corpora, in different 
    formats, apply feature weighting schemes to the raw co-occurrences, apply dimensionality reduction,
    learn the embeddings using prediction-based methods, ...
    
    The important things are:
    - the SemanticSpace class needs to have the  word2idx attribute (a dictionary mapping strings to 
        integers, representing the row indices in the embedding space corresponding to each word). The 
        keys of this dictionary must be lower-cased tokens!
    - the SemanticSpace class needs to have the embeddings attribute (a 2d NumPy array where rows are 
        words and columns are dimensions, symbolic or latent)
    """
    
    def __init__(self, corpus, t=10):
        
        """
        To initialize the semantic space it is enough to provide a corpus as a list of lists,
        where each inner list consists of strings, representing the words in a sentence, and to
        indicate a frequency threshold: only words with a frequency count higher or equal to the
        threshold are used to build the semantic space.
        """
        
        self.corpus = corpus
        
        # compute a word frequency distribution
        self.freqs = self.freq_distr()
        
        # select words which occur more often than the threshold
        self.targets = {w for w, f in self.freqs.items() if f > t}
        
        # map words to numerical indices
        self.word2idx = {w: i for i, w in enumerate(self.targets)}
        
        # update a co-occurrence matrix.
        self.embeddings = self.harvest_counts()
        
        """
        IMPORTANT: the actual semantic space needs to be encoded as a NumPy 2d array!
        Whatever transformation you decide to apply to raw counts (or whether you want to use Word2Vec)
        make sure that self.embeddings is a 2d NumPy array with as many rows as there are words in the 
        vocabulary. I will compute cosine similarity indexing the embedding space!
        """
        
    
    def freq_distr(self):
        
        word_frequencies = Counter()
        for sentence in self.corpus:
            for word in sentence:
                word_frequencies[word] += 1
        return word_frequencies
    
    def harvest_counts(self):
        """Uses word2vec to get word vectors. Model is trained
           on the corpus, then word vectors are concatenated in
           a matrix.
        """
        #Train word2vec model on corpus
        print('Training word2vec model')
        model = Word2Vec(self.corpus,
                         size      = 1000, 
                         window    = 10, 
                         min_count = 1,
                         workers   = 4,
                         sg        = 1,  #Use skip gram
                         iter      = 10   #Number of epochs
                         )
        #Initialise empty matrix to put word vectors
        wordvectors = np.empty((len(self.word2idx), len(model.wv[model.wv.index2entity[0]]))) #wordvectors[word2idx[word]] = wordvect
        
        #Iterate over all unique idx values
        for idx, word in enumerate(self.word2idx):
            
            wordvector = model.wv[word]
            wordvectors[idx] = wordvector #Put word vector in matrix
        print('Finished training model')
        
        return wordvectors

In [3]:
class Sim(object):
    
    """This class compares semantic similarity scores retrieved from a corpus to human-generated norms."""
    
    def __init__(self, norms, semantic_space):
        
        """
        This class is initialized providing two input structures:
        - a Pandas DataFrame: the first column ('w1') contains the first word in the similarity pair, 
            the second column ('w2') contains the second word in the similarity pair, the third column 
            ('sim') contains the similarity score between w1 and w2.
        - an object of class SemanticSpace (check the docs of this class for what it consists of and what 
            the necessary attributes it needs to have are)
        
        Don't change this class at all! Make sure that it works with the SemanticSpace class as you modify it.
        I will use this class to evaluate your submissions, specifically the compute_correlation() method.
        """
        
        self.norms = norms
        self.embeddings = semantic_space.embeddings
        self.word2idx = semantic_space.word2idx
        
    def compute_similarity(self, w1, w2):
        
        try:
            e1 = self.embeddings[self.word2idx[w1], :]
            try:
                e2 = self.embeddings[self.word2idx[w2], :]
                s = cosine_similarity(e1.reshape(1, -1), e2.reshape(1, -1))
                return s[0][0]
            
            except KeyError:
                print("Couldn't find the embedding for word {}. Not computing cosine for pair {}-{}.".format(
                    w2, w1, w2)
                     )
                
        except KeyError:
            print("Couldn't find the embedding for word {}. Not computing cosine for pair {}-{}.".format(
                    w1, w1, w2)
                     )
        
        return None
    
    def compute_correlation(self):
        
        true_similarities = []
        estimated_similarities = []
        for _, row in self.norms.iterrows():
            s = self.compute_similarity(row['w1'], row['w2'])
            if s:
                estimated_similarities.append(s)
                true_similarities.append(row['sim'])
        
        print("Pairs for which it was possible to compute cosine similarity: {}".format(
            len(estimated_similarities))
             )
        
        print("Spearman rho between estimated and true similarity scores: {}".format(
            spearmanr(true_similarities, estimated_similarities)[0])
             )
        
        

In [4]:
#Load CHILDES dataset
CHILDES = json.load(open("corpus.json", 'r'))
CHILDES = np.asarray(CHILDES[0])

#Load BROWN corpus
nltk.download('brown')
corpus = np.asarray(nltk.corpus.brown.sents())

corpus = np.concatenate((CHILDES, corpus), axis=0)


"""
CHILDES[0] is a list of lists, where each inner list is a sentence, encoded as a list of tokens (as strings).
CHILDES[1] is a list of lists, where each inner list is a sentence, encoded a list of lemma~pos pairs (as strings).
The two lists contain exactly the same sentences from a collation of corpora from CHILDES.
"""

S = SemanticSpace(corpus, t=1)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Bram\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


Training word2vec model


In [8]:
norms = pd.read_csv("norms.test.csv", header = 0, sep = ' ')
sim = Sim(norms, S)
sim.compute_correlation()

FileNotFoundError: [Errno 2] File b'norms.test.csv' does not exist: b'norms.test.csv'

In [None]:
# t=0: 965 pairs in the test, 49K tokens in the embedding space, rho=0.2239
# t=10: 829 pairs in the test, 12K tokens in the embedding space
# t=25: 714 pairs in the test, 8K tokens in the embedding space

In [6]:
norms = pd.read_csv("norms.dev.csv", header = 0, sep = ' ')
sim = Sim(norms, S)
sim.compute_correlation()

Couldn't find the embedding for word pug. Not computing cosine for pair graveyard-pug.
Couldn't find the embedding for word skyline. Not computing cosine for pair scenery-skyline.
Couldn't find the embedding for word canine. Not computing cosine for pair canine-husky.
Couldn't find the embedding for word gravestone. Not computing cosine for pair gravestone-silhouette.
Couldn't find the embedding for word skyline. Not computing cosine for pair brick-skyline.
Couldn't find the embedding for word skyline. Not computing cosine for pair skyline-skyscraper.
Couldn't find the embedding for word skyline. Not computing cosine for pair downtown-skyline.
Couldn't find the embedding for word skyline. Not computing cosine for pair building-skyline.
Couldn't find the embedding for word droplet. Not computing cosine for pair cloud-droplet.
Couldn't find the embedding for word canine. Not computing cosine for pair canine-paw.
Couldn't find the embedding for word boardwalk. Not computing cosine for pai