<a href="https://colab.research.google.com/github/BeWildering/comp-semantics/blob/main/CompositionalSemantics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries, load language models

In [2]:
import math
import os
import re
import urllib.request
import zipfile

import en_core_web_sm
import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from sklearn.decomposition import PCA

nltk.download('wordnet')
from nltk.corpus import wordnet as wn

# Render Matplotlib figures inline in the notebook output and use the 'ggplot' style
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Load the spaCy English model; `nlp` is what get_features() uses to parse sentences
nlp = en_core_web_sm.load()

# Show all columns AND all rows when a dataframe is printed (no truncation)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Load word embeddings

In [3]:
# Keep every GloVe path in one place so the existence checks, the download target and
# the file that is finally loaded cannot drift apart.
glove_dir = '/content'
glove_zip = os.path.join(glove_dir, 'glove.6B.zip')
glove_file = os.path.join(glove_dir, 'glove.6B.300d.txt')   # the 300-dimensional vectors are the ones loaded below

# Only fetch/extract when the vectors we actually load are missing; the archive is
# downloaded to the same path that is checked, and only if it is not already there.
if not os.path.exists(glove_file):
  if not os.path.exists(glove_zip):
    urllib.request.urlretrieve('https://nlp.stanford.edu/data/glove.6B.zip', glove_zip)
  with zipfile.ZipFile(glove_zip) as archive:
    archive.extractall(glove_dir)

# Convert the GloVe text file to word2vec format, then load it as KeyedVectors
tmp_file = "/content/glove_tmp"
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

Archive:  /content/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


Component Functions

In [4]:
def product(x):
  '''Multiply together every element of the iterable `x`.

  Starts from 1, so an empty iterable yields 1. Each step uses plain `*`
  (never in-place multiplication), so the elements themselves are not modified.
  '''
  factors = iter(x)
  running = 1
  for factor in factors:
    running = running * factor
  return running

def get_features(text):
  """Parse `text` and return a dataframe with one row of features per token.

  Columns of the returned dataframe:
    token       -- the token text
    pos         -- part-of-speech tag (token.pos_)
    head        -- text of the token's syntactic head
    children    -- list of the token's syntactic children
    dep_wt      -- number of children + 1
    dep_wt_log  -- log10(dep_wt) + 1
    ant         -- antonym column added by get_ants()
  """
  doc = nlp(text)
  # Collect all rows first and build the frame once: DataFrame.append was removed in
  # pandas 2.0, and re-copying the whole frame for every token is quadratic.
  rows = [{'token': token.text,
           'pos': token.pos_,
           'head': token.head.text,
           'children': list(token.children)}
          for token in doc]
  # Passing `columns` keeps the expected schema even when `text` yields no tokens.
  features = pd.DataFrame(rows, columns=['token', 'pos', 'head', 'children'])
  features['dep_wt'] = [len(children) + 1 for children in features['children']]
  features['dep_wt_log'] = [math.log(wt, 10) + 1 for wt in features['dep_wt']]
  features = get_ants(features)
  return features

def get_ants(token_df):
  '''Add an "ant" column holding one WordNet antonym (or None) per token.

  Only verbs, adverbs, adjectives and nouns are looked up. For such a token, only
  the first lemma of its first synset (skipping synsets without lemmas) is
  consulted: if that lemma has antonyms, the name of its first antonym is used,
  otherwise the entry is None. Later lemmas and later senses are NOT searched.
  The column is inserted into `token_df` itself, which is also returned.
  '''
  wordnet_tags = {'VERB': 'v', 'ADV': 'r', 'ADJ': 'a', 'NOUN': 'n'}
  antonym_column = []
  for word, tag in zip(token_df.token, token_df.pos):
    antonym = None
    if tag in wordnet_tags:                     # only consider verb, adv, adj, and noun
      for synset in wn.synsets(word, pos=wordnet_tags[tag]):
        lemmas = synset.lemmas()
        if not lemmas:
          continue                              # nothing to inspect here, try the next synset
        opposites = lemmas[0].antonyms()
        if opposites:
          antonym = opposites[0].name()
        break                                   # the first lemma found decides, antonym or not
    antonym_column.append(antonym)
  token_df.insert(len(token_df.columns), "ant", antonym_column)
  return token_df

# Inner combination methods: each receives an iterable of (word, weight) pairs
def innerA(x):
  '''Keep the words themselves and drop the weights.'''
  return [word for word, _weight in x]

def innerB(x):
  '''Sum of the weighted word vectors, wrapped in a one-element list.'''
  return [sum(model[word] * weight for word, weight in x)]

def innerC(x):
  '''Product of the weighted word vectors, wrapped in a one-element list.'''
  return [product(model[word] * weight for word, weight in x)]

# Outer combination methods: x holds the positives, y the negatives
def outerA(x, y):
  '''Top-20 neighbours from model.most_similar.'''
  return model.most_similar(positive=x, negative=y, topn=20)

def outerB(x, y):
  '''Top-20 neighbours from model.most_similar_cosmul.'''
  return model.most_similar_cosmul(positive=x, negative=y, topn=20)

# Combination methods, each an (inner, outer) pair
combA = (innerA, outerA)  # 3CosAdd
combB = (innerA, outerB)  # 3CosMul
combC = (innerB, outerA)  # Add
combD = (innerC, outerA)  # Mul

# Weighting schemes: each maps a token features dataframe to one weight per row
def wtA(x):
  '''Uniform weight of 1 for every row.'''
  return [1] * len(x)

def wtB(x):
  '''Use the dep_wt column as weights.'''
  return x.dep_wt

def wtC(x):
  '''Use the dep_wt_log column as weights.'''
  return x.dep_wt_log

# Stopword removal options
def stopA(x):
  '''Return the dataframe unchanged (stopwords kept).'''
  return x

def stopB(tokens_df):
  '''Return only the rows whose token survives stopword removal.'''
  sentence = ' '.join(tokens_df['token'])
  surviving_words = remove_stopwords(sentence).split()
  keep_row = tokens_df['token'].isin(surviving_words)
  return tokens_df[keep_row]

# Antonym inclusion options
def negA(x):
  '''Ignore the antonyms: always return an empty list.'''
  return []

def negB(x):
  '''Include the antonyms: return them unchanged.'''
  return x

# Parameter sets for iterative testing
combs = [combA, combB, combC, combD]    # combination methods
wts = [wtA, wtB, wtC]                   # weighting schemes
stops = [stopA, stopB]                  # stopword handling
negs = [negA, negB]                     # antonym handling

def compose(tokens_df, comb_type, wt_type, stop_type, neg_type):
  '''Return up to 10 words most similar to the combined meaning of the tokens.

  tokens_df -- token features dataframe (needs `token` and `ant` columns)
  comb_type -- (inner, outer) pair: `inner` combines (word, weight) pairs,
               `outer` turns positives and negatives into (word, ...) hits
  wt_type   -- maps a dataframe to one weight per row
  stop_type -- maps the dataframe to the rows to keep (stopword handling)
  neg_type  -- maps the combined antonyms to the negatives actually used

  Words that are themselves among the kept tokens are dropped from the result.
  '''
  kept_df = stop_type(tokens_df)                                  # handle stopwords
  inner_combine = comb_type[0]
  positives = inner_combine(zip(kept_df.token, wt_type(kept_df)))

  has_antonym = kept_df['ant'].values != None                     # element-wise on the object array
  antonym_df = kept_df[has_antonym]
  negatives = (inner_combine(zip(antonym_df.ant, wt_type(antonym_df)))
               if len(antonym_df) != 0 else [])

  neighbours = comb_type[1](positives, neg_type(negatives))       # combine positives and negatives
  original_tokens = list(kept_df.token)
  fresh_words = [hit[0] for hit in neighbours if hit[0] not in original_tokens]
  return fresh_words[:10]                                         # first 10 words not in the input


Data storage dicts

In [6]:
# Initiate storage dicts
total = {}                                            # contains words and their full results
scores = {model_id: [] for model_id in range(24)}     # running list of scores for each of the 24 models

Data collection
*   Recommended wordlist: https://simple.wikipedia.org/wiki/Wikipedia:List_of_1000_basic_words
*   Recommended dictionary: https://www.learnersdictionary.com/


In [17]:
# Specify word and definition/description to test
word = 'student'
definition = "a person who attends a school, college, or university"

# If this cell is re-run for a word that was already tested, remember which slot the
# word occupies in every score list (dicts keep insertion order) so its score can be
# overwritten instead of appended a second time. None means the word is new.
previous_slot = list(total).index(word) if word in total else None
total[word] = [definition]

raw_string = total[word][0]
word_string = re.sub(r'[^\w\s]', '', raw_string) # remove any punctuation

tokens = get_features(word_string)
print('All tokens and features:')
print(tokens)
print()
print('Tokens and features minus stopwords:')
print(stopB(tokens))
print()

# Test all parameter combinations (but wtB and wtC only used with combC).
# i, j, k, l are the 1-based indices of the chosen comb / wt / stop / neg option and
# m is the running model number.
new = []
m = 0
for i, comb in enumerate(combs, start=1):
  comb_wts = wts if comb == combC else [wtA]
  for j, wt in enumerate(comb_wts, start=1):
    for k, stop in enumerate(stops, start=1):
      for l, neg in enumerate(negs, start=1):
        m += 1
        result = compose(tokens, comb, wt, stop, neg)
        new.append(result)
        print(m, '--', 'comb:', i,'wt:', j,'stop:', k,'neg:', l, result)

# Add resulting wordlists to 'total' dict. dtype=object keeps this working when some
# models return fewer than 10 words (a ragged list cannot form a regular 2-D array).
total[word].append(np.array(new, dtype=object))

# Score each model: 0 by default, otherwise (10 - index of the correct word in its list).
# Scoring works on the plain lists, so it does not depend on every list having 10 words.
scores_new = {model_id: 0 for model_id in range(len(new))}
for model_id, result in enumerate(new):
  if word in result:
    scores_new[model_id] = 10 - result.index(word)

# Add scores to storage dict 'scores'
for key, score in scores_new.items():
  if previous_slot is not None and previous_slot < len(scores[key]):
    scores[key][previous_slot] = score      # re-run for a known word: replace its earlier score
  else:
    scores[key].append(score)

print()
print('Model number: Score')
print(scores_new)

All tokens and features:
        token    pos     head                     children  dep_wt  \
0           a    DET   person                           []       1   
1      person   NOUN   person                 [a, attends]       3   
2         who   PRON  attends                           []       1   
3     attends   VERB   person               [who, college]       3   
4           a    DET  college                           []       1   
5      school   NOUN  college                           []       1   
6     college   NOUN  attends  [a, school, or, university]       5   
7          or  CCONJ  college                           []       1   
8  university   NOUN  college                           []       1   

   dep_wt_log   ant  
0    1.000000  None  
1    1.477121  None  
2    1.000000  None  
3    1.477121  miss  
4    1.000000  None  
5    1.000000  None  
6    1.698970  None  
7    1.000000  None  
8    1.000000  None  

Tokens and features minus stopwords:
        token   

Display aggregated results (per model)

In [None]:
# Summarise which words/definitions have been tested, then show the raw score lists
tested_words = list(total)
print(len(tested_words), 'test(s) run')
print('Words:', tested_words)
print('Definitions:', [entry[0] for entry in total.values()])
scores

Transfer scores into DataFrame

In [None]:
# One row per model, one column per tested word. The frame is built in a single call:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every iteration.
# dtype=float keeps the scores displayed/saved as floats.
scores_df = pd.DataFrame([scores[key] for key in scores], dtype=float)
scores_df.columns = list(total.keys())
scores_df.insert(len(scores_df.columns), 'total', scores_df.sum(axis=1))     # add cumulative score for each model

print(scores_df)

    attack  breakfast  chocolate  different  expensive  foreign  grandfather  \
0      0.0        0.0        0.0        0.0        0.0      0.0          0.0   
1      0.0        0.0        0.0        0.0        0.0      0.0          0.0   
2      0.0        0.0        6.0        0.0        0.0      0.0          6.0   
3      0.0        0.0        5.0        0.0        0.0      0.0          8.0   
4      0.0        0.0        0.0        0.0        0.0      0.0          0.0   
5      0.0        0.0        0.0        0.0        0.0      0.0          0.0   
6      0.0        1.0        6.0        0.0        0.0      0.0          7.0   
7      0.0        1.0        5.0        0.0        0.0      0.0          8.0   
8      0.0        0.0        0.0        0.0        0.0      0.0          0.0   
9      0.0        0.0        0.0        0.0        0.0      0.0          0.0   
10     0.0        6.0        6.0        0.0        0.0      0.0          7.0   
11     0.0        6.0        4.0        

Save results to file

In [None]:
# Write the per-model score table to a CSV file.
# NOTE(review): '/filepath/filename.csv' looks like a placeholder -- replace it with a
# real destination before running this cell.
scores_df.to_csv('/filepath/filename.csv')