# Static vector analyses

## Set-up

In [1]:
import break_utils
import gensim.downloader as api
import io
import pandas as pd

In [2]:
def get_break_neighbors(df):
    """Print a two-column-pair LaTeX table of the top 20 neighbors of "break".

    Parameters
    ----------
    df : pd.DataFrame
        Passed straight to `break_utils.neighbors`; presumably one row
        per word with the word as the index label (TODO confirm against
        `break_utils`).

    The first 20 index labels of the `break_utils.neighbors` result are
    laid out as rows of (rank, word, rank + 10, word): ranks 1-10 on the
    left and 11-20 on the right. The table is printed without a header
    or index column; nothing is returned.
    """
    ranked = break_utils.neighbors("break", df).head(20)
    words = list(ranked.index)
    left_words, right_words = words[:10], words[10:]
    # zip() stops at the shortest input, so a short neighbor list simply
    # yields fewer rows instead of raising.
    rows = list(zip(range(1, 11), left_words, range(11, 21), right_words))
    table = pd.DataFrame(rows)
    print(table.to_latex(header=None, index=None))

## GloVe

https://nlp.stanford.edu/projects/glove/

In [3]:
def get_break_neighbors_glove(filename, d=300):
    """Print the neighbor table for "break" from a GloVe vector file.

    Parameters
    ----------
    filename : str
        Path handed to `break_utils.glove2dict`.
    d : int, default 300
        Expected vector length; entries whose vector has any other
        length are dropped before the DataFrame is built.
    """
    well_formed = {}
    for token, vector in break_utils.glove2dict(filename).items():
        # Keep only vectors of the expected dimensionality so that every
        # column of the frame below has the same length.
        if len(vector) == d:
            well_formed[token] = vector
    # Transpose so that each word becomes a row.
    get_break_neighbors(pd.DataFrame(well_formed).T)

In [4]:
# Neighbors of "break" in reps/glove.6B.300d.txt (uses the default d=300).
get_break_neighbors_glove("reps/glove.6B.300d.txt")

\begin{tabular}{rlrl}
\toprule
  1 &     break &  11 &   before \\
  2 &  breaking &  12 &      put \\
  3 &     broke &  13 &    start \\
  4 &    breaks &  14 &     take \\
  5 &       set &  15 &   trying \\
  6 &       try &  16 &    could \\
  7 &    chance &  17 &       to \\
  8 &      time &  18 &   broken \\
  9 &     again &  19 &      end \\
 10 &      back &  20 &  finally \\
\bottomrule
\end{tabular}



In [5]:
# Neighbors of "break" in the Twitter GloVe file; d=200 matches the "200d" file.
get_break_neighbors_glove("reps/glove.twitter.27B.200d.txt", d=200)

\begin{tabular}{rlrl}
\toprule
  1 &   break &  11 &  weeks \\
  2 &    time &  12 &  start \\
  3 &  breaks &  13 &   last \\
  4 &  before &  14 &    end \\
  5 &    then &  15 &  broke \\
  6 &    take &  16 &  again \\
  7 &    days &  17 &   next \\
  8 &   after &  18 &  maybe \\
  9 &     let &  19 &  leave \\
 10 &      up &  20 &   down \\
\bottomrule
\end{tabular}



In [6]:
# Neighbors of "break" in reps/glove.42B.300d.txt (uses the default d=300).
get_break_neighbors_glove("reps/glove.42B.300d.txt")

\begin{tabular}{rlrl}
\toprule
  1 &     break &  11 &     get \\
  2 &    breaks &  12 &     out \\
  3 &  breaking &  13 &  trying \\
  4 &     broke &  14 &      we \\
  5 &     going &  15 &  broken \\
  6 &       let &  16 &   again \\
  7 &      away &  17 &    come \\
  8 &      take &  18 &    down \\
  9 &        up &  19 &    make \\
 10 &       'll &  20 &  before \\
\bottomrule
\end{tabular}



In [7]:
# Neighbors of "break" in reps/glove.840B.300d.txt (uses the default d=300).
get_break_neighbors_glove("reps/glove.840B.300d.txt")

\begin{tabular}{rlrl}
\toprule
  1 &     break &  11 &      up \\
  2 &    breaks &  12 &  trying \\
  3 &  breaking &  13 &    away \\
  4 &       end &  14 &   start \\
  5 &     broke &  15 &     get \\
  6 &      down &  16 &   again \\
  7 &      take &  17 &     'll \\
  8 &       let &  18 &    back \\
  9 &     going &  19 &     out \\
 10 &     leave &  20 &     off \\
\bottomrule
\end{tabular}



## word2vec

In [8]:
# Load the 'word2vec-google-news-300' model through gensim's downloader API.
# Kept as a notebook-level global because get_word2vec() reads `wv` directly.
wv = api.load('word2vec-google-news-300')

In [9]:
def get_word2vec():
    """Print a LaTeX table of the 20 words most similar to "break" in `wv`.

    Reads the notebook-level `wv` model and asks it for the top 20
    matches for 'break'. The similarity scores are discarded; the words
    are laid out as rows of (rank, word, rank + 10, word) and printed
    without a header or index column. Nothing is returned.
    """
    matches = wv.most_similar(positive=['break'], topn=20)
    words = [word for word, _score in matches]
    left_words, right_words = words[:10], words[10:]
    rows = list(zip(range(1, 11), left_words, range(11, 21), right_words))
    table = pd.DataFrame(rows)
    print(table.to_latex(header=None, index=None))

In [10]:
# Print the word2vec neighbor table for "break" (requires `wv` to be loaded).
get_word2vec()

\begin{tabular}{rlrl}
\toprule
  1 &    breaks &  11 &          brief\_respite \\
  2 &  breaking &  12 &  Nadal\_netted\_forehand \\
  3 &     broke &  13 &                 loosen \\
  4 &    broken &  14 &                  smash \\
  5 &     Break &  15 &                    rip \\
  6 &  Breaking &  16 &       overhit\_forehand \\
  7 &  breather &  17 &       miscued\_forehand \\
  8 &   shatter &  18 &                    cut \\
  9 &     crack &  19 &                   slip \\
 10 &   breaker &  20 &                 Breaks \\
\bottomrule
\end{tabular}



## fastText

https://fasttext.cc/docs/en/english-vectors.html

In [11]:
def get_break_neighbors_fasttext(fname):
    """Print the neighbor table for "break" from a fastText ``.vec`` file.

    Parameters
    ----------
    fname : str
        Path to a text file whose first line holds two integers and
        whose remaining lines each hold a token followed by
        space-separated float components.

    Raises
    ------
    ValueError
        If the first line is not exactly two integers, or a vector
        component cannot be parsed as a float.
    """
    data = {}
    # `with` guarantees the file handle is closed, even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed afterwards, but parsing them
        # makes a file without the expected header fail loudly instead of
        # silently losing its first word.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialize the floats now: a lazy `map` object is a one-shot
            # iterator and only works if the consumer happens to accept it.
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    # Transpose so that each word becomes a row.
    df = pd.DataFrame(data).T
    get_break_neighbors(df)

In [12]:
# Neighbors of "break" in reps/wiki-news-300d-1M.vec.
get_break_neighbors_fasttext("reps/wiki-news-300d-1M.vec")

\begin{tabular}{rlrl}
\toprule
  1 &     break &  11 &      follow \\
  2 &    breaks &  12 &       smash \\
  3 &  breaking &  13 &       BREAK \\
  4 &     broke &  14 &       knock \\
  5 &     Break &  15 &  water-main \\
  6 &    broken &  16 &        miss \\
  7 &     crack &  17 &         tie \\
  8 &      take &  18 &          go \\
  9 &   shatter &  19 &       relax \\
 10 &       fix &  20 &       start \\
\bottomrule
\end{tabular}



In [13]:
# Neighbors of "break" in reps/wiki-news-300d-1M-subword.vec.
get_break_neighbors_fasttext("reps/wiki-news-300d-1M-subword.vec")

\begin{tabular}{rlrl}
\toprule
  1 &     break &  11 &        breakin \\
  2 &    breaks &  12 &        breaked \\
  3 &  breaking &  13 &         broken \\
  4 &    breake &  14 &       legbreak \\
  5 &  re-break &  15 &           reak \\
  6 &    break- &  16 &     semi-break \\
  7 &   unbreak &  17 &      minibreak \\
  8 &   breakes &  18 &        breaker \\
  9 &    break. &  19 &  breaking-down \\
 10 &     broke &  20 &      tea-break \\
\bottomrule
\end{tabular}



In [14]:
# Neighbors of "break" in reps/crawl-300d-2M.vec.
get_break_neighbors_fasttext("reps/crawl-300d-2M.vec")

\begin{tabular}{rlrl}
\toprule
  1 &      break &  11 &   break.The \\
  2 &     breaks &  12 &     break.I \\
  3 &   breaking &  13 &    break.It \\
  4 &      Break &  14 &  break.This \\
  5 &      broke &  15 &      broken \\
  6 &  break.And &  16 &    break.So \\
  7 &   Breaking &  17 &    break.In \\
  8 &     break. &  18 &      break- \\
  9 &      BREAK &  19 &      breack \\
 10 &     Breaks &  20 &  break.That \\
\bottomrule
\end{tabular}



In [15]:
# Neighbors of "break" in reps/crawl-300d-2M-subword.vec.
get_break_neighbors_fasttext("reps/crawl-300d-2M-subword.vec")

\begin{tabular}{rlrl}
\toprule
  1 &      break &  11 &        take \\
  2 &     breaks &  12 &      broken \\
  3 &   breaking &  13 &    re-break \\
  4 &      Break &  14 &      breake \\
  5 &      broke &  15 &      abreak \\
  6 &     break. &  16 &   break.But \\
  7 &  break.And &  17 &      break- \\
  8 &    rebreak &  18 &  break.What \\
  9 &   break.So &  19 &        bend \\
 10 &     breack &  20 &  break.That \\
\bottomrule
\end{tabular}

