In [1]:
import json
import os

# NOTE(review): the wildcard imports below hide which project module provides
# GloVe, WordBank, load_seedlings, rank_density, SemanticGraph and
# plot_semantic_graph (all used later in this notebook). Prefer explicit
# `from module import name` once the owning modules are confirmed.
from words import *
from top_dense import *
from wordbank import *
from seedlings import *

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's offline notebook mode so the iplot() calls further down
# can render figures inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN (network
# access needed when viewing) instead of embedding it - confirm in plotly docs.
init_notebook_mode(connected=True)




In [3]:
# (Re)load IPython's autoreload extension and switch it to mode 2 so that edits
# to the imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

## Load the word vectors

We're using the 42-billion-token, 300-dimensional GloVe model
(glove.42B.300d.zip from http://nlp.stanford.edu/projects/glove/),
which has 1.9 million words in its vocabulary. The original
vector file was parsed into a numpy array, and that
array was saved to disk. The function that does
this parsing is words.create_numpy_from_glove().
Here we load that numpy array and the vocab dictionary
from the output of that function.

In [5]:
# Build the GloVe model object from the pre-parsed vocab dictionary and the
# saved numpy vector array (both under data/model/).
glove = GloVe(
    vocab="data/model/dict_glove_42b_300",
    vectors="data/model/vectors_glove_42b_300.npy",
)

### Load the WordBank data

Load the WordBank data. This dataset is a measure of the words currently in the infants' vocabulary. We filter these words down to just nouns so that we can compare them with the SEEDLings data, using the "category" and "type" fields of each word to decide what counts as a noun. These words will be passed into the semantic graph generator. We'll do the same thing for the environmental linguistic input using the SEEDLings corpus.

The resulting wordmap object is a dictionary mapping keys to words. Here the key is just the name of the input dataset, i.e. wordbank_english.

In [78]:
# Construct the WordBank object from the English CSV, then derive its wordmap
# (a dictionary keyed by the name of the input dataset).
wordbank_english = WordBank(
    input="data/wb_cdi/wb_eng.csv",
)
wb_eng_wordmap = wordbank_english.wordmap()

### Load SEEDLings Monthly Data

We load the aggregated monthly words from the SEEDLings corpus. Each month is its own dataset (i.e. a key in the wordmap).

In [21]:
# Monthly aggregated SEEDLings words; each month is its own key in the wordmap.
seedlings_wordmap = load_seedlings(
    "data/seedlings/bl"
)

### Generate semantic graphs

For every word, add an edge to every other word whose cosine similarity to it is within some threshold (i.e. $\cos(\theta) \geq 1.0 - \text{threshold}$). These graphs are output in JSON form, with each word as a key and a list of its neighbors as the value. We generate a distinct graph for each value in a range of thresholds: the "start" and "end" parameters specify this range, and the "step" parameter specifies its granularity.

In [79]:
# English WordBank: generate one semantic graph per cosine threshold in the
# configured range.
# NOTE(review): end=0.81 presumably makes 0.80 the last threshold if the range
# is end-exclusive - confirm against graph_cosine_range.
glove.graph_cosine_range(
    output_path="english_wordbank",
    wordmap=wb_eng_wordmap,
    start=0.3,
    end=0.81,
    step=0.01,
)

In [None]:
# Monthly aggregated SEEDLings: generate one semantic graph per cosine
# threshold in the configured range, for every month in the wordmap.
# NOTE(review): end=0.81 presumably makes 0.80 the last threshold if the range
# is end-exclusive - confirm against graph_cosine_range.
glove.graph_cosine_range(
    output_path="seedlings",
    wordmap=seedlings_wordmap,
    start=0.3,
    end=0.81,
    step=0.01,
)

### Rank nodes in each graph by their degree

For each of the graphs that we generated in the previous step, we output a CSV file of the top-ranked words (by degree).

In [80]:
# Rank the nodes of every English WordBank semantic graph by degree and write
# the rankings out as CSV.
rank_density(
    input_path="data/output/english_wordbank/semgraphs",
    output_path="data/output/english_wordbank/ranked_out",
)

In [None]:
# Rank the nodes of every SEEDLings semantic graph by degree and write the
# rankings out as CSV.
rank_density(
    input_path="data/output/seedlings/semgraphs",
    output_path="data/output/seedlings/ranked_out",
)

## Plot semantic networks


In [14]:
# WordBank semantic network at cosine threshold 0.45.
# The graph file that is loaded and the `thresh` passed to SemanticGraph must
# describe the same threshold. This cell is the first of the 0.45 / 0.5 / 0.55 /
# 0.6 series plotted in this section, so the graph is read from the cosine_0.45
# directory (it previously pointed at cosine_0.4 while passing thresh=0.45).
graph_path = "data/output/english_wordbank/semgraphs/cosine_0.45/semgraph_wb_eng"

graph = SemanticGraph(
    source="WordBank",
    sim_func="cos",
    thresh=0.45,
    path=graph_path,
)

fig = plot_semantic_graph(graph)

# Render the figure inline in the notebook.
iplot(fig, filename='semantic_graph')

In [15]:
# WordBank semantic network at cosine threshold 0.5.
graph_path = "data/output/english_wordbank/semgraphs/cosine_0.5/semgraph_wb_eng"

graph = SemanticGraph(
    source="WordBank",
    sim_func="cos",
    thresh=0.5,
    path=graph_path,
)

fig = plot_semantic_graph(graph)

# Render the figure inline in the notebook.
iplot(fig, filename='semantic_graph')

In [16]:
# WordBank semantic network at cosine threshold 0.55.
graph_path = "data/output/english_wordbank/semgraphs/cosine_0.55/semgraph_wb_eng"

graph = SemanticGraph(
    source="WordBank",
    sim_func="cos",
    thresh=0.55,
    path=graph_path,
)

fig = plot_semantic_graph(graph)

# Render the figure inline in the notebook.
iplot(fig, filename='semantic_graph')

In [17]:
# WordBank semantic network at cosine threshold 0.6.
graph_path = "data/output/english_wordbank/semgraphs/cosine_0.6/semgraph_wb_eng"

graph = SemanticGraph(
    source="WordBank",
    sim_func="cos",
    thresh=0.6,
    path=graph_path,
)

fig = plot_semantic_graph(graph)

# Render the figure inline in the notebook.
iplot(fig, filename='semantic_graph')