In [6]:
import pandas as pd
import os
import random
import gensim.models.doc2vec

from gensim.models.doc2vec import Doc2Vec


df = pd.read_csv('./bostonGlobe/bostonglobe2014.csv')
df.head()

Unnamed: 0,text
0,['The Boston Police Department has issued a co...
1,"['\n individuals who worked in finance, insura..."
2,"['Dr. Kenneth C. Edelin, whose historic 1975 m..."
3,['The first words in the Bible are “In the beg...
4,"['When they go up, they form the centerpiece o..."


In [8]:
df.iloc[5,0]

"['First Night Boston almost didn’t happen this year. Money ran out, budgets were cut, and then Mayor Thomas M. Menino, in one of his last acts, made sure the show would go on. Here are some of the highlights. For a full listing of events, details on where to buy the $10 First Night buttons, as well as venue information, go to \\n\\n\\n\\n\\n\\nFolk music for all ages is demonstrated by Dan Zanes, former member of  the Del Fuegos band, and Elizabeth Mitchell, a Smithsonian Folkways recording artist. In 2007 Zanes received a Grammy Award and Mitchell was a nominee for the 56th annual Grammy Awards for Best Children’s Album. \\n\\n\\nGroove to the beat of Afro-Brazilian drumming put together by Grooversity and IntAfrika. Created by Brazilian percussionist Marcus Santos, the Grooversity entertainment group not only educates, but also engages others to be a part of social change. \\n\\n\\n\\nAdults and children can learn how to walk the tight wire, use the aerial lyra, and ride the gym whe

In [9]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "\xc2", "\xa0",
             "\x80", "\x9c", "\x99", "\x94", "\xad", "\xe2", "\x9d"]

for char in spec_chars:
    df['text'] = df['text'].str.replace(char, ' ')

In [10]:
df.iloc[5,0]

'  First Night Boston almost didn’t happen this year  Money ran out  budgets were cut  and then Mayor Thomas M  Menino  in one of his last acts  made sure the show would go on  Here are some of the highlights  For a full listing of events  details on where to buy the $10 First Night buttons  as well as venue information  go to  n n n n n nFolk music for all ages is demonstrated by Dan Zanes  former member of  the Del Fuegos band  and Elizabeth Mitchell  a Smithsonian Folkways recording artist  In 2007 Zanes received a Grammy Award and Mitchell was a nominee for the 56th annual Grammy Awards for Best Children’s Album   n n nGroove to the beat of Afro Brazilian drumming put together by Grooversity and IntAfrika  Created by Brazilian percussionist Marcus Santos  the Grooversity entertainment group not only educates  but also engages others to be a part of social change   n n n nAdults and children can learn how to walk the tight wire  use the aerial lyra  and ride the gym wheels  If these

In [11]:
def tokenize(text, stopwords, max_len = 20):
    return [token for token in gensim.utils.simple_preprocess(text, max_len=max_len) if token not in stopwords]

In [13]:
articles = df.values.tolist()
articles_flat = [item for sublist in articles for item in sublist]
tagged_docs = [gensim.models.doc2vec.TaggedDocument(tokenize(text, [], max_len=200), [i]) for i, text in enumerate(articles_flat)]

In [14]:
tagged_docs[5]

TaggedDocument(words=['first', 'night', 'boston', 'almost', 'didn', 'happen', 'this', 'year', 'money', 'ran', 'out', 'budgets', 'were', 'cut', 'and', 'then', 'mayor', 'thomas', 'menino', 'in', 'one', 'of', 'his', 'last', 'acts', 'made', 'sure', 'the', 'show', 'would', 'go', 'on', 'here', 'are', 'some', 'of', 'the', 'highlights', 'for', 'full', 'listing', 'of', 'events', 'details', 'on', 'where', 'to', 'buy', 'the', 'first', 'night', 'buttons', 'as', 'well', 'as', 'venue', 'information', 'go', 'to', 'nfolk', 'music', 'for', 'all', 'ages', 'is', 'demonstrated', 'by', 'dan', 'zanes', 'former', 'member', 'of', 'the', 'del', 'fuegos', 'band', 'and', 'elizabeth', 'mitchell', 'smithsonian', 'folkways', 'recording', 'artist', 'in', 'zanes', 'received', 'grammy', 'award', 'and', 'mitchell', 'was', 'nominee', 'for', 'the', 'th', 'annual', 'grammy', 'awards', 'for', 'best', 'children', 'album', 'ngroove', 'to', 'the', 'beat', 'of', 'afro', 'brazilian', 'drumming', 'put', 'together', 'by', 'groove

In [15]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=30, epochs=40, window=2, dm=1)
model.build_vocab(tagged_docs)
model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)

In [16]:
vector = model.infer_vector(['black', 'african american', 'african-american', 'haitian', 'jamaican', 'west indian', 'dominican'])

print(vector)

[-8.58766958e-02  7.44623095e-02  7.96360895e-02  8.87808353e-02
 -1.78865358e-01 -7.85488725e-01 -1.97267607e-01 -5.58691859e-01
 -3.04320663e-01  2.62528569e-01  3.38720948e-01  7.69906566e-02
 -8.63881968e-03 -5.98610580e-01 -7.88958251e-01  1.88517079e-01
  4.46221054e-01  4.55275655e-01  2.15268180e-01 -1.24387398e-01
  4.76542145e-01 -7.22765625e-01 -2.06352636e-01  1.00681755e-05
  1.14174232e-01  3.78336728e-01 -3.76232922e-01  1.47920907e-01
 -1.15364231e-02 -1.80365860e-01]


In [20]:
def pick_random_word(model, threshold=10):
    # pick a random word with a suitable number of occurences
    while True:
        word = random.choice(model.wv.index_to_key)
        if model.wv.get_vecattr(word, "count") > threshold:
            return word

#target_word = pick_random_word(model)
# or uncomment below line, to just pick a word from the relevant domain:
target_word = ['black', 'haitian', 'jamaican', 'dominican']

for w in target_word:
    print(f'target_word: {repr(w)} model: {model} similar words:')
    for i, (word, sim) in enumerate(model.wv.most_similar(w, topn=10), 1):
        print(f'    {i}. {sim:.2f} {repr(word)}')
    print()

target_word: 'black' model: Doc2Vec(dm/m,d30,n5,w2,mc5,s0.001,t3) similar words:
    1. 0.76 'masked'
    2. 0.73 'hispanic'
    3. 0.72 'transgender'
    4. 0.70 'latino'
    5. 0.69 'asian'
    6. 0.69 'vietnamese'
    7. 0.69 'muslim'
    8. 0.68 'liberian'
    9. 0.66 'kidnaps'
    10. 0.66 'grieving'

target_word: 'haitian' model: Doc2Vec(dm/m,d30,n5,w2,mc5,s0.001,t3) similar words:
    1. 0.76 'american'
    2. 0.74 'japanese'
    3. 0.71 'captive'
    4. 0.71 'verdean'
    5. 0.70 'irish'
    6. 0.69 'creole'
    7. 0.69 'asian'
    8. 0.68 'eric'
    9. 0.67 'chinese'
    10. 0.67 'diaspora'

target_word: 'jamaican' model: Doc2Vec(dm/m,d30,n5,w2,mc5,s0.001,t3) similar words:
    1. 0.75 'spicy'
    2. 0.72 'juice'
    3. 0.72 'organic'
    4. 0.71 'stomping'
    5. 0.70 'onions'
    6. 0.69 'burmese'
    7. 0.68 'unlawfully'
    8. 0.67 'boxy'
    9. 0.67 'savory'
    10. 0.66 'émigré'

target_word: 'dominican' model: Doc2Vec(dm/m,d30,n5,w2,mc5,s0.001,t3) similar words:
    1. 