In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from gensim.models import phrases, word2vec
import nltk
import numpy as np
import codecs
from collections import Counter
from nltk.corpus import stopwords
import argparse
import os
import string
import re
import pandas as pd
from gensim.models import KeyedVectors

In [3]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
stopwords = set(stopwords.words('english'))

In [0]:
root_dir = "/content/drive/My Drive/Polygence/Audrey/"

## Load data

In [0]:
data = pd.read_csv(root_dir + "/data/all_articles_dependencies.csv")

In [0]:
all_text = " ".join(data['coref_resolved'])

## Clean text

In [0]:
punct_chars = list((set(string.punctuation) | {'»', '–', '—', '-',"­", '\xad', '-', '◾', '®', '©','✓','▲', '◄','▼','►', '~', '|', '“', '”', '…', "'", "`", '_', '•', '*', '■'} - {"'"}))
punct_chars.sort()
punctuation = ''.join(punct_chars)
replace = re.compile('[%s]' % re.escape(punctuation))
sno = nltk.stem.SnowballStemmer('english')
printable = set(string.printable)

In [0]:
def clean_text(text):
    # lower case
    text = text.lower()
    # eliminate urls
    text = re.sub(r'http\S*|\S*\.com\S*|\S*www\S*', ' ', text)
    # substitute all other punctuation with whitespace
    text = replace.sub(' ', text)
    # replace all whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # strip off spaces on either end
    text = text.strip()
    # make sure all chars are printable
    text = ''.join([c for c in text if c in printable])
    words = text.split()
    # remove numeric
    words = [w for w in words if not w.isdigit()]
    return words

In [10]:
# split text into sentences (nltk.sent_tokenize)
sents = nltk.sent_tokenize(all_text)
print(len(sents))

492298


In [0]:
all_sentences = [clean_text(sent) for sent in sents]

## Create model

In [0]:
# Create vocabulary model
bigrams = phrases.Phrases(all_sentences, min_count=5, delimiter=b' ', common_terms=stopwords)

In [0]:
# Create vocabulary of bigrams
print("Creating vocabulary...")
data = bigrams[all_sentences]
vocab = [w for sent in data for w in sent]
vocab = [w for w, count in Counter(vocab).most_common() if count >= 5]

# Save vocab
with codecs.open(os.path.join(root_dir + "models", 'word2vec_vocab.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(vocab))

Creating vocabulary...




In [0]:
# Run word2vec model with bootstrapping
sample_size = int(0.8 * len(all_sentences))
for run_idx in range(20):
    print("Run #%d" % run_idx)
    data = bigrams[np.random.choice(all_sentences, sample_size, replace=True)]
    model = word2vec.Word2Vec(data, size=200, window=5, sg=1, min_count=5, workers=10)
    model.wv.save(os.path.join(root_dir + "models", "word2vec_model_" + str(run_idx) + '.wv'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


'\n# Run word2vec model with bootstrapping\nsample_size = int(0.8 * len(all_sentences))\nfor run_idx in range(20):\n    print("Run #%d" % run_idx)\n    data = bigrams[np.random.choice(all_sentences, sample_size, replace=True)]\n    model = word2vec.Word2Vec(data, size=200, window=5, sg=1, min_count=5, workers=10)\n    model.wv.save(os.path.join(root_dir + "models", "word2vec_model_" + str(run_idx) + \'.wv\'))\n'

In [0]:
# Run for each source
def run_word2vec_for_source(data, source):
  data = data[data["source"] == source]
  all_text = " ".join(data['coref_resolved'])
  sents = nltk.sent_tokenize(all_text)
  print(len(sents))
  all_sentences = [clean_text(sent) for sent in sents]
  bigrams = phrases.Phrases(all_sentences, min_count=5, delimiter=b' ', common_terms=stopwords)
  # Create vocabulary of bigrams
  print("Creating vocabulary...")
  data = bigrams[all_sentences]
  vocab = [w for sent in data for w in sent]
  vocab = [w for w, count in Counter(vocab).most_common() if count >= 5]

  # Save vocab
  with codecs.open(os.path.join(root_dir + "models", source + '_vocab.txt'), 'w', encoding='utf-8') as f:
      f.write('\n'.join(vocab))

  # Run word2vec model for source
  model = word2vec.Word2Vec(data, size=200, window=5, sg=1, min_count=5, workers=10)
  model.wv.save(os.path.join(root_dir + "models", source + "_model.wv"))

In [0]:
sources = data.groupby("source").filter(lambda x: len(x) > 200)["source"].unique()
sources

array(['abc-news', 'associated-press', 'breitbart-news', 'cbs-news',
       'cnn', 'fox-news', 'national-review', 'nbc-news', 'newsweek',
       'new-york-magazine', 'politico', 'reuters', 'the-hill',
       'the-new-york-times', 'the-washington-post', 'usa-today'],
      dtype=object)

In [0]:
for source in sources:
  print(source)
  run_word2vec_for_source(data, source)

abc-news
17559
Creating vocabulary...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


associated-press
26144
Creating vocabulary...
breitbart-news
19440
Creating vocabulary...
cbs-news
45477
Creating vocabulary...
cnn
19275
Creating vocabulary...
fox-news
42761
Creating vocabulary...
national-review
7621
Creating vocabulary...
nbc-news
27887
Creating vocabulary...
newsweek
20954
Creating vocabulary...
new-york-magazine
22481
Creating vocabulary...
politico
73295
Creating vocabulary...
reuters
16092
Creating vocabulary...
the-hill
36107
Creating vocabulary...
the-new-york-times
25252
Creating vocabulary...
the-washington-post
47832
Creating vocabulary...
usa-today
21518
Creating vocabulary...


In [0]:
def get_models(filelist):
    model_files = [f for f in filelist if f.endswith('.wv')]
    models = [KeyedVectors.load(fname, mmap='r') for fname in model_files]
    return models

In [13]:
filelist = []
for subdir, dirs, files in os.walk(root_dir + "models/word2vec_sources"):
  for file in files:
    filelist.append(os.path.join(subdir, file))
models = get_models(filelist)
len(models)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


16

In [14]:
# Get vocab (intersection of all vocab)
vocab = set(models[0].vocab)
for m in models:
    vocab &= set(m.vocab)
len(vocab)

2430

In [0]:
idx2word = {i: w for i, w in enumerate(list(vocab))}

In [0]:
def get_closest(queries, models, vocab, idx2word):
    cosines = []
    for m in models:
        cosines.append([np.mean([m.similarity(q, word) for q in queries]) for word in vocab])
    cosines = np.mean(np.array(cosines), axis=0)
    return [(idx2word[idx], cosines[idx]) for idx in cosines.argsort()[-20:][::-1]]

In [0]:
def filter_words(words, vocab):
    words = set(words)
    not_in_vocab = words - vocab
    if not_in_vocab:
        print("Not in vocab:")
        print(not_in_vocab)
    return list(words - not_in_vocab)

In [18]:
queries = ["power"] # this should be a list of queries (it can also be just one query)

 # Remove queries not in vocab
queries = filter_words(queries, vocab)

print("Getting most similar words...")
closest = get_closest(queries, models, vocab, idx2word)
for (w, c) in closest:
    print("%s %.2f" % (w, c))

Getting most similar words...


  if np.issubdtype(vec.dtype, np.int):


power 1.00
influence 0.78
authority 0.77
powers 0.76
leverage 0.76
prevent 0.75
undermine 0.75
constitutional 0.74
control 0.74
interests 0.74
conduct 0.74
damage 0.74
protect 0.74
abuse 0.74
actions 0.74
constitution 0.73
carry 0.73
pursue 0.73
acts 0.73
engage 0.73


In [0]:
from scipy.stats import ttest_ind

In [0]:
def get_cosines(name1, name2, words1, words2, queries, models):
    df_w1 = []
    df_w2 = []
    df_q = []
    df_type = []
    df_pvals = []
    for key, values in queries.items():
        for q in values:
            vals1 = [m.similarity(word1, q) for m in models for word1 in words1]
            vals2 = [m.similarity(word2, q) for m in models for word2 in words2]
            df_w1.append(np.mean(vals1))
            df_w2.append(np.mean(vals2))
            df_q.append(q)
            df_type.append(key)
            df_pvals.append(ttest_ind(vals1, vals2)[1])
    df = pd.DataFrame({name1: df_w1, name2: df_w2, 'query': df_q, 'word category': df_type, "p value": df_pvals})
    return df

In [25]:
queries = {"power": ["powerful", "power", "authority", "powers", "influence"]}
words1 = ["harris", "kamala", "kamala harris"]
words2 =  ["biden", "joe", "joe biden"]
# Remove queries not in vocab
words1 = filter_words(words1, vocab)
words2 = filter_words(words2, vocab)
for k, v in queries.items():
    queries[k] = filter_words(v, vocab)

Not in vocab:
{'kamala'}


In [26]:
get_cosines("harris", "biden", words1, words2, queries, models)

  if np.issubdtype(vec.dtype, np.int):


Unnamed: 0,harris,biden,query,word category,p value
0,0.322345,0.401408,influence,power,0.014362
1,0.360954,0.435324,powers,power,0.0463
2,0.351781,0.398524,powerful,power,0.203388
3,0.352849,0.43135,authority,power,0.061056
4,0.190589,0.319343,power,power,6.3e-05
