In [42]:
import pandas as pd

import re
from nltk.corpus import stopwords

import gensim.models.word2vec as w2v

In [2]:
tweets = pd.read_csv('../ethan/washington_outage_tweets.csv')
outages = pd.read_csv('../ethan/washington_outages.csv')

In [30]:
tweets.drop_duplicates(subset='text', inplace=True)

## Preprocessing
These steps are adapted from the work of Noah Christiansen, Jen Hill, and Vonn Napoleon Johnson, available [here](https://github.com/jenrhill/Power_Outage_Identification).

In [35]:
# set up stopwords, but include certain relevant but common words

stops = [w for w in stopwords.words('english') 
         if w not in ['out','on','off']] + ['']

In [36]:
# this function almost directly taken from christiansen et al's
# tweets_to_tweetlist
# cleans, reformats, tokenizes, and removes stopwords from tweets

def tweets_cleaner(raw):
    clean = raw.replace('-',' ').replace('#',' ')
    clean = re.sub(r"[^a-zA-Z ]", "", clean)
    clean = re.sub(r"http.*?\b","",clean)
    clean = re.sub(r"pictwitter.*?\b","",clean)
    clean = re.sub(r"www.*?\b","",clean)
    clean = clean.lower().split()
    clean = [w for w in clean if w not in stops]
    return clean

In [37]:
# clean tweets and compound in list

tweet_list = [tweets_cleaner(t)
              for t in tweets['text']]

In [40]:
# visualize tweets against the words they're reduced down to

pd.DataFrame([(a,b) for a,b in zip(list(tweets['text']), tweet_list)],
             columns=['Tweet','Words'])

Unnamed: 0,Tweet,Words
0,@Number12ForLife No power in Issaquah Highland...,"[numberforlife, power, issaquah, highlands, un..."
1,@KING5Seattle @KIRO7Seattle tiger mountain rd ...,"[kingseattle, kiroseattle, tiger, mountain, rd..."
2,@KING5Seattle @KIRO7Seattle SE may valley road...,"[kingseattle, kiroseattle, se, may, valley, ro..."
3,Just lost power. They power better be back on ...,"[lost, power, power, better, back, on]"
4,Update: Power Outages Affecting Renton's Benso...,"[update, power, outages, affecting, rentons, b..."
...,...,...
197,Power is restored! We’ll be open regular hours...,"[power, restored, well, open, regular, hours, ..."
198,"Soooo, the power is out. @psetalk says they e...","[soooo, power, out, psetalk, says, expect, us,..."
199,Tree hanging across road on power line. @ Bain...,"[tree, hanging, across, road, on, power, line,..."
200,"Power's out, but we're still open! Feel free t...","[powers, out, still, open, feel, free, come, v..."


# EVERYTHING BELOW THIS IS BAD

In [59]:
# Instantiate Word2Vec model
tweets2vec = w2v.Word2Vec(
    sg = 1, # skip-gram train algo
    seed = 1007, # Random Number Generator to make results repeatable
    workers = 3, # number of threads
    size = 300, # Dimensionality of the hidden layer
    min_count = 2, # how many times the word has to appear to be kept in the vocab.
    window = 7, # size of the window to train words
    sample = 1e-5 # downsampling setting for frequent words
)

In [60]:
tweets2vec.build_vocab(tweet_list)

In [61]:
print("Tweets2Vec vocabulary length:", len(tweets2vec.wv.vocab))

Tweets2Vec vocabulary length: 247


In [67]:
tweets2vec.most_similar('blackout')

  """Entry point for launching an IPython kernel.


[('today', 0.2116914987564087),
 ('call', 0.15200787782669067),
 ('grill', 0.11780968308448792),
 ('come', 0.11520563066005707),
 ('breakfast', 0.1140291839838028),
 ('fuck', 0.11393752694129944),
 ('first', 0.11338429898023605),
 ('wasnow', 0.11179263889789581),
 ('sb', 0.1097029447555542),
 ('service', 0.10533488541841507)]