In [68]:
import pandas as pd

import re
from nltk.corpus import stopwords

import gensim.models.word2vec as w2v

In [69]:
tweets = pd.read_csv('../ethan/washington_outage_tweets.csv')
outages = pd.read_csv('../ethan/washington_outages.csv')

In [70]:
tweets.drop_duplicates(subset='text', inplace=True)

## Preprocessing
These steps are adapted from the work of Noah Christiansen, Jen Hill, and Vonn Napoleon Johnson, available [here](https://github.com/jenrhill/Power_Outage_Identification).

In [71]:
# set up stopwords, but include certain relevant but common words

stops = [w for w in stopwords.words('english') 
         if w not in ['out','on','off']] + ['']

In [99]:
# this function almost directly taken from christiansen et al's
# tweets_to_tweetlist
# cleans, reformats, tokenizes, and removes stopwords from tweets

def tweets_cleaner(raw):
    
    clean = str(raw).replace('-',' ').replace('#',' ')
    clean = re.sub(r"([@][\w_-]+)", "", clean) # remove @s
    clean = re.sub(r"[^a-zA-Z ]", "", clean)
    clean = re.sub(r"http.*?\b","",clean)
    clean = re.sub(r"pictwitter.*?\b","",clean)
    clean = re.sub(r"www.*?\b","",clean)
    clean = clean.lower().split()
    clean = [w for w in clean if w not in stops]
    return clean

In [100]:
# clean tweets and compound in list

tweet_list = [tweets_cleaner(t)
              for t in tweets['text']]

In [101]:
# visualize tweets against the words they're reduced down to

pd.DataFrame([(a,b) for a,b in zip(list(tweets['text']), tweet_list)],
             columns=['Tweet','Words'])

Unnamed: 0,Tweet,Words
0,@Number12ForLife No power in Issaquah Highland...,"[power, issaquah, highlands, understand, issaq..."
1,@KING5Seattle @KIRO7Seattle tiger mountain rd ...,"[tiger, mountain, rd, se, block, way, blocking..."
2,@KING5Seattle @KIRO7Seattle SE may valley road...,"[se, may, valley, road, closed, moment, tree, ..."
3,Just lost power. They power better be back on ...,"[lost, power, power, better, back, on]"
4,Update: Power Outages Affecting Renton's Benso...,"[update, power, outages, affecting, rentons, b..."
...,...,...
8379,How are the M’s 17th in the power rankings at ...,"[ms, th, power, rankings, several, teams, ahea..."
8380,Do you have this power?,[power]
8381,"@Super_Sodiq: ""I'm bringing Nigerian Power to ...","[im, bringing, nigerian, power, octagonimagine..."
8382,Mueller's report must be made public— and Cong...,"[muellers, report, must, made, public, congres..."


# EVERYTHING BELOW THIS IS BAD

In [120]:
# Instantiate Word2Vec model
tweets2vec = w2v.Word2Vec(
    sg = 1, # skip-gram train algo
    seed = 1007, # Random Number Generator to make results repeatable
    workers = 3, # number of threads
    size = 300, # Dimensionality of the hidden layer
    min_count = 10, # how many times the word has to appear to be kept in the vocab.
    window = 7, # size of the window to train words
    sample = 1e-5 # downsampling setting for frequent words
)

In [121]:
tweets2vec.build_vocab(tweet_list)

In [122]:
print("Tweets2Vec vocabulary length:", len(tweets2vec.wv.vocab))

Tweets2Vec vocabulary length: 1155


In [130]:
tweets2vec.most_similar('outage', topn=20)

  """Entry point for launching an IPython kernel.


[('national', 0.16528137028217316),
 ('try', 0.16525691747665405),
 ('travel', 0.15678316354751587),
 ('find', 0.15612109005451202),
 ('rtee', 0.15053236484527588),
 ('visit', 0.14896191656589508),
 ('watch', 0.14827287197113037),
 ('class', 0.14548921585083008),
 ('shit', 0.14305120706558228),
 ('present', 0.1429540365934372),
 ('shut', 0.13828980922698975),
 ('members', 0.13651598989963531),
 ('follow', 0.1328839659690857),
 ('works', 0.13122248649597168),
 ('bend', 0.12886610627174377),
 ('deep', 0.12750190496444702),
 ('theres', 0.12683957815170288),
 ('best', 0.126201793551445),
 ('online', 0.12507817149162292),
 ('respect', 0.12417101860046387)]

In [129]:
tweets2vec.most_similar(positive=['blackout','power','outage'],
                        negative=['button','internet'],
                        topn=20)

  This is separate from the ipykernel package so we can avoid doing imports until


[('best', 0.1922171711921692),
 ('incredible', 0.17450061440467834),
 ('forever', 0.1728125512599945),
 ('simple', 0.16075347363948822),
 ('theres', 0.15430928766727448),
 ('birthday', 0.14712947607040405),
 ('end', 0.1387871950864792),
 ('deal', 0.13815714418888092),
 ('upset', 0.13406437635421753),
 ('endorphins', 0.13295292854309082),
 ('deep', 0.13248813152313232),
 ('n', 0.13069875538349152),
 ('clayton', 0.12997141480445862),
 ('try', 0.1299016773700714),
 ('field', 0.12989607453346252),
 ('yo', 0.12924906611442566),
 ('coffee', 0.12756924331188202),
 ('cut', 0.12720748782157898),
 ('market', 0.1241501122713089),
 ('world', 0.1212422177195549)]