In [68]:
import pandas as pd

import re
from nltk.corpus import stopwords

import gensim.models.word2vec as w2v

In [69]:
tweets = pd.read_csv('../ethan/washington_outage_tweets.csv')
outages = pd.read_csv('../ethan/washington_outages.csv')

In [70]:
tweets.drop_duplicates(subset='text', inplace=True)

In [197]:
tweets = tweets.drop(
    index=tweets[tweets['outage']==0].index
).reset_index(drop=True)

## Preprocessing
These steps are adapted from the work of Noah Christiansen, Jen Hill, and Vonn Napoleon Johnson, available [here](https://github.com/jenrhill/Power_Outage_Identification).

In [198]:
# set up stopwords, but include certain relevant but common words

stops = [w for w in stopwords.words('english') 
         if w not in ['out','on','off']] + ['']

In [199]:
# this function almost directly taken from christiansen et al's
# tweets_to_tweetlist
# cleans, reformats, tokenizes, and removes stopwords from tweets

def tweets_cleaner(raw):
    
    clean = str(raw).replace('-',' ').replace('#',' ')
    clean = re.sub(r"([@][\w_-]+)", "", clean) # remove @s
    clean = re.sub(r"[^a-zA-Z ]", "", clean)
    clean = re.sub(r"http.*?\b","",clean)
    clean = re.sub(r"pictwitter.*?\b","",clean)
    clean = re.sub(r"www.*?\b","",clean)
    clean = clean.lower().split()
    clean = [w for w in clean if w not in stops]
    return clean

In [200]:
# clean tweets and compound in list

tweet_list = [tweets_cleaner(t)
              for t in tweets['text']]
#    all tweets expected to maybe be outage-related ^

In [201]:
tweets['outage'].value_counts()

-1    2850
 1     184
Name: outage, dtype: int64

In [202]:
tweet_list[190]

['power',
 'mother',
 'nature',
 'snoqualmie',
 'waterfalls',
 'f',
 'sewwsvaijawisovu']

In [203]:
list(tweets['text'])[190]

'The power of Mother Nature @ Snoqualmie Waterfalls https://www.instagram.com/p/BD_rKf2xmLUnsf4rP-f-68SEwWSvAiJAWISOVU0/\xa0…'

In [204]:
# visualize tweets against the words they're reduced down to

pd.DataFrame([(a,b) for a,b in zip(list(tweets['text']), tweet_list)],
             columns=['Tweet','Words'])

Unnamed: 0,Tweet,Words
0,@Number12ForLife No power in Issaquah Highland...,"[power, issaquah, highlands, understand, issaq..."
1,@KING5Seattle @KIRO7Seattle tiger mountain rd ...,"[tiger, mountain, rd, se, block, way, blocking..."
2,@KING5Seattle @KIRO7Seattle SE may valley road...,"[se, may, valley, road, closed, moment, tree, ..."
3,Just lost power. They power better be back on ...,"[lost, power, power, better, back, on]"
4,Update: Power Outages Affecting Renton's Benso...,"[update, power, outages, affecting, rentons, b..."
...,...,...
3029,How are the M’s 17th in the power rankings at ...,"[ms, th, power, rankings, several, teams, ahea..."
3030,Do you have this power?,[power]
3031,"@Super_Sodiq: ""I'm bringing Nigerian Power to ...","[im, bringing, nigerian, power, octagonimagine..."
3032,Mueller's report must be made public— and Cong...,"[muellers, report, must, made, public, congres..."


# EVERYTHING BELOW THIS IS BAD

In [205]:
# Instantiate Word2Vec model
tweets2vec = w2v.Word2Vec(
    sg = 1, # skip-gram train algo
    seed = 1007, # Random Number Generator to make results repeatable
    workers = 3, # number of threads
    size = 300, # Dimensionality of the hidden layer
    min_count = 10, # how many times the word has to appear to be kept in the vocab.
    window = 7, # size of the window to train words
    sample = 1e-5 # downsampling setting for frequent words
)

In [206]:
tweets2vec.build_vocab(tweet_list)

In [207]:
print("Tweets2Vec vocabulary length:", len(tweets2vec.wv.vocab))

Tweets2Vec vocabulary length: 488


In [208]:
tweets2vec.most_similar('outage', topn=20)

  """Entry point for launching an IPython kernel.


[('try', 0.16525691747665405),
 ('find', 0.15612109005451202),
 ('watch', 0.14827287197113037),
 ('class', 0.14548921585083008),
 ('shit', 0.14305120706558228),
 ('theres', 0.12683957815170288),
 ('best', 0.126201793551445),
 ('respect', 0.12417101860046387),
 ('de', 0.1211705133318901),
 ('nbsb', 0.1172221377491951),
 ('give', 0.11638787388801575),
 ('anyone', 0.11514642834663391),
 ('lose', 0.11175920069217682),
 ('something', 0.10313186049461365),
 ('good', 0.10256955027580261),
 ('soon', 0.10226915776729584),
 ('yesterday', 0.097187839448452),
 ('white', 0.0970287099480629),
 ('fear', 0.09576299041509628),
 ('gods', 0.09463256597518921)]

In [209]:
tweets2vec.most_similar(positive=['blackout','power','outage'],
                        negative=['button','internet'],
                        topn=20)

  This is separate from the ipykernel package so we can avoid doing imports until


[('best', 0.1922171711921692),
 ('theres', 0.15430928766727448),
 ('end', 0.1387871950864792),
 ('deal', 0.13815714418888092),
 ('n', 0.13069875538349152),
 ('try', 0.1299016773700714),
 ('coffee', 0.12756924331188202),
 ('cut', 0.12720748782157898),
 ('world', 0.1212422177195549),
 ('never', 0.11831799149513245),
 ('jesus', 0.11743222922086716),
 ('flower', 0.11393364518880844),
 ('anyone', 0.11387676000595093),
 ('pm', 0.11117894947528839),
 ('minute', 0.1104523241519928),
 ('respect', 0.10868483781814575),
 ('think', 0.10465019196271896),
 ('hour', 0.1028299331665039),
 ('change', 0.10028521716594696),
 ('something', 0.09914322942495346)]