## Import Dependencies

In [16]:
import re 
import pandas as pd
import requests
from bs4 import BeautifulSoup
import html2text
import stanza
import spacy
from spacy_stanza import StanzaLanguage
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import wordninja

In [17]:
data = pd.read_csv('data - data.csv')
data = data[['text']]
data['clean_text'] = data.text

In [18]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)
#remove non-ascii characters
data.clean_text = data.clean_text.map(lambda x: _removeNonAscii(x))


In [19]:
# remove special characters, numbers, punctuations
import contractions

# data['clean_text'] = data['clean_text'].apply(lambda x: re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) )
data['clean_text'] = data['clean_text'].apply(lambda x: re.sub(r'http\S+', '', x))
data['clean_text'] = data['clean_text'].apply(
    lambda x: [contractions.fix(word) for word in x.split()])


In [20]:
data['clean_text'] = data['clean_text'].apply(lambda x: ' '.join(x))

In [21]:
data['clean_text'] = data['clean_text'].str.replace("[^a-zA-Z0-9]", " ")

In [22]:
data.clean_text = data.clean_text.apply(lambda x: [wordninja.split(word) for word in x.split()])

In [23]:
data.clean_text = data.clean_text.apply(lambda x:[item for sublist in x for item in sublist] )

In [24]:
data.clean_text = data.clean_text.apply(lambda x:' '.join(x))

In [25]:
import spacy
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

In [26]:
%%time
# function to lemmatize text
def lemmatization(texts):
    output = []
    for i in texts:
        s = [token.lemma_ for token in nlp(i)]
        output.append(' '.join(s))
    return output
data.clean_text = lemmatization(data.clean_text)

CPU times: user 44.1 s, sys: 599 ms, total: 44.7 s
Wall time: 50.8 s


In [27]:
data

Unnamed: 0,text,clean_text
0,Just happened a terrible car crash,just happen a terrible car crash
1,Our Deeds are the Reason of this #earthquake M...,-PRON- deed be the reason of this earthquake M...
2,"Heard about #earthquake is different cities, s...",hear about earthquake be different city stay s...
3,"there is a forest fire at spot pond, geese are...",there be a forest fire at spot pond goose be f...
4,Forest fire near La Ronge Sask. Canada,forest fire near La Rong e S ask Canada
...,...,...
10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,m 1 94 01 04 UTC 5 km S of Volcano Hawaii
10872,Police investigating after an e-bike collided ...,Police investigate after an e bike collide wit...
10873,The Latest: More Homes Razed by Northern Calif...,the late More home raze by Northern California...
10874,MEG issues Hazardous Weather Outlook (HWO) htt...,MEG issue Hazardous Weather Outlook H wo


In [28]:
#turn all comments' tokens into one single list
data.clean_text  = data.clean_text.apply(lambda x: x.split())
unlist_comments = [item for items in data.clean_text for item in items]

> We will then use NLTK’s tools to generate all possible bigrams and trigrams

In [29]:
import nltk
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)

## Methods to Rank Collocations

### Counting frequencies of adjacent words with part of speech filters:

In [30]:
bigram_freq = bigramFinder.ngram_fd.items()

In [31]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [32]:
bigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"(-PRON-, be)",1552
1,"(in, the)",430
2,"(do, not)",405
3,"(be, a)",391
4,"(of, the)",375


In [33]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
149,"(-PRON-, be)",1552
166,"(in, the)",430
298,"(do, not)",405
29,"(be, a)",391
158,"(of, the)",375
231,"(-PRON-, have)",361
161,"(and, -PRON-)",329
1243,"(be, not)",229
297,"(-PRON-, do)",227
452,"(-PRON-, will)",226


In [85]:
### issue with this is adjacent spaces, stop words, articles, prepositions or pronouns are common and are not meaningful
### To fix this, we filter out for collocations not containing stop words and filter for only the following structures:
### Bigrams: (Noun, Noun), (Adjective, Noun)
### Trigrams: (Adjective/Noun, Anything, Adjective/Noun)
### This is a common structure used in literature and generally works well.

In [34]:
en_stopwords = set(stopwords.words('english'))
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [35]:
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [36]:
filtered_bi[:10]

Unnamed: 0,bigram,freq
621,"(-PRON-, -PRON-)",155
3282,"(-PRON-, think)",83
13904,"(suicide, bomber)",79
1395,"(-PRON-, feel)",73
2126,"(think, -PRON-)",72
4155,"(YouTube, video)",62
17538,"(burn, building)",57
646,"(-PRON-, S)",57
5590,"(Northern, California)",54
20523,"(tis, ed)",50


In [37]:

trigram_freq = trigramFinder.ngram_fd.items()

In [38]:
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [39]:
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"(-PRON-, do, not)",157
1,"(-PRON-, be, a)",108
2,"(be, go, to)",102
3,"(-PRON-, be, not)",99
4,"(-PRON-, can, not)",93


In [40]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
311,"(-PRON-, do, not)",157
5017,"(-PRON-, be, a)",108
1530,"(be, go, to)",102
1902,"(-PRON-, be, not)",99
42,"(-PRON-, can, not)",93
2634,"(and, -PRON-, be)",91
1529,"(-PRON-, be, go)",75
4839,"(-PRON-, like, a)",67
4840,"(like, a, YouTube)",61
4841,"(a, YouTube, video)",61


In [41]:
def rightTypesTri(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or '  ' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False

In [42]:
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [43]:
filtered_tri[:10]

Unnamed: 0,trigram,freq
25909,"(trauma, tis, ed)",50
114177,"(PKK, suicide, bomber)",42
114176,"(old, PKK, suicide)",41
80974,"(pre, break, good)",41
41511,"(MH, 370, Malaysia)",36
114193,"(Turkey, Army, trench)",34
105973,"(airport, get, swallow)",34
51413,"(Obama, declare, disaster)",32
114194,"(Army, trench, release)",32
100088,"(Quarantine, Offensive, Content)",28


## PMI 
### The main intuition is that it measures how much more likely the words co-occur than if they were independent.

In [44]:
freq_bi = filtered_bi[:20].bigram.values

In [45]:
freq_tri = filtered_tri[:20].trigram.values

In [46]:
bigramFinder.apply_freq_filter(20)

In [47]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [48]:
bigramPMITable[:10]

Unnamed: 0,bigram,PMI
0,"(Madhya, Pradesh)",12.963405
1,"(GB, BO)",12.963405
2,"(Pamela, Geller)",12.899275
3,"(Virgin, galactic)",12.577347
4,"(Temple, Mount)",12.529752
5,"(Quarantine, Offensive)",12.515946
6,"(Offensive, Content)",12.46864
7,"(Fun, tenn)",12.422837
8,"(Stock, Market)",12.280926
9,"(recount, horror)",12.137435


In [49]:
trigramFinder.apply_freq_filter(20)

In [50]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [51]:
trigramPMITable[:10]

Unnamed: 0,trigram,PMI
0,"(Quarantine, Offensive, Content)",24.984587
1,"(anthrax, lab, mishap)",24.637439
2,"(Stock, Market, Crash)",24.031338
3,"(Salt, River, Wild)",23.896408
4,"(River, Wild, Horse)",23.740289
5,"(Turkey, Army, trench)",23.144749
6,"(Malaysia, PM, Investigators)",22.965325
7,"(hot, Fun, tenn)",22.9436
8,"(horrible, sub, reddit)",22.914535
9,"(wreckage, conclusively, confirm)",22.860673


In [52]:
pmi_bi = bigramPMITable[:20].bigram.values
pmi_tri = trigramPMITable[:20].trigram.values

In [53]:
bigramTtable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

In [54]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"(-PRON-, be)",31.899863
1,"(do, not)",19.809452
2,"(in, the)",16.646294
3,"(of, the)",15.552997
4,"(-PRON-, have)",15.4581


In [55]:
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]

In [56]:
filteredT_bi[:10]

Unnamed: 0,bigram,t
31,"(suicide, bomber)",8.879703
42,"(-PRON-, think)",7.930487
44,"(YouTube, video)",7.854063
46,"(-PRON-, feel)",7.690244
52,"(burn, building)",7.517279
55,"(Northern, California)",7.338377
59,"(think, -PRON-)",7.218402
62,"(tis, ed)",7.068331
63,"(trauma, tis)",7.067245
72,"(Full, read)",6.919996


In [57]:
trigramTtable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t']).sort_values(by='t', ascending=False)

In [58]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"(-PRON-, do, not)",12.502202
1,"(be, go, to)",10.072797
2,"(-PRON-, be, a)",9.795103
3,"(-PRON-, be, not)",9.7428
4,"(-PRON-, can, not)",9.628492


In [59]:
filteredT_tri = trigramTtable[trigramTtable.trigram.map(lambda x: rightTypesTri(x))]

In [60]:
filteredT_tri.head(10)

Unnamed: 0,trigram,t
13,"(trauma, tis, ed)",7.071066
20,"(PKK, suicide, bomber)",6.480738
26,"(old, PKK, suicide)",6.40312
27,"(pre, break, good)",6.403113
38,"(MH, 370, Malaysia)",5.999996
42,"(Turkey, Army, trench)",5.830951
45,"(airport, get, swallow)",5.830945
71,"(Army, trench, release)",5.656853
75,"(Obama, declare, disaster)",5.65685
95,"(Quarantine, Offensive, Content)",5.291502


In [61]:

t_bi = filteredT_bi[:20].bigram.values

In [62]:
t_tri = filteredT_tri[:20].trigram.values

In [63]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [64]:
bigramChiTable.head(20)

Unnamed: 0,bigram,chi-sq
0,"(Madhya, Pradesh)",175710.0
1,"(GB, BO)",167722.227159
2,"(Quarantine, Offensive)",163994.133036
3,"(Pamela, Geller)",160429.04326
4,"(Offensive, Content)",158703.096342
5,"(MH, 370)",152422.109417
6,"(Fun, tenn)",142759.499279
7,"(Stock, Market)",129382.852949
8,"(tis, ed)",129185.29035
9,"(Temple, Mount)",124172.999145


In [65]:
trigramChiTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [66]:
trigramChiTable.head(20)

Unnamed: 0,trigram,chi-sq
0,"(Quarantine, Offensive, Content)",929539900.0
1,"(anthrax, lab, mishap)",626353600.0
2,"(Stock, Market, Crash)",445786400.0
3,"(Salt, River, Wild)",405984100.0
4,"(Turkey, Army, trench)",315313600.0
5,"(River, Wild, Horse)",280268500.0
6,"(wreckage, conclusively, confirm)",251340300.0
7,"(trauma, tis, ed)",238962900.0
8,"(hot, Fun, tenn)",201675100.0
9,"(Malaysia, PM, Investigators)",196549200.0


In [67]:
chi_bi = bigramChiTable[:20].bigram.values

In [68]:
chi_tri = trigramChiTable[:20].trigram.values

In [69]:
bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [70]:
bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"(-PRON-, be)",3163.770741
1,"(do, not)",2927.755745
2,"(MH, 370)",1517.501582
3,"(can, not)",1190.329246
4,"(suicide, bomber)",1107.134222


In [71]:

filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]

In [72]:
filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
4,"(suicide, bomber)",1107.134222
6,"(tis, ed)",837.845125
10,"(trauma, tis)",785.007804
17,"(YouTube, video)",674.19621
19,"(Northern, California)",671.968048
20,"(Cross, Body)",664.626077
23,"(mass, murderer)",620.140891
24,"(loud, bang)",614.354159
25,"(Full, read)",606.962473
27,"(mass, murder)",596.386279


In [73]:
trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [74]:
trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(-PRON-, be, not)",6830.205838
1,"(-PRON-, do, not)",6075.703805
2,"(if, -PRON-, be)",6022.415118
3,"(when, -PRON-, be)",5758.108735
4,"(-PRON-, be, a)",5572.981828


In [75]:
filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]

In [76]:
filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio
33,"(MH, 370, Malaysia)",2995.523069
36,"(PKK, suicide, bomber)",2625.816507
37,"(trauma, tis, ed)",2434.293229
49,"(Obama, declare, disaster)",1767.82301
50,"(old, PKK, suicide)",1765.820099
51,"(Turkey, Army, trench)",1714.724621
57,"(Cross, Body, Bag)",1624.771643
62,"(pre, break, good)",1596.264121
63,"(Quarantine, Offensive, Content)",1585.431687
64,"(Obama, sign, disaster)",1551.781748


In [77]:

lik_bi = filteredLik_bi[:20].bigram.values

In [78]:
lik_tri = filteredLik_tri[:20].trigram.values

In [79]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

In [80]:
bigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [81]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(-PRON-, -PRON-)","(Madhya, Pradesh)","(suicide, bomber)","(Madhya, Pradesh)","(suicide, bomber)"
1,"(-PRON-, think)","(GB, BO)","(-PRON-, think)","(GB, BO)","(tis, ed)"
2,"(suicide, bomber)","(Pamela, Geller)","(YouTube, video)","(Quarantine, Offensive)","(trauma, tis)"
3,"(-PRON-, feel)","(Virgin, galactic)","(-PRON-, feel)","(Pamela, Geller)","(YouTube, video)"
4,"(think, -PRON-)","(Temple, Mount)","(burn, building)","(Offensive, Content)","(Northern, California)"
5,"(YouTube, video)","(Quarantine, Offensive)","(Northern, California)","(MH, 370)","(Cross, Body)"
6,"(burn, building)","(Offensive, Content)","(think, -PRON-)","(Fun, tenn)","(mass, murderer)"
7,"(-PRON-, S)","(Fun, tenn)","(tis, ed)","(Stock, Market)","(loud, bang)"
8,"(Northern, California)","(Stock, Market)","(trauma, tis)","(tis, ed)","(Full, read)"
9,"(tis, ed)","(recount, horror)","(Full, read)","(Temple, Mount)","(mass, murder)"


In [82]:
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

In [83]:
trigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [84]:
trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(trauma, tis, ed)","(Quarantine, Offensive, Content)","(trauma, tis, ed)","(Quarantine, Offensive, Content)","(MH, 370, Malaysia)"
1,"(PKK, suicide, bomber)","(anthrax, lab, mishap)","(PKK, suicide, bomber)","(anthrax, lab, mishap)","(PKK, suicide, bomber)"
2,"(old, PKK, suicide)","(Stock, Market, Crash)","(old, PKK, suicide)","(Stock, Market, Crash)","(trauma, tis, ed)"
3,"(pre, break, good)","(Salt, River, Wild)","(pre, break, good)","(Salt, River, Wild)","(Obama, declare, disaster)"
4,"(MH, 370, Malaysia)","(River, Wild, Horse)","(MH, 370, Malaysia)","(Turkey, Army, trench)","(old, PKK, suicide)"
5,"(Turkey, Army, trench)","(Turkey, Army, trench)","(Turkey, Army, trench)","(River, Wild, Horse)","(Turkey, Army, trench)"
6,"(airport, get, swallow)","(Malaysia, PM, Investigators)","(airport, get, swallow)","(wreckage, conclusively, confirm)","(Cross, Body, Bag)"
7,"(Obama, declare, disaster)","(hot, Fun, tenn)","(Army, trench, release)","(trauma, tis, ed)","(pre, break, good)"
8,"(Army, trench, release)","(horrible, sub, reddit)","(Obama, declare, disaster)","(hot, Fun, tenn)","(Quarantine, Offensive, Content)"
9,"(Quarantine, Offensive, Content)","(wreckage, conclusively, confirm)","(Quarantine, Offensive, Content)","(Malaysia, PM, Investigators)","(Obama, sign, disaster)"
