## Import Dependencies

In [1]:
import re 
import pandas as pd
import requests
from bs4 import BeautifulSoup
import html2text
import stanza
import spacy
from spacy_stanza import StanzaLanguage
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import wordninja

In [2]:
# Load the CSV and keep only the raw `text` column.
data = pd.read_csv('data - data.csv', memory_map=True)
# Take an explicit copy: assigning a new column to a column-subset of another
# frame otherwise raises pandas' SettingWithCopyWarning.
data = data[['text']].copy()
# `clean_text` starts as a duplicate of the raw text and is cleaned step by step below.
data['clean_text'] = data['text']

In [3]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)
# Drop non-ASCII characters from every row of clean_text.
data.clean_text = data.clean_text.map(_removeNonAscii)


In [4]:
# remove special characters, numbers, punctuations
import contractions

# Drop URLs: every run of non-whitespace characters that starts with "http".
without_urls = data['clean_text'].apply(lambda text: re.sub(r'http\S+', '', text))
# Run contractions.fix on each whitespace-separated word; every row becomes a
# list of words (joined back into a string in the next cell).
data['clean_text'] = without_urls.apply(
    lambda text: [contractions.fix(token) for token in text.split()])


In [5]:
# Turn each row's list of words back into a single space-separated string.
data['clean_text'] = data['clean_text'].apply(' '.join)

In [6]:
# Replace every character that is not an ASCII letter or digit with a space.
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text unchanged.
data['clean_text'] = data['clean_text'].str.replace(r"[^a-zA-Z0-9]", " ", regex=True)

In [7]:
# data.clean_text = data.clean_text.apply(lambda x: [wordninja.split(word) for word in x.split()])

In [8]:
import spacy
# Load the large English spaCy pipeline with the 'parser' and 'ner' components disabled.
# NOTE(review): `nlp` is reassigned in the LDA section (full pipeline) and does not
# appear to be used in between — confirm this cell is still needed.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

In [9]:
data.clean_text

0                       Just happened a terrible car crash
1        Our Deeds are the Reason of this  earthquake M...
2        Heard about  earthquake is different cities  s...
3        there is a forest fire at spot pond  geese are...
4                   Forest fire near La Ronge Sask  Canada
                               ...                        
10871           M1 94  01 04 UTC  5km S of Volcano Hawaii 
10872    Police investigating after an e bike collided ...
10873    The Latest  More Homes Razed by Northern Calif...
10874           MEG issues Hazardous Weather Outlook  HWO 
10875     CityofCalgary has activated its Municipal Eme...
Name: clean_text, Length: 10876, dtype: object

In [10]:
#turn all comments' tokens into one single list
# NOTE: this overwrites clean_text (strings) with lists of tokens, so the cell
# cannot be re-run on its own output; the LDA section joins the lists back.
data.clean_text = data.clean_text.apply(lambda comment: comment.split())
unlist_comments = []
for comment_tokens in data.clean_text:
    unlist_comments.extend(comment_tokens)

In [11]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Stand-alone flair check: tag one hard-coded sentence with the pre-trained 'ner' model.
# NOTE(review): this cell does not read `data` and its result is not used later — confirm it is still wanted.
sentence = Sentence('SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDICATING L/C NO. AND CONTRACT NO. 25162373 AND/OR 25162792')
tagger = SequenceTagger.load('ner')
tagger.predict(sentence)

2020-05-21 13:04:03,735 loading file /Users/subir/.flair/models/en-ner-conll03-v0.4.pt




[Sentence: "SIGNED COMMERCIAL INVOICE IN 3 ORIGINALS INDICATING L/C NO. AND CONTRACT NO. 25162373 AND/OR 25162792" - 15 Tokens]

In [16]:
# Tags I've chosen for relations
RELATION_DEPS = ["ROOT", "adj", "attr", "agent", "amod"]

# Tags I've chosen for entities(subjects and objects)
ENTITY_DEPS = ["compound", "prep", "conj", "mod"]

# Backward-compatible alias. The original cell assigned both lists to `deps`, so
# the relation tags were overwritten and only the entity tags survived; `deps`
# keeps that final value while both lists are now available under their own names.
deps = ENTITY_DEPS

In [17]:
def processSubjectObjectPairs(tokens):
    """Scan `tokens` once and build a (subject, relation, object) triple of strings.

    Each token is passed to `printToken`; tokens whose `dep_` contains "punct"
    are otherwise ignored. For the remaining tokens:
      * relation candidates contribute their `lemma_` to the relation,
      * tokens whose `dep_` contains "subj" / "obj" contribute their `text`
        to the subject / object.

    Relies on `printToken`, `isRelationCandidate`, `isConstructionCandidate`
    and `appendChunk`, which are defined outside this cell.

    Returns a tuple of three stripped strings.
    """
    subj, rel, obj = '', '', ''
    # NOTE(review): both prefixes start empty and are only extended when already
    # non-empty, so inside this function they never receive any text — confirm
    # whether construction candidates were meant to seed them.
    subj_prefix, obj_prefix = '', ''

    for tok in tokens:
        printToken(tok)
        if "punct" not in tok.dep_:
            if isRelationCandidate(tok):
                rel = appendChunk(rel, tok.lemma_)
            if isConstructionCandidate(tok):
                if subj_prefix:
                    subj_prefix = appendChunk(subj_prefix, tok.text)
                if obj_prefix:
                    obj_prefix = appendChunk(obj_prefix, tok.text)
            if "subj" in tok.dep_:
                # extend the subject, then put the pending prefix in front of it
                subj = appendChunk(subj_prefix, appendChunk(subj, tok.text))
                subj_prefix = ''
            if "obj" in tok.dep_:
                obj = appendChunk(obj_prefix, appendChunk(obj, tok.text))
                obj_prefix = ''

    return (subj.strip(), rel.strip(), obj.strip())

In [32]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    """Return the lower-cased words of `text` that qualify as keyword candidates.

    A word qualifies when its NLTK POS tag is in `good_tags`, its lower-cased
    form is not an English stop word, and it is not made up entirely of
    punctuation characters. Words are returned in text order, duplicates included.
    """
    import itertools, nltk, string

    punctuation_chars = set(string.punctuation)
    english_stops = set(nltk.corpus.stopwords.words('english'))

    # sentence-split, word-tokenize and POS-tag, then flatten to (word, tag) pairs
    tokenized_sents = (nltk.word_tokenize(sentence)
                       for sentence in nltk.sent_tokenize(text))
    tagged_stream = itertools.chain.from_iterable(nltk.pos_tag_sents(tokenized_sents))

    def _qualifies(token, tag):
        if tag not in good_tags:
            return False
        if token.lower() in english_stops:
            return False
        return not all(ch in punctuation_chars for ch in token)

    return [token.lower() for token, tag in tagged_stream if _qualifies(token, tag)]

def score_keyphrases_by_textrank(text, n_keywords=0.05):
    """Rank keyphrases of `text` by the average PageRank score of their words.

    Candidate words (from `extract_candidate_words`) become graph nodes and
    consecutive candidates are linked by an unweighted edge. The top
    `n_keywords` nodes by PageRank are the keywords; a value strictly between
    0 and 1 is read as a fraction of the number of candidates. Runs of up to
    10 consecutive keywords in the text are merged into one keyphrase.

    Returns a list of (keyphrase, score) tuples, highest score first.
    """
    from itertools import takewhile
    import networkx, nltk

    # every lower-cased token of the text, in reading order
    words = []
    for sent in nltk.sent_tokenize(text):
        words.extend(token.lower() for token in nltk.word_tokenize(sent))
    candidates = extract_candidate_words(text)

    # one node per distinct candidate, one edge per pair of neighbouring candidates
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    for first, second in zip(candidates, candidates[1:]):
        if second:
            graph.add_edge(*sorted([first, second]))

    # PageRank every node and keep the best-scoring n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    best_first = sorted(ranks.items(), key=lambda item: item[1], reverse=True)
    word_ranks = dict(best_first[:n_keywords])
    keywords = set(word_ranks)

    # merge adjacent keywords into keyphrases; jumping past each run keeps
    # the merged phrases non-overlapping
    keyphrases = {}
    pos = 0
    while pos < len(words):
        if words[pos] not in keywords:
            pos += 1
            continue
        run = list(takewhile(lambda w: w in keywords, words[pos:pos+10]))
        keyphrases[' '.join(run)] = sum(word_ranks[w] for w in run) / float(len(run))
        pos += len(run)

    return sorted(keyphrases.items(), key=lambda item: item[1], reverse=True)

In [18]:
def printGraph(triples):
    """Draw the graph formed by `triples` with matplotlib.

    Each triple contributes its first three items as nodes, with an edge from
    item 0 to item 1 and from item 1 to item 2. Nodes are labelled with their
    own value and placed with a spring layout. Shows the figure; returns None.
    """
    # Imported locally: no earlier cell binds `nx`, and `plt` is only imported
    # much further down, so the function would otherwise raise NameError.
    import networkx as nx
    import matplotlib.pyplot as plt

    G = nx.Graph()
    for triple in triples:
        # add_edge creates missing endpoints, so the nodes need no separate add_node
        G.add_edge(triple[0], triple[1])
        G.add_edge(triple[1], triple[2])

    pos = nx.spring_layout(G)
    plt.figure()
    nx.draw(G, pos, edge_color='black', width=1, linewidths=1,
            node_size=500, node_color='seagreen', alpha=0.9,
            labels={node: node for node in G.nodes()})
    plt.axis('off')
    plt.show()

In [33]:
score_keyphrases_by_textrank(' '.join(unlist_comments))

[('fire', 0.0033978901367757676),
 ('amp', 0.0032472202099448483),
 ('new', 0.0030228625363960732),
 ('fire news', 0.002946977877757825),
 ('people', 0.0029177301277455844),
 ('amp rt', 0.0027482843821168045),
 ('fire video', 0.002612043212407895),
 ('news', 0.0024960656187398827),
 ('good fire', 0.0024428602053214368),
 ('fire way', 0.0024413132872876143),
 ('new video', 0.0024245294122180482),
 ('world amp', 0.0023609788937145565),
 ('fire house today amp', 0.002349924225280915),
 ('fire buildings', 0.0023396379272937126),
 ('hours amp amp', 0.0023261699526597266),
 ('years amp', 0.0022499914891177817),
 ('rt', 0.0022493485542887603),
 ('new world', 0.002248800056940169),
 ('new post people', 0.0021624901576356745),
 ('amp w', 0.0021057650627351247),
 ('fire right', 0.0021053465965199425),
 ('city amp', 0.0021032142238301558),
 ('amp best', 0.002087895392002288),
 ('god amp', 0.002081852623362918),
 ('hot amp', 0.0020750184622855836),
 ('people dead', 0.002052317097182301),
 ('dead p

> We will then use NLTK’s tools to generate all possible bigrams and trigrams

In [34]:
import nltk
# Collocation finders built from the flat token list of all comments.
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)
# Association-measure objects used below to score the n-grams.
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

## Methods to Rank Collocations

### Counting frequencies of adjacent words with part of speech filters:

In [35]:
bigram_freq = bigramFinder.ngram_fd.items()

In [36]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [37]:
bigramFreqTable.reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"(I, am)",478
1,"(in, the)",366
2,"(of, the)",333
3,"(it, is)",304
4,"(do, not)",287
...,...,...
98201,"(yet, justicemalala)",1
98202,"(Ahrar, not)",1
98203,"(and, Ahrar)",1
98204,"(Nusrah, and)",1


In [38]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
148,"(I, am)",478
161,"(in, the)",366
152,"(of, the)",333
144,"(it, is)",304
286,"(do, not)",287
29,"(is, a)",176
2652,"(to, the)",162
752,"(to, be)",162
471,"(on, the)",160
933,"(in, a)",158


The issue with raw frequency is that pairs involving adjacent spaces, stop words, articles, prepositions, or pronouns are very common but not meaningful.
To fix this, we keep only collocations that contain no stop words and that match one of the following structures:
  
  - **Bigrams: (Noun, Noun), (Adjective, Noun)** \
  
  - **Trigrams: (Adjective/Noun, Anything, Adjective/Noun)** \
 
 *This is a common structure used in literature and generally works well* 

In [39]:
en_stopwords = set(stopwords.words('english'))
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    """Return True when the bigram `ngram` has an (adjective-or-noun, noun) shape.

    Rejected outright: n-grams containing '-pron-', an empty string, a single
    space or the token 't', and n-grams containing any word in `en_stopwords`.
    Otherwise the words are POS-tagged with NLTK; the first tag must be an
    adjective or noun tag and the second a noun tag.
    """
    junk_tokens = ('-pron-', '', ' ', 't')
    if any(junk in ngram for junk in junk_tokens):
        return False
    if any(word in en_stopwords for word in ngram):
        return False

    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    adj_or_noun_tags = ('JJ', 'JJR', 'JJS') + noun_tags
    tagged = nltk.pos_tag(ngram)
    return tagged[0][1] in adj_or_noun_tags and tagged[1][1] in noun_tags

In [40]:
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [41]:
filtered_bi[:10]

Unnamed: 0,bigram,freq
3988,"(YouTube, video)",61
13425,"(suicide, bomber)",58
5332,"(Northern, California)",53
16965,"(burning, buildings)",50
4519,"(Full, read)",48
15787,"(mass, murder)",44
6155,"(suicide, bombing)",44
13722,"(Cross, Body)",43
83579,"(PKK, suicide)",42
83578,"(old, PKK)",41


In [42]:

trigram_freq = trigramFinder.ngram_fd.items()

In [43]:
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [44]:
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"(I, do, not)",68
1,"(liked, a, YouTube)",61
2,"(a, YouTube, video)",61
3,"(I, liked, a)",61
4,"(I, can, not)",47


In [45]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
294,"(I, do, not)",68
4424,"(liked, a, YouTube)",61
4425,"(a, YouTube, video)",61
4423,"(I, liked, a)",61
319,"(I, can, not)",47
106877,"(bomber, who, detonated)",42
106879,"(detonated, bomb, in)",42
106878,"(who, detonated, bomb)",42
106876,"(suicide, bomber, who)",42
106875,"(PKK, suicide, bomber)",42


In [46]:
def rightTypesTri(ngram):
    """Return True when the trigram `ngram` starts and ends with an adjective or noun.

    Rejected outright: n-grams containing '-pron-', an empty string, one or two
    spaces, or the token 't', and n-grams containing any word in `en_stopwords`.
    Otherwise the words are POS-tagged with NLTK and only the first and third
    tags are checked; the middle word may be anything.
    """
    junk_tokens = ('-pron-', '', ' ', '  ', 't')
    if any(junk in ngram for junk in junk_tokens):
        return False
    if any(word in en_stopwords for word in ngram):
        return False

    adj_or_noun_tags = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tagged = nltk.pos_tag(ngram)
    return tagged[0][1] in adj_or_noun_tags and tagged[2][1] in adj_or_noun_tags

In [47]:
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [48]:
filtered_tri[:10]

Unnamed: 0,trigram,freq
106875,"(PKK, suicide, bomber)",42
106874,"(old, PKK, suicide)",41
99025,"(Airport, Get, Swallowed)",34
106890,"(Turkey, Army, trench)",34
126117,"(MH370, Malaysia, PM)",32
47495,"(Obama, Declares, Disaster)",30
47160,"(Knock, Detonation, Sensor)",28
47159,"(Ignition, Knock, Detonation)",28
90398,"(Refugio, oil, spill)",27
97944,"(Are, There, Gems)",26


## PMI 
-  **The main intuition is that it measures how much more likely the words co-occur than if they were independent.**

In [49]:
freq_bi = filtered_bi[:20].bigram.values

In [50]:
freq_tri = filtered_tri[:20].trigram.values

In [51]:
bigramFinder.apply_freq_filter(20)

In [52]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [53]:
bigramPMITable[:10]

Unnamed: 0,bigram,PMI
0,"(Madhya, Pradesh)",12.818283
1,"(Pamela, Geller)",12.818283
2,"(Ignition, Knock)",12.470359
3,"(Virgin, Galactic)",12.470359
4,"(subreddits, banned)",12.461426
5,"(Christian, Attacked)",12.455713
6,"(Temple, Mount)",12.441213
7,"(Offensive, Content)",12.36315
8,"(Work, Light)",12.285935
9,"(Quarantine, Offensive)",12.23332


In [54]:
trigramFinder.apply_freq_filter(20)

In [55]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [56]:
trigramPMITable[:25]

Unnamed: 0,trigram,PMI
0,"(anthrax, lab, mishaps)",24.632596
1,"(Quarantine, Offensive, Content)",24.59647
2,"(Waving, Israeli, Flag)",24.533061
3,"(Ignition, Knock, Detonation)",24.500146
4,"(horrible, subreddits, banned)",24.453738
5,"(Knock, Detonation, Sensor)",24.44952
6,"(Funtenna, hijacking, computers)",24.375519
7,"(Wreckage, Conclusively, Confirmed)",24.260195
8,"(transport, bioterror, germs)",24.073629
9,"(River, Wild, Horse)",23.818942


In [57]:
pmi_bi = bigramPMITable[:20].bigram.values
pmi_tri = trigramPMITable[:20].trigram.values

## T-test
- **T-test has been criticized as it assumes normal distribution. Therefore, we will also look into the chi-square test.**

In [58]:
bigramTtable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

In [59]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"(I, am)",21.51991
1,"(it, is)",16.789338
2,"(do, not)",16.748021
3,"(in, the)",16.073035
4,"(of, the)",15.095408


In [60]:
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]

In [61]:
filteredT_bi[:10]

Unnamed: 0,bigram,t
30,"(YouTube, video)",7.7955
35,"(suicide, bomber)",7.606961
44,"(Northern, California)",7.268854
49,"(burning, buildings)",7.056831
55,"(Full, read)",6.920995
60,"(mass, murder)",6.627967
61,"(suicide, bombing)",6.622073
67,"(Cross, Body)",6.55382
70,"(PKK, suicide)",6.475081
76,"(old, PKK)",6.398089


In [62]:
trigramTtable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t']).sort_values(by='t', ascending=False)

In [63]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"(I, do, not)",8.240293
1,"(liked, a, YouTube)",7.81014
2,"(a, YouTube, video)",7.810002
3,"(I, liked, a)",7.808226
4,"(I, can, not)",6.850316


In [64]:
filteredT_tri = trigramTtable[trigramTtable.trigram.map(lambda x: rightTypesTri(x))]

In [65]:
filteredT_tri.head(10)

Unnamed: 0,trigram,t
5,"(PKK, suicide, bomber)",6.480738
11,"(old, PKK, suicide)",6.40312
17,"(Airport, Get, Swallowed)",5.830951
18,"(Turkey, Army, trench)",5.830951
44,"(MH370, Malaysia, PM)",5.656851
55,"(Obama, Declares, Disaster)",5.477224
67,"(Ignition, Knock, Detonation)",5.291502
68,"(Knock, Detonation, Sensor)",5.291502
73,"(Refugio, oil, spill)",5.196152
89,"(Funtenna, hijacking, computers)",5.099019


In [66]:

t_bi = filteredT_bi[:20].bigram.values

In [67]:
t_tri = filteredT_tri[:20].trigram.values

## Chi-Square
- **The chi-square test, like the t-test, assumes under the null hypothesis that the words are independent.**

In [68]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [69]:
bigramChiTable.head(20)

Unnamed: 0,bigram,chi-sq
0,"(Ignition, Knock)",158895.0
1,"(Madhya, Pradesh)",158895.0
2,"(Pamela, Geller)",151671.545328
3,"(Conclusively, Confirmed)",144155.926871
4,"(Offensive, Content)",131705.63589
5,"(Virgin, Galactic)",130516.78512
6,"(subreddits, banned)",129710.84194
7,"(costlier, bigger)",126175.763761
8,"(Quarantine, Offensive)",125184.483946
9,"(Stock, Market)",124314.786052


In [70]:
trigramChiTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [71]:
trigramChiTable.head(20)

Unnamed: 0,trigram,chi-sq
0,"(Ignition, Knock, Detonation)",664411100.0
1,"(Knock, Detonation, Sensor)",641500400.0
2,"(Quarantine, Offensive, Content)",634184100.0
3,"(anthrax, lab, mishaps)",624254600.0
4,"(Wreckage, Conclusively, Confirmed)",602792300.0
5,"(Funtenna, hijacking, computers)",565895000.0
6,"(horrible, subreddits, banned)",528489700.0
7,"(Waving, Israeli, Flag)",509807700.0
8,"(transport, bioterror, germs)",423737400.0
9,"(Airport, Get, Swallowed)",366845800.0


In [72]:
chi_bi = bigramChiTable[:20].bigram.values

In [73]:
chi_tri = trigramChiTable[:20].trigram.values

## Likelihood

In [74]:
bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [75]:
bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"(I, am)",3923.840593
1,"(do, not)",2305.536272
2,"(it, is)",1566.415894
3,"(can, not)",1061.803191
4,"(going, to)",1035.233462


In [76]:

filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]

In [77]:
filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
7,"(suicide, bomber)",754.289677
8,"(YouTube, video)",709.281722
13,"(Northern, California)",644.891626
14,"(Cross, Body)",642.957271
17,"(Full, read)",632.992075
19,"(loud, bang)",624.695035
20,"(mass, murder)",612.350282
21,"(oil, spill)",596.703267
24,"(PKK, suicide)",585.394848
25,"(old, PKK)",579.265825


In [78]:
trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [79]:
trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(I, am, not)",6869.440972
1,"(I, am, a)",6270.92788
2,"(I, am, going)",6170.728109
3,"(and, I, am)",6056.168759
4,"(I, am, so)",6049.362832


In [80]:
filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]

In [81]:
filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio
13,"(PKK, suicide, bomber)",2145.188416
24,"(old, PKK, suicide)",1786.769136
26,"(Airport, Get, Swallowed)",1759.738765
29,"(Turkey, Army, trench)",1694.202621
35,"(Refugio, oil, spill)",1610.54965
38,"(Cross, Body, Bag)",1569.34479
40,"(Ignition, Knock, Detonation)",1554.440561
43,"(Knock, Detonation, Sensor)",1541.391192
52,"(Northern, California, Wildfire)",1474.798483
53,"(MH370, Malaysia, PM)",1456.635142


In [82]:

lik_bi = filteredLik_bi[:20].bigram.values

In [83]:
lik_tri = filteredLik_tri[:20].trigram.values

## Different Approach Comparison

In [84]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

In [85]:
# Column labels match the trigram comparison table (fixes the 'Likeihood' typo).
bigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likelihood Ratio Test With Filter']

In [86]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(YouTube, video)","(Madhya, Pradesh)","(YouTube, video)","(Ignition, Knock)","(suicide, bomber)"
1,"(suicide, bomber)","(Pamela, Geller)","(suicide, bomber)","(Madhya, Pradesh)","(YouTube, video)"
2,"(Northern, California)","(Ignition, Knock)","(Northern, California)","(Pamela, Geller)","(Northern, California)"
3,"(burning, buildings)","(Virgin, Galactic)","(burning, buildings)","(Conclusively, Confirmed)","(Cross, Body)"
4,"(Full, read)","(subreddits, banned)","(Full, read)","(Offensive, Content)","(Full, read)"
5,"(mass, murder)","(Christian, Attacked)","(mass, murder)","(Virgin, Galactic)","(loud, bang)"
6,"(suicide, bombing)","(Temple, Mount)","(suicide, bombing)","(subreddits, banned)","(mass, murder)"
7,"(Cross, Body)","(Offensive, Content)","(Cross, Body)","(costlier, bigger)","(oil, spill)"
8,"(PKK, suicide)","(Work, Light)","(PKK, suicide)","(Quarantine, Offensive)","(PKK, suicide)"
9,"(old, PKK)","(Quarantine, Offensive)","(old, PKK)","(Stock, Market)","(old, PKK)"


In [87]:
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

In [88]:
trigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likelihood Ratio Test With Filter']

In [89]:
trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likelihood Ratio Test With Filter
0,"(PKK, suicide, bomber)","(anthrax, lab, mishaps)","(PKK, suicide, bomber)","(Ignition, Knock, Detonation)","(PKK, suicide, bomber)"
1,"(old, PKK, suicide)","(Quarantine, Offensive, Content)","(old, PKK, suicide)","(Knock, Detonation, Sensor)","(old, PKK, suicide)"
2,"(Airport, Get, Swallowed)","(Waving, Israeli, Flag)","(Airport, Get, Swallowed)","(Quarantine, Offensive, Content)","(Airport, Get, Swallowed)"
3,"(Turkey, Army, trench)","(Ignition, Knock, Detonation)","(Turkey, Army, trench)","(anthrax, lab, mishaps)","(Turkey, Army, trench)"
4,"(MH370, Malaysia, PM)","(horrible, subreddits, banned)","(MH370, Malaysia, PM)","(Wreckage, Conclusively, Confirmed)","(Refugio, oil, spill)"
5,"(Obama, Declares, Disaster)","(Knock, Detonation, Sensor)","(Obama, Declares, Disaster)","(Funtenna, hijacking, computers)","(Cross, Body, Bag)"
6,"(Knock, Detonation, Sensor)","(Funtenna, hijacking, computers)","(Ignition, Knock, Detonation)","(horrible, subreddits, banned)","(Ignition, Knock, Detonation)"
7,"(Ignition, Knock, Detonation)","(Wreckage, Conclusively, Confirmed)","(Knock, Detonation, Sensor)","(Waving, Israeli, Flag)","(Knock, Detonation, Sensor)"
8,"(Refugio, oil, spill)","(transport, bioterror, germs)","(Refugio, oil, spill)","(transport, bioterror, germs)","(Northern, California, Wildfire)"
9,"(Are, There, Gems)","(River, Wild, Horse)","(Funtenna, hijacking, computers)","(Airport, Get, Swallowed)","(MH370, Malaysia, PM)"


- We can see that PMI and chi-square methods give pretty good results even without applying filters.
- Frequency and T-test methods are also similar to each other. 
- In real applications, we can eyeball the list and set a threshold at the point where the results stop making sense.

In [90]:
# Merge the trigram columns into one de-duplicated collection that preserves
# first-seen order, visiting the columns in COL_ORDER.
COL_ORDER = ['PMI', 'Chi-Sq Test','Likelihood Ratio Test With Filter', 'T-test With Filter', 'Frequency With Filter']
from ordered_set import OrderedSet
trigram_set = OrderedSet()
for col in COL_ORDER:
    # plain loop instead of a list comprehension that was run only for its side effects
    for trigram in trigramsCompare[col].values:
        trigram_set.add(trigram)

In [91]:
[' '.join(x) for x in list(trigram_set)][:20]

['anthrax lab mishaps',
 'Quarantine Offensive Content',
 'Waving Israeli Flag',
 'Ignition Knock Detonation',
 'horrible subreddits banned',
 'Knock Detonation Sensor',
 'Funtenna hijacking computers',
 'Wreckage Conclusively Confirmed',
 'transport bioterror germs',
 'River Wild Horse',
 'Salt River Wild',
 'Stock Market Crash',
 'Airport Get Swallowed',
 'Typhoon Devastated Saipan',
 'Refugio oil spill',
 'waves Black Hat',
 'content policy goes',
 'Now Quarantine Offensive',
 'hot Funtenna hijacking']

## LDA

In [92]:
from gensim import models,corpora
import spacy
import gensim

nlp = spacy.load('en_core_web_lg')

In [93]:
# Join each row's token list back into one space-separated string.
data.clean_text = data.clean_text.apply(' '.join)

In [94]:
data

Unnamed: 0,text,clean_text
0,Just happened a terrible car crash,Just happened a terrible car crash
1,Our Deeds are the Reason of this #earthquake M...,Our Deeds are the Reason of this earthquake Ma...
2,"Heard about #earthquake is different cities, s...",Heard about earthquake is different cities sta...
3,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...
4,Forest fire near La Ronge Sask. Canada,Forest fire near La Ronge Sask Canada
...,...,...
10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,M1 94 01 04 UTC 5km S of Volcano Hawaii
10872,Police investigating after an e-bike collided ...,Police investigating after an e bike collided ...
10873,The Latest: More Homes Razed by Northern Calif...,The Latest More Homes Razed by Northern Califo...
10874,MEG issues Hazardous Weather Outlook (HWO) htt...,MEG issues Hazardous Weather Outlook HWO


In [95]:
def clean_up(text):
    """Return the lemmas of the tokens of `text` kept for topic modelling.

    `text` is parsed with the module-level spaCy pipeline `nlp`. A token is
    kept when it is not a stop word, is alphabetic, is longer than two
    characters, and its coarse POS tag is not in `removal`.
    """
    removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE']
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop
        and token.is_alpha
        and len(token) > 2
        and token.pos_ not in removal
    ]
    
# One list of lemmas per row, computed from the raw `text` column.
datalist = data.text.apply(clean_up)

In [96]:
dictionary = corpora.Dictionary(datalist)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in datalist]

In [97]:
num_topics =12
Lda = models.LdaMulticore
lda= Lda(doc_term_matrix, num_topics=num_topics,id2word = dictionary, 
         passes=20,chunksize=2000,random_state=3)

## Optimize # of k topics

In [99]:
# Train one LDA model per candidate topic count k and record its c_v coherence
# as (k, score) pairs.
coherence = []
Lda = models.LdaMulticore  # loop-invariant alias, hoisted out of the loop
for k in range(1,20):
    print('Round: '+str(k))
    # random_state fixes the seed (same value as the first LDA model above) so
    # the sweep is comparable between runs; multi-core training can still
    # introduce some run-to-run variation.
    ldamodel = Lda(doc_term_matrix, num_topics=k, id2word = dictionary, passes=20, iterations=100,  chunksize = 2000, eval_every = 10, random_state=3)
    cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=datalist, dictionary=dictionary, coherence='c_v')
    coherence.append((k,cm.get_coherence()))

Round: 1




KeyboardInterrupt: 

In [None]:
x_val = [x[0] for x in coherence]
y_val = [x[1] for x in coherence]

In [None]:
import matplotlib.pyplot as plt 
plt.plot(x_val,y_val)
plt.scatter(x_val,y_val)
plt.title('Number of Topics vs. Coherence')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence')
plt.xticks(x_val)
plt.show()

In [None]:
# Final model: the topic count is set once here and reused below instead of
# repeating the literal.
num_topics = 8
Lda = models.LdaMulticore
lda = Lda(doc_term_matrix,
          num_topics=num_topics,
          id2word=dictionary,
          passes=20,
          iterations=100,
          chunksize=10000,
          eval_every=10,
          random_state=3)  # fixed seed, same value as the first LDA model above

In [None]:
# To show initial topics
# Inspect the final model `lda`; `ldamodel` is only the last model left over
# from the coherence sweep.
lda.show_topics(20, num_words=3, formatted=False)

In [None]:
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
topic_data =  pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, mds = 'tsne')
pyLDAvis.display(topic_data)

In [None]:

# For every topic, rank its terms by relevance = (1 - lambd) * loglift + lambd * logprob
# and keep the 20 highest-ranked terms.
all_topics = {}
lambd = 0.5  # Adjust this accordingly
# Topic categories are numbered from 1, so the range must end at num_topics + 1;
# the previous range(1,8) skipped the last of the 8 topics.
for i in range(1, num_topics + 1):
    topic = topic_data.topic_info[topic_data.topic_info.Category == 'Topic'+str(i)]
    # assign() returns a new frame, avoiding a write into a filtered slice
    topic = topic.assign(relevance=topic['loglift']*(1-lambd)+topic['logprob']*lambd)
    all_topics['Topic '+str(i)] = topic.sort_values(by='relevance', ascending=False).Term[:20].values

In [None]:
all_topics['Topic 1']

In [None]:
pd.DataFrame(all_topics).T