## Assignment Alternus Vera - Fake News Classifier

#### Team Psychic Pandas

#### Factor for analysing fake news - Sensationalism

In [None]:
import pandas as pd
import numpy as np
import requests
import bs4
from bs4 import BeautifulSoup
import json
import re

## Reading train and test data

In [2]:
DATA_FOLDER = '/Users/andrew/Documents/MS - SJSU/Fall Sem 2018/CMPE 257 - Machine Learning/liar_dataset'
TRAIN_PATH = '/train.tsv'
TEST_PATH = '/test.tsv'
train_df_1 = pd.read_csv(DATA_FOLDER+TRAIN_PATH,sep='\t', header = None)
test_df = pd.read_csv(DATA_FOLDER+TEST_PATH,sep='\t', header = None)

### Naming the columns

In [3]:
train_df_1.columns = ["id", "label", "statement", "subject", "speaker", "speaker_title", "State", "party_affiliation", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire","context"]
test_df.columns = ["id", "label", "statement", "subject", "speaker", "speaker_title", "State", "party_affiliation", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire","context"]


In [4]:
train_df_1.head(5)

Unnamed: 0,id,label,statement,subject,speaker,speaker_title,State,party_affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [5]:
test_df['statement']

0       Building a wall on the U.S.-Mexico border will...
1       Wisconsin is on pace to double the number of l...
2       Says John McCain has done nothing to help the ...
3       Suzanne Bonamici supports a plan that will cut...
4       When asked by a reporter whether hes at the ce...
5       Over the past five years the federal governmen...
6       Says that Tennessee law requires that schools ...
7       Says Vice President Joe Biden "admits that the...
8       Donald Trump is against marriage equality. He ...
9       We know that more than half of Hillary Clinton...
10      We know there are more Democrats in Georgia th...
11      PolitiFact Texas says Congressman Edwards atta...
12             Denali is the Kenyan word for black power.
13      Says 57 percent of federal spending goes to th...
14           On residency requirements for public workers
15      Says the unemployment rate for college graduat...
16      Unfortunately we have documented instances whe...
17      A rece

### Taking statements from dataset and assigning index to it

In [39]:
statement = train_df_1[['statement']]
statement['index'] = statement.index
statement_news = statement


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [40]:
### printing the length of the entire dataset of statements
print(len(statement_news))
print(statement_news[:5])

10240
                                           statement  index
0  Says the Annies List political group supports ...      0
1  When did the decline of coal start? It started...      1
2  Hillary Clinton agrees with John McCain "by vo...      2
3  Health care reform legislation is likely to ma...      3
4  The economic turnaround started at the end of ...      4


### Data Preprocessing

* Next step is to prepare the data by removing all punctuations,stopwords etc
* At first, tokenization is applied to split the text we have just got to sentences and then sentences to words.
* Next, it is converted to lowercase
* Then Lemmatization is applied to change all past and future words to present and also changes words with third person to first
* Next step os to apply Stemming whre words are changed to their root form

* For preprocessing, genism library is imported.

In [41]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/andrew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Function to carry out lemmetization and stemming

In [42]:
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

### Have taken an article to test lemmatization and stemming done

In [43]:
doc_sample = statement_news[statement_news['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Members', 'of', 'the', 'public', 'are', 'being', 'charged', '$50', 'to', 'hear', 'Gov.', 'Scott', 'Walker', 'and', 'a', 'dozen', 'members', 'of', 'his', 'administration', 'talk', 'about', 'jobs', 'and', 'the', 'economy', 'at', 'Lambeau', 'Field.']


 tokenized and lemmatized document: 
['member', 'public', 'charg', 'hear', 'scott', 'walker', 'dozen', 'member', 'administr', 'talk', 'job', 'economi', 'lambeau', 'field']


### Processing all the articles in the set

In [45]:
processed_statement = statement_news['statement'].map(preprocess)
processed_statement[:10]

0    [say, anni, list, polit, group, support, trime...
1    [declin, coal, start, start, natur, take, star...
2    [hillari, clinton, agre, john, mccain, vote, g...
3    [health, care, reform, legisl, like, mandat, f...
4                    [econom, turnaround, start, term]
5    [chicago, bear, start, quarterback, year, tota...
6               [dunnam, live, district, repres, year]
7    [person, stage, work, activ, year, pass, russ,...
8    [take, million, oregon, lotteri, fund, port, n...
9    [say, primari, oppon, glenn, grothman, leibham...
Name: statement, dtype: object

### Bags of words on the processed dataset

* Creating a dictionary of words from the processed data that we got above

In [46]:
dictionary = gensim.corpora.Dictionary(processed_statement)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abort
1 anni
2 demand
3 group
4 list
5 polit
6 say
7 support
8 trimest
9 administr
10 begin


In [47]:
## Keeping only the most frequent words
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

### Using bow2Vec

In [48]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_statement]
bow_corpus[4310]

[(7, 1),
 (85, 1),
 (127, 1),
 (128, 1),
 (181, 2),
 (252, 1),
 (275, 1),
 (322, 1),
 (540, 1),
 (790, 1),
 (1019, 1)]

In [49]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 7 ("administr") appears 1 time.
Word 85 ("economi") appears 1 time.
Word 127 ("scott") appears 1 time.
Word 128 ("walker") appears 1 time.
Word 181 ("member") appears 2 time.
Word 252 ("job") appears 1 time.
Word 275 ("public") appears 1 time.
Word 322 ("hear") appears 1 time.
Word 540 ("charg") appears 1 time.
Word 790 ("talk") appears 1 time.
Word 1019 ("field") appears 1 time.


### Applying TF-IDF to bow_corpus 

In [50]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.3397402827336795),
 (1, 0.5002880765433487),
 (2, 0.4088224168287155),
 (3, 0.4639566513984633),
 (4, 0.40750764496407926),
 (5, 0.10879086838115597),
 (6, 0.27202739591951525)]


### Applying LDA for Bag Of Words

In [52]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)


In [53]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.046*"say" + 0.029*"texa" + 0.024*"state" + 0.021*"american" + 0.015*"job" + 0.013*"million" + 0.011*"vote" + 0.010*"time" + 0.010*"school" + 0.010*"percent"
Topic: 1 
Words: 0.030*"state" + 0.028*"say" + 0.021*"job" + 0.019*"percent" + 0.017*"nation" + 0.017*"counti" + 0.016*"year" + 0.013*"rate" + 0.012*"presid" + 0.010*"creat"
Topic: 2 
Words: 0.055*"say" + 0.030*"obama" + 0.024*"presid" + 0.015*"percent" + 0.015*"barack" + 0.014*"state" + 0.014*"secur" + 0.012*"support" + 0.012*"social" + 0.009*"citi"
Topic: 3 
Words: 0.044*"year" + 0.027*"say" + 0.023*"percent" + 0.018*"illeg" + 0.016*"state" + 0.015*"immigr" + 0.013*"elect" + 0.011*"school" + 0.010*"budget" + 0.009*"vote"
Topic: 4 
Words: 0.032*"say" + 0.029*"state" + 0.022*"vote" + 0.020*"peopl" + 0.016*"percent" + 0.012*"unit" + 0.012*"spend" + 0.011*"clinton" + 0.011*"cut" + 0.011*"countri"
Topic: 5 
Words: 0.043*"say" + 0.039*"vote" + 0.018*"percent" + 0.015*"time" + 0.011*"right" + 0.010*"state" + 0.010*"co

### Applying LDA on TF-IDF

In [55]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.012*"say" + 0.011*"state" + 0.008*"dont" + 0.008*"year" + 0.007*"democrat" + 0.007*"govern" + 0.007*"care" + 0.007*"spend" + 0.007*"vote" + 0.007*"republican"
Topic: 1 Word: 0.014*"year" + 0.011*"say" + 0.011*"percent" + 0.011*"state" + 0.009*"dollar" + 0.009*"spend" + 0.008*"million" + 0.007*"school" + 0.007*"govern" + 0.006*"cost"
Topic: 2 Word: 0.014*"say" + 0.013*"job" + 0.011*"illeg" + 0.010*"creat" + 0.009*"obama" + 0.009*"year" + 0.009*"secur" + 0.009*"campaign" + 0.009*"immigr" + 0.009*"presid"
Topic: 3 Word: 0.017*"say" + 0.010*"state" + 0.009*"texa" + 0.008*"year" + 0.008*"nation" + 0.007*"countri" + 0.007*"billion" + 0.007*"number" + 0.007*"percent" + 0.006*"presid"
Topic: 4 Word: 0.013*"say" + 0.012*"percent" + 0.008*"romney" + 0.008*"countri" + 0.008*"school" + 0.008*"million" + 0.008*"health" + 0.007*"tax" + 0.007*"year" + 0.007*"mitt"
Topic: 5 Word: 0.014*"say" + 0.013*"state" + 0.010*"obama" + 0.010*"year" + 0.009*"plan" + 0.008*"immigr" + 0.007*"studen

In [57]:
processed_statement[4310]

['member',
 'public',
 'charg',
 'hear',
 'scott',
 'walker',
 'dozen',
 'member',
 'administr',
 'talk',
 'job',
 'economi',
 'lambeau',
 'field']

In [58]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))



Score: 0.9307518601417542	 
Topic: 0.030*"state" + 0.028*"say" + 0.021*"job" + 0.019*"percent" + 0.017*"nation" + 0.017*"counti" + 0.016*"year" + 0.013*"rate" + 0.012*"presid" + 0.010*"creat"


##### LDA on Bag of words gave an evaluation of 93%

In [60]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9307539463043213	 
Topic: 0.015*"say" + 0.010*"american" + 0.009*"state" + 0.009*"tri" + 0.009*"presid" + 0.009*"clinton" + 0.009*"obama" + 0.009*"unemploy" + 0.009*"hillari" + 0.009*"know"


#### LDA on Bag of words also gave an evaluation of 93%

### Testing on an unseen document

In [66]:
test_df['statement'][5]

'Over the past five years the federal government has paid out $601 million in retirement and disability benefits to deceased former federal employees.'

In [65]:
unseen_document = test_df['statement'][5]
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6009959578514099	 Topic: 0.044*"year" + 0.027*"say" + 0.023*"percent" + 0.018*"illeg" + 0.016*"state"
Score: 0.3323208689689636	 Topic: 0.036*"say" + 0.034*"state" + 0.027*"million" + 0.021*"billion" + 0.019*"year"


## Applying Factor Sensationalism

In [72]:
import itertools

In [73]:
sensationalist_phrases_vocab = [
'Assassination',
'Attack',
'Domestic security',
'Law enforcement',
'Disaster',
'National preparedness',
'Response',
'Recovery',
'Emergency response',
'First responder',
'Militia',
'Shooting',
'Evacuation',
'Hostage',
'Explosion',
'Organized crime',
'Gangs',
'National security',
'State of emergency',
'Security breach',
'Threat',
'Standoff',
'Lockdown',
'Bomb',
'Riot',
'Emergency Landing',
'Incident',
'Suspicious',
'Nuclear threat',
'Hazardous',
'Infection',
'Outbreak',
'Contamination',
'Terror',
'Epidemic',
'Critical Infrastructure',
'National infrastructure',
'Transportation security',
'Grid',
'Outage',
'Disruption',
'Violence',
'Drug cartel',
'Narcotics',
'Shootout',
'Trafficking',
'Kidnap',
'Illegal',
'Smuggling', 
'Al Qaeda',
'Terror attack',
'Weapon',
'Improvised explosive device',
'Suicide bomber',
'Suicide attack',
'Hurricane',
'Tornado',
'Tsunami',
'Earthquake',
'Tremor',
'Flood',
'Storm',
'Extreme weather',
'Forest fire',
'Ice',
'Stranded',
'Wildfire',
'Avalanche',
'Blizzard',
'Lightening',
'Emergency Broadcast System',
'Cyber Security',
'DDOS',
'Denial of service',
'Malware',
'Phishing',
'Cyber attack',
'Cyber terror',
'believe',
'support',
'ISIS',
'absolutely',
'promise',
'society',
'FBI',
'declare',
'war',
'islam',
'recession',
'price',
'stock market',
'lottery',
'terror',
'sanctoin',
'ban',
'signed',
'climate change',
'global warming',
'killed',
'shooting',
'gun fire',
'nuclear',
]

In [132]:
sensationalist_phrases_dict = {k: v for v, k in enumerate(sensationalist_phrases_vocab)}

In [83]:
sensationalist_phrases_dict

{'Assassination': 0,
 'Attack': 1,
 'Domestic security': 2,
 'Law enforcement': 3,
 'Disaster': 4,
 'National preparedness': 5,
 'Response': 6,
 'Recovery': 7,
 'Emergency response': 8,
 'First responder': 9,
 'Militia': 10,
 'Shooting': 11,
 'Evacuation': 12,
 'Hostage': 13,
 'Explosion': 14,
 'Organized crime': 15,
 'Gangs': 16,
 'National security': 17,
 'State of emergency': 18,
 'Security breach': 19,
 'Threat': 20,
 'Standoff': 21,
 'Lockdown': 22,
 'Bomb': 23,
 'Riot': 24,
 'Emergency Landing': 25,
 'Incident': 26,
 'Suspicious': 27,
 'Nuclear threat': 28,
 'Hazardous': 29,
 'Infection': 30,
 'Outbreak': 31,
 'Contamination': 32,
 'Terror': 33,
 'Epidemic': 34,
 'Critical Infrastructure': 35,
 'National infrastructure': 36,
 'Transportation security': 37,
 'Grid': 38,
 'Outage': 39,
 'Disruption': 40,
 'Violence': 41,
 'Drug cartel': 42,
 'Narcotics': 43,
 'Shootout': 44,
 'Trafficking': 45,
 'Kidnap': 46,
 'Illegal': 47,
 'Smuggling': 48,
 'Al Qaeda': 49,
 'Terror attack': 50,


In [103]:
train_df = pd.read_csv(DATA_FOLDER+TRAIN_PATH,sep='\t', header = None)

In [107]:
train_df.columns = ["id", "label", "statement", "subject", "speaker", "speaker_title", "State", "party_affiliation", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire","context"]



In [108]:
import nltk
#nltk.download()
from nltk.corpus import stopwords
stop = stopwords.words('english')

train_df_1['stopwords'] = train_df_1['statement'].apply(lambda x: len([x for x in x.split() if x in stop]))
train_df_1[['statement','stopwords']].head()


Unnamed: 0,statement,stopwords
0,Says Annies List political group supports thir...,0
1,When decline coal start? It started natural ga...,0
2,"Hillary Clinton agrees John McCain ""by voting ...",0
3,Health care reform legislation likely mandate ...,0
4,The economic turnaround started end term.,0


In [109]:
train_df['statement'] = train_df['statement'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
train_df.head(10)

Unnamed: 0,id,label,statement,subject,speaker,speaker_title,State,party_affiliation,barely_true,false,half_true,mostly_true,pants_on_fire,context
0,2635.json,false,Says Annies List political group supports thir...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When decline coal start? It started natural ga...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees John McCain ""by voting ...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation likely mandate ...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started end term.,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
5,12465.json,true,The Chicago Bears starting quarterbacks last 1...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
6,2342.json,barely-true,Jim Dunnam lived district represents years now.,candidates-biography,republican-party-texas,,Texas,republican,3.0,1.0,1.0,3.0,1.0,a press release.
7,153.json,half-true,I'm person stage worked actively last year pas...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."
8,5602.json,half-true,"However, took $19.5 million Oregon Lottery fun...",jobs,oregon-lottery,,,organization,0.0,0.0,1.0,0.0,1.0,a website
9,9741.json,mostly-true,Says GOP primary opponents Glenn Grothman Joe ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,republican,0.0,0.0,0.0,1.0,0.0,an online video


In [110]:
train_df['statement'] = train_df['statement'].apply(lambda x: x.lower())
train_df['statement'] = train_df['statement'].str.replace('[^\w\s]','')


In [111]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
train_df['statement'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
train_df['statement'].head()

0    says annies list political group supports thir...
1    when decline coal start it started natural gas...
2    hillary clinton agrees john mccain by voting g...
3    health care reform legislation likely mandate ...
4             the economic turnaround started end term
Name: statement, dtype: object

In [112]:
from textblob import Word
train_df['statement'] = train_df['statement'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train_df['statement'].head()

0    say annies list political group support thirdt...
1    when decline coal start it started natural gas...
2    hillary clinton agrees john mccain by voting g...
3    health care reform legislation likely mandate ...
4             the economic turnaround started end term
Name: statement, dtype: object

In [113]:
train_df['statement'] = train_df['statement'].replace('[^a-zA-Z0-9]', ' ', regex = True)
train_df['statement'].head(5)

0    say annies list political group support thirdt...
1    when decline coal start it started natural gas...
2    hillary clinton agrees john mccain by voting g...
3    health care reform legislation likely mandate ...
4             the economic turnaround started end term
Name: statement, dtype: object

In [114]:
train_df['statement'] = train_df['statement'].replace('\d+', '', regex = True)
train_df['statement'].head(10)

0    say annies list political group support thirdt...
1    when decline coal start it started natural gas...
2    hillary clinton agrees john mccain by voting g...
3    health care reform legislation likely mandate ...
4             the economic turnaround started end term
5    the chicago bear starting quarterback last  ye...
6        jim dunnam lived district represents year now
7    im person stage worked actively last year pass...
8    however took  million oregon lottery fund port...
9    say gop primary opponent glenn grothman joe le...
Name: statement, dtype: object

In [115]:
train_df['statement'] = train_df['statement'].replace('\s{2,}', '')
train_df['statement'].head(10)

0    say annies list political group support thirdt...
1    when decline coal start it started natural gas...
2    hillary clinton agrees john mccain by voting g...
3    health care reform legislation likely mandate ...
4             the economic turnaround started end term
5    the chicago bear starting quarterback last  ye...
6        jim dunnam lived district represents year now
7    im person stage worked actively last year pass...
8    however took  million oregon lottery fund port...
9    say gop primary opponent glenn grothman joe le...
Name: statement, dtype: object

In [116]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
y = train_df_1.label 
 
# Drop the `label` column 
train_df_1.drop("label", axis=1) 
 
# Make training and test sets 
#X_train, X_test, y_train, y_test = train_test_split(train_df_1['statement'], y, test_size=0.33, random_state=53)
X_train = train_df['statement']
X_test = test_df['statement']
y_train = train_df.label
y_test = test_df.label

In [117]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [125]:
vectorizer = TfidfVectorizer(vocabulary = sensationalist_phrases_vocab,norm='l2',ngram_range = (1,3),use_idf=True, smooth_idf=True,
                sublinear_tf=False) 

In [126]:
tfidf = vectorizer.fit_transform(X_train)

In [127]:
tfidf.shape

(10240, 102)

In [128]:
vectorizer.vocabulary_

{'Assassination': 0,
 'Attack': 1,
 'Domestic security': 2,
 'Law enforcement': 3,
 'Disaster': 4,
 'National preparedness': 5,
 'Response': 6,
 'Recovery': 7,
 'Emergency response': 8,
 'First responder': 9,
 'Militia': 10,
 'Shooting': 11,
 'Evacuation': 12,
 'Hostage': 13,
 'Explosion': 14,
 'Organized crime': 15,
 'Gangs': 16,
 'National security': 17,
 'State of emergency': 18,
 'Security breach': 19,
 'Threat': 20,
 'Standoff': 21,
 'Lockdown': 22,
 'Bomb': 23,
 'Riot': 24,
 'Emergency Landing': 25,
 'Incident': 26,
 'Suspicious': 27,
 'Nuclear threat': 28,
 'Hazardous': 29,
 'Infection': 30,
 'Outbreak': 31,
 'Contamination': 32,
 'Terror': 33,
 'Epidemic': 34,
 'Critical Infrastructure': 35,
 'National infrastructure': 36,
 'Transportation security': 37,
 'Grid': 38,
 'Outage': 39,
 'Disruption': 40,
 'Violence': 41,
 'Drug cartel': 42,
 'Narcotics': 43,
 'Shootout': 44,
 'Trafficking': 45,
 'Kidnap': 46,
 'Illegal': 47,
 'Smuggling': 48,
 'Al Qaeda': 49,
 'Terror attack': 50,


In [129]:
print(tfidf)

  (0, 79)	1.0
  (8, 91)	1.0
  (28, 99)	1.0
  (37, 101)	1.0
  (43, 95)	1.0
  (73, 86)	1.0
  (75, 88)	1.0
  (78, 79)	1.0
  (95, 79)	1.0
  (98, 86)	1.0
  (104, 92)	1.0
  (138, 79)	0.6002262913313283
  (138, 78)	0.7998302314833063
  (144, 101)	1.0
  (147, 94)	1.0
  (151, 79)	1.0
  (159, 96)	1.0
  (176, 86)	1.0
  (177, 89)	1.0
  (197, 96)	1.0
  (201, 86)	1.0
  (202, 98)	1.0
  (216, 79)	1.0
  (226, 86)	1.0
  (238, 79)	1.0
  :	:
  (9993, 79)	1.0
  (10007, 81)	1.0
  (10008, 97)	1.0
  (10014, 96)	1.0
  (10016, 79)	1.0
  (10028, 89)	1.0
  (10034, 94)	1.0
  (10037, 101)	1.0
  (10039, 94)	1.0
  (10050, 89)	1.0
  (10061, 83)	1.0
  (10065, 98)	1.0
  (10069, 95)	1.0
  (10092, 86)	1.0
  (10129, 82)	1.0
  (10135, 79)	1.0
  (10172, 86)	1.0
  (10174, 98)	1.0
  (10178, 78)	1.0
  (10181, 101)	1.0
  (10191, 79)	1.0
  (10192, 79)	1.0
  (10216, 97)	1.0
  (10217, 101)	1.0
  (10222, 86)	1.0


In [130]:
train_df_1['statement'][138]

'Research shows vast majority arriving immigrants today come believe government source prosperity, thats support.'

### Checking cosine similarity

In [131]:
from scipy import spatial
cos_doc1 = 1 - spatial.distance.cosine(tfidf[0].toarray(), tfidf[138].toarray())
cos_doc1

0.6002262913313283

In [133]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

nb_pipeline = Pipeline([('NBCV',vectorizer),('nb_clf',MultinomialNB())])

nb_pipeline.fit(X_train,y_train)
predicted_nb = nb_pipeline.predict(X_test)
print(np.mean(predicted_nb == y_test))

0.2123125493291239
