In [1]:
"""
Relevance Detection
"""

'\nRelevance Detection\n'

In [2]:
import pandas as pd
import nltk
import numpy as np
import preprocessing
import importlib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

In [182]:
#run this cell to reload the preprocessing module
importlib.reload(preprocessing)

<module 'preprocessing' from '/Users/dannyyang/Documents/GitHub/Insights-FakeNews/preprocessing.py'>

In [4]:
train_stances = pd.read_csv("fn_data/train_stances.csv")
train_stances["Stance"] = train_stances["Stance"].apply(lambda x: "related" if x != "unrelated" else x)
print(train_stances.shape)
train_stances.head()

(49972, 3)


Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,related
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,related


In [5]:
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [29]:
idx = np.random.permutation(np.arange(len(train_stances))) 
stances = train_stances.values[idx]
train = int(len(stances)*0.8)
stances_tr = stances[:train]
stances_val = stances[train:]

In [37]:
def get_body_list(data):
    return [preprocessing.get_body(i, train_bodies) for i in set([x[1] for x in data])]

In [43]:
body_list = get_body_list(stances_tr)

In [44]:
corpus = [preprocessing.get_clean_tokens(x) for x in body_list]

In [95]:
idf = preprocessing.build_idf_tokens(corpus)

In [109]:
body = preprocessing.get_body(5, train_bodies)

#no IDF
processed2 = preprocessing.process_body(body)
print(processed2['common_nouns'],processed2['common_verbs'])

#with IDF
processed = preprocessing.process_body(body, idf)
print(processed['common_nouns'],processed['common_verbs'])

['burger', 'year', 'friend', 'australians', 'mcdonald', 's', 'news', 'report', 'charity', 'quarter'] ['say', 'bought', 'showed', 'started', 's', 'wonder', 'went', 'pretty', 'holding', 'add']
['burger', 'australians', 'mcdonald', 'charity', 'mickey', 'friend', 'depression', 'specimen', 'nitz', 'anxiety'] ['bought', 'dissuaded', 'sauce', 'wrapping', 'likes', 'showed', 'preserved', 'started', 'blue', 'wonder']


In [183]:
body_info = preprocessing.process_bodies(train_bodies, idf)

processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
done! processed 1683


In [169]:
def get_feats(data, body_dict):
    headline, body_id = data[0],int(data[1])
    headline_data = preprocessing.process_sentence(headline)
    shared_common_nouns = len(set(headline_data['nouns']).intersection(set(body_dict[body_id]['common_nouns'])))
    shared_common_verbs = len(set(headline_data['verbs']).intersection(set(body_dict[body_id]['common_verbs'])))
    shared_bigrams = len(set(headline_data['bigrams']).intersection(set(body_dict[body_id]['common_bigrams'])))
    sentiment_diff = {
        "pos": headline_data['sentiment']['pos']-body_dict[body_id]['sentiment']['pos'],
        "neg": headline_data['sentiment']['neg']-body_dict[body_id]['sentiment']['neg'],
        "neu": headline_data['sentiment']['neu']-body_dict[body_id]['sentiment']['neu'],
        "compound": headline_data['sentiment']['compound']-body_dict[body_id]['sentiment']['compound']
    }
    sentiment_diff_first = {
        "pos": headline_data['sentiment']['pos']-body_dict[body_id]['first_sentence']['sentiment']['pos'],
        "neg": headline_data['sentiment']['neg']-body_dict[body_id]['first_sentence']['sentiment']['neg'],
        "neu": headline_data['sentiment']['neu']-body_dict[body_id]['first_sentence']['sentiment']['neu'],
        "compound": headline_data['sentiment']['compound']-body_dict[body_id]['first_sentence']['sentiment']['compound']
    }
    shared_nouns_first = len(set(headline_data['nouns']).intersection(set(body_dict[body_id]['first_sentence']['nouns'])))
    shared_verbs_first = len(set(headline_data['verbs']).intersection(set(body_dict[body_id]['first_sentence']['verbs'])))
    shared_bigrams_first = len(set(headline_data['bigrams']).intersection(set(body_dict[body_id]['first_sentence']['bigrams'])))
    return {
        'shared_nouns': shared_common_nouns,
        'shared_verbs': shared_common_verbs,
        'shared_bigrams': shared_bigrams,
        'sentiment_pos': sentiment_diff['pos'],
        'sentiment_neg': sentiment_diff['neg'],
        'sentiment_neu': sentiment_diff['neu'],
        'sentiment_compound':sentiment_diff_first['compound'],
        'sentiment_pos_fst': sentiment_diff_first['pos'],
        'sentiment_neg_fst': sentiment_diff_first['neg'],
        'sentiment_neu_fst': sentiment_diff_first['neu'],
        'sentiment_compound_fst':sentiment_diff_first['compound'],
        'shared_nouns_fst':shared_nouns_first,
        'shared_verbs_fst':shared_verbs_first,
        'shared_bigrams_fst':shared_bigrams_first   
    }

In [184]:
feats_list = ['shared_nouns',
        'shared_verbs',
        'shared_bigrams',
        'sentiment_pos',
        'sentiment_neg',
        'sentiment_neu',
        'sentiment_compound',
        'sentiment_pos_fst',
        'sentiment_neg_fst',
        'sentiment_neu_fst',
        'sentiment_compound_fst',
        'shared_nouns_fst',
        'shared_verbs_fst',
        'shared_bigrams_fst']
#train data
data_feats = [get_feats(i, body_info) for i in stances_tr]
train_df = pd.DataFrame()
train_df['label'] = [1 if x[2] == "unrelated" else -1 for x in stances_tr]
for i in feats_list:
    train_df[i] = [x[i] for x in data_feats]

#val data
val_feats = [get_feats(i, body_info) for i in stances_val]
val_df = pd.DataFrame()
val_df['label'] = [1 if x[2] == "unrelated" else -1 for x in stances_val]
for i in feats_list:
    val_df[i] = [x[i] for x in val_feats]

In [185]:
Counter(train_df['label'])

Counter({-1: 10700, 1: 29277})

In [186]:
pd.DataFrame(stances_tr).head(10)

Unnamed: 0,0,1,2
0,Woman Stays In KFC For A Week To Get Over Her ...,971,unrelated
1,Experts: More facts needed on purported audio ...,195,unrelated
2,Angry mob chops off man's genitals with butche...,17,unrelated
3,Former U.S. soldier says IS used chemical weap...,1369,related
4,These LEGO Instructions from 1974 Are Awesome ...,1169,unrelated
5,Seth Rogen set to play Steve Wozniak in Danny ...,1586,unrelated
6,$YUM Serving Up #Marijuana & Fried Chicken,1826,unrelated
7,The gold Apple Watch Edition could set you bac...,1700,related
8,Airport worker strips naked at security scanne...,2248,unrelated
9,Canadian official identifies dead Ottawa gunma...,2002,unrelated


In [187]:
stances_tr[3][0]
print(' '.join(body_info[1369]['tokens']))
print("\n\n")
preprocessing.get_body(1369, train_bodies)

unconfirmed report circulating social medium islamic state group carried chemical attack battling kurdish force kobani journalist reporting ground tuesday night kurdish official said aware report doctor lack necessary equipment diagnose cause kurdish victim complaint patient said difficulty breathing burn skin teary eye swollen lip syria iraq thought posse chemical weapon u s coalition force feared weapon fall isis hand silent missile missile placed neighborhood chair democratic union party pyd asya abdullah told kurdish question people lost consciousness struggling breathe investigating situation necessary technical equipment expertise journalists reporting border kobani turkey reportedly spoke doctor scene confirmed pyd s description victim al aan tv reporter jenan moussa posted twitter doctor said victim way clinic speak following symptom teary eye suffocation skin burn victims swollen lip moussa added kurdish affair analyst mutlu civiroglu spoke remaining doctor inside kobani told 

'Unconfirmed reports are circulating on social media that the Islamic State group carried out a chemical attack while battling Kurdish forces in Kobani. Several journalists reporting from the ground on Tuesday night -- and at least one Kurdish official -- said they were aware of such reports, but that doctors lack necessary equipment to diagnose the cause of Kurdish victims\' complaints. The patients said they had difficulty breathing, and many had burns on their skin, teary eyes and swollen lips. Syria (as well as Iraq) is thought to possess chemical weapons, and the U.S. and coalition forces have feared that such weapons might fall into ISIS hands.\n\n"It could have been a silent missile or a missile placed in the neighborhood beforehand,” co-chair of the Democratic Union Party (PYD) Asya Abdullah told the Kurdish Question. “Many people have lost consciousness and are struggling to breathe and see. We are investigating the situation but do not have the necessary technical equipment o

In [212]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
model = LogisticRegression()
model.fit(train_df.iloc[:,1:], train_df.iloc[:,0].values.reshape(-1))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [213]:
tr_acc = model.score(train_df.iloc[:,1:], train_df.iloc[:,0].values.reshape(-1))
print('{0:.2f}% training accuracy'.format(tr_acc*100))

93.44% training accuracy


In [214]:
val_acc = model.score(val_df.iloc[:,1:], val_df.iloc[:,0].values.reshape(-1))
print('{0:.2f}% validation accuracy'.format(val_acc*100))

93.25% validation accuracy


In [215]:
[(feats_list[i],model.coef_[0][i]) for i in list(range(len(feats_list)))]

[('shared_nouns', -2.8139141614841305),
 ('shared_verbs', -2.5805092593519139),
 ('shared_bigrams', 0.0),
 ('sentiment_pos', -1.0617313526481831),
 ('sentiment_neg', -0.15746176680818072),
 ('sentiment_neu', -0.17507668645143698),
 ('sentiment_compound', -0.17246418205308636),
 ('sentiment_pos_fst', 1.0884129464244918),
 ('sentiment_neg_fst', -0.82848845357876655),
 ('sentiment_neu_fst', -0.27055575798928239),
 ('sentiment_compound_fst', -0.17246418205308636),
 ('shared_nouns_fst', -1.0136099527997449),
 ('shared_verbs_fst', -1.1997171149430081),
 ('shared_bigrams_fst', -2.2068885647881622)]

In [216]:
from sklearn.metrics import confusion_matrix
true_label = val_df.iloc[:,0]
prediction = model.predict(val_df.iloc[:,1:])
matrix = confusion_matrix(true_label,prediction)
print('confusion matrix: \n{}\n'.format(matrix))
tn1, fp1, fn1, tp1 = matrix.ravel()

confusion matrix: 
[[2230  497]
 [ 178 7090]]



In [217]:
val_df.shape

(9995, 15)