In [None]:
"""
Relevance Detection
"""

In [107]:
import pandas as pd
import nltk
import numpy as np
import preprocessing
import utils
import importlib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
from sklearn.metrics import confusion_matrix
import score
from sklearn.preprocessing import StandardScaler
import scipy
import matplotlib.pyplot as plt

In [155]:
#run this cell to reload the preprocessing module
importlib.reload(preprocessing)
importlib.reload(utils)

<module 'utils' from '/Users/dannyyang/Documents/GitHub/Insights-FakeNews/utils.py'>

In [109]:
train_stances = pd.read_csv("fn_data/train_stances.csv")
print(train_stances.shape)
train_stances.head()

(49972, 3)


Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [110]:
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [111]:
stances_tr, stances_val = preprocessing.train_test_split(train_bodies, train_stances)
stances_tr.shape, stances_val.shape

((40206, 3), (9766, 3))

In [112]:
# this one takes a while!
idf = preprocessing.build_idf(train_bodies, stances_tr)

In [114]:
#this is just a comparison between using IDF score and not using IDF score - not related to the model
#change the body id to see
body = preprocessing.get_body(5, train_bodies)
#no IDF
processed2 = preprocessing.process_body(body)
print(processed2['common_nouns'],processed2['common_verbs'])

#with IDF
processed = preprocessing.process_body(body, idf)
print(processed['common_nouns'],processed['common_verbs'])

['burger', 'year', 'friend', 'news', 'report'] ['bought', 'showed', 'started', 'wonder', 'went']
['burger', 'year', 'friend', 'charity', 'mcjordan'] ['bought', 'started', 'showed', 'dissuaded', 'sauce']


In [113]:
body = preprocessing.get_body(1369, train_bodies)
processed = preprocessing.process_body(body, idf)
print(processed['first_sentence']['tokens'])
print(processed['significant_sentence']['tokens'])
print(processed['first_sentence']['adverbs'],processed['significant_sentence']['adverbs'])
print(processed['first_sentence']['adjectives'],processed['significant_sentence']['adjectives'])
print(processed['first_sentence']['verbs'],processed['significant_sentence']['verbs'])

['unconfirmed', 'report', 'circulating', 'social', 'medium', 'islamic', 'state', 'group', 'carried', 'chemical', 'attack', 'battling', 'kurdish', 'force', 'kobani']
['kurdish', 'affair', 'analyst', 'mutlu', 'civiroglu', 'spoke', 'remaining', 'doctor', 'inside', 'kobani', 'told', 'victim', 'civilian']
[] []
['unconfirmed', 'social', 'islamic', 'chemical', 'kurdish'] ['kurdish', 'civiroglu']
['circulating', 'carried', 'battling'] ['spoke', 'remaining', 'told']


In [117]:
#this takes a while!
body_info = preprocessing.process_bodies(train_bodies, idf)

processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
done! processed 1683


In [120]:
feats_list = [
    'shared_nouns',
    'shared_verbs',
    'shared_bigrams',
    'shared_tokens',

    'shared_nouns_fst',
    'shared_verbs_fst',
    'shared_bigrams_fst',
    'shared_tokens_fst',

    'shared_nouns_sig',
    'shared_verbs_sig',
    'shared_bigrams_sig',
    'shared_tokens_sig',

    'svo_s_fst',
    'svo_v_fst',
    'svo_o_fst',
    
    'svo_s_sig',
    'svo_v_sig',
    'svo_o_sig',
    
    'cos_nouns_sig',
    'cos_bigrams_sig',
    'cos_tokens_sig',

    'cos_nouns_fst',
    'cos_bigrams_fst',
    'cos_tokens_fst',
    
#     'sentiment_pos',
#     'sentiment_neg',
#     'sentiment_neu',
#     'sentiment_compound',
#     'sentiment_pos_fst',
#     'sentiment_neg_fst',
#     'sentiment_neu_fst',
#     'sentiment_compound_fst',
#     'sentiment_pos_sig',
#     'sentiment_neg_sig',
#     'sentiment_neu_sig',
#     'sentiment_compound_sig',
]

In [136]:
import time
# this one takes a while also! ~10 mins
start = time.time()
#train data
data_feats = [preprocessing.get_feats(i, body_info) for i in stances_tr.values]
val_feats = [preprocessing.get_feats(i, body_info) for i in stances_val.values]
end = time.time()
print(int(end-start))

1541125124.573781 1541125740.718914 616.1451330184937


In [125]:
#training data
train_df = pd.DataFrame()
for i in feats_list:
    print(i)
    train_df[i] = [x[i] for x in data_feats]

#val data
val_df = pd.DataFrame()
for i in feats_list:
    val_df[i] = [x[i] for x in val_feats]

shared_nouns
shared_verbs
shared_bigrams
shared_tokens
shared_nouns_fst
shared_verbs_fst
shared_bigrams_fst
shared_tokens_fst
shared_nouns_sig
shared_verbs_sig
shared_bigrams_sig
shared_tokens_sig
svo_s_fst
svo_v_fst
svo_o_fst
svo_s_sig
svo_v_sig
svo_o_sig
cos_nouns_sig
cos_bigrams_sig
cos_tokens_sig
cos_nouns_fst
cos_bigrams_fst
cos_tokens_fst


In [126]:
train_df.head()

Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_tokens,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,shared_tokens_fst,shared_nouns_sig,shared_verbs_sig,...,svo_o_fst,svo_s_sig,svo_v_sig,svo_o_sig,cos_nouns_sig,cos_bigrams_sig,cos_tokens_sig,cos_nouns_fst,cos_bigrams_fst,cos_tokens_fst
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,1,3,0,0,0,0,1,0,...,0,0,0,0,0.333333,0.0,0.158114,0.0,0.0,0.0
4,4,0,10,12,3,0,2,8,5,0,...,0,0,0,0,0.912871,0.667424,0.783349,0.46291,0.161165,0.596285


In [127]:
train_df['label'] = [0 if x == "unrelated" else 1 for x in list(stances_tr['Stance'])]
val_df['label'] = [0 if x == "unrelated" else 1 for x in list(stances_val['Stance'])]
train_df.head()

Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_tokens,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,shared_tokens_fst,shared_nouns_sig,shared_verbs_sig,...,svo_s_sig,svo_v_sig,svo_o_sig,cos_nouns_sig,cos_bigrams_sig,cos_tokens_sig,cos_nouns_fst,cos_bigrams_fst,cos_tokens_fst,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1,0,1,3,0,0,0,0,1,0,...,0,0,0,0.333333,0.0,0.158114,0.0,0.0,0.0,1
4,4,0,10,12,3,0,2,8,5,0,...,0,0,0,0.912871,0.667424,0.783349,0.46291,0.161165,0.596285,1


In [None]:
scaler = StandardScaler()

for i in feats_list:
    train_df[i] = scaler.fit_transform(train_df[i].values.reshape(-1,1))
    val_df[i] = scaler.fit_transform(val_df[i].values.reshape(-1,1))
train_df.head()

In [137]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.ensemble import RandomForestClassifier
#adjust params as you see fit
model = RandomForestClassifier(n_estimators = 100, min_samples_split = 10, min_samples_leaf = 5, max_depth = 6)
model2 = LogisticRegression()
# Lasso(alpha=0.01)
# train_df.iloc[:,-1].values.reshape(-1)
model.fit(train_df.iloc[:,:-1], train_df.iloc[:,-1])
model2.fit(train_df.iloc[:,:-1], train_df.iloc[:,-1])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [129]:
tr_acc = model.score(train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% training accuracy'.format(tr_acc*100))
val_acc = model.score(val_df.iloc[:,:-1], val_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% validation accuracy'.format(val_acc*100))

tr_acc = model2.score(train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% training accuracy'.format(tr_acc*100))
val_acc = model2.score(val_df.iloc[:,:-1], val_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% validation accuracy'.format(val_acc*100))

96.26% training accuracy
95.70% validation accuracy
96.11% training accuracy
95.86% validation accuracy


In [130]:
# get coefficients - logistic
[(feats_list[i],model2.coef_[0][i]) for i in list(range(len(feats_list)))]

#coefficients - lasso
#model.coef_

[('shared_nouns', 0.9445529445664204),
 ('shared_verbs', 0.5627097897457684),
 ('shared_bigrams', 0.5882515891618841),
 ('shared_tokens', 0.9923697856542205),
 ('shared_nouns_fst', -0.524771664105064),
 ('shared_verbs_fst', -0.30206612724325316),
 ('shared_bigrams_fst', -0.7727822286944236),
 ('shared_tokens_fst', 0.657592170120743),
 ('shared_nouns_sig', -0.11078943400719751),
 ('shared_verbs_sig', -0.9193868905287218),
 ('shared_bigrams_sig', -1.1170127874372646),
 ('shared_tokens_sig', 0.13029587005899385),
 ('svo_s_fst', 0.33137469895071614),
 ('svo_v_fst', 0.08039467066230246),
 ('svo_o_fst', 1.2671948487726286),
 ('svo_s_sig', -0.08745631957778915),
 ('svo_v_sig', 0.05754306383376062),
 ('svo_o_sig', 0.7576735379101213),
 ('cos_nouns_sig', 0.4563105777768112),
 ('cos_bigrams_sig', -0.7098969804968825),
 ('cos_tokens_sig', 6.438465194457702),
 ('cos_nouns_fst', 0.647538934319205),
 ('cos_bigrams_fst', -3.8168400899139585),
 ('cos_tokens_fst', 8.488825484813695)]

In [156]:
# #usage example for json dump
utils.rf_json_dump(model, list(train_df.iloc[:,:-1]), "test_rf_dump.json")

In [139]:
#dump validation data to CSV
val_df.to_csv('test_val_dump.csv')

In [133]:
true_label = val_df.iloc[:,-1]
prediction = model.predict(val_df.iloc[:,:-1])
matrix = confusion_matrix(true_label,prediction)
print('confusion matrix: \n{}\n'.format(matrix))
tn1, fp1, fn1, tp1 = matrix.ravel()

confusion matrix: 
[[6863  193]
 [ 227 2483]]



In [134]:
#use FNC scorer to generate score report
label_prediction = ["discuss" if i == 1 else "unrelated" for i in prediction]
label_actual = pd.DataFrame(stances_val)['Stance']
score.report_score(label_actual, label_prediction)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |    619    |    66     |
-------------------------------------------------------------
| disagree  |     0     |     0     |    109    |    19     |
-------------------------------------------------------------
|  discuss  |     0     |     0     |   1755    |    142    |
-------------------------------------------------------------
| unrelated |     0     |     0     |    193    |   6863    |
-------------------------------------------------------------
Score: 3652.75 out of 4474.0	(81.6439427805096%)


81.6439427805096