In [2]:
"""
Relevance Detection
"""

'\nRelevance Detection\n'

In [65]:
import pandas as pd
import nltk
import numpy as np
import preprocessing
import utils
import importlib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
from sklearn.metrics import confusion_matrix
import score
from sklearn.preprocessing import StandardScaler

In [73]:
#run this cell to reload the preprocessing module
importlib.reload(preprocessing)
importlib.reload(utils)

<module 'utils' from '/Users/dannyyang/Documents/GitHub/Insights-FakeNews/utils.py'>

In [5]:
train_stances = pd.read_csv("fn_data/train_stances.csv")
print(train_stances.shape)
train_stances.head()

(49972, 3)


Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [6]:
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [7]:
stances_tr, stances_val = preprocessing.train_test_split(train_bodies, train_stances)
stances_tr.shape, stances_val.shape

((39092, 3), (10880, 3))

In [8]:
# this one takes a while!
idf = preprocessing.build_idf(train_bodies, stances_tr)

In [9]:
#this is just a comparison between using IDF score and not using IDF score - not related to the model
#change the body id to see
body = preprocessing.get_body(5, train_bodies)
#no IDF
processed2 = preprocessing.process_body(body)
print(processed2['common_nouns'],processed2['common_verbs'])

#with IDF
processed = preprocessing.process_body(body, idf)
print(processed['common_nouns'],processed['common_verbs'])

['burger', 'year', 'friend', 'news', 'report'] ['bought', 'showed', 'started', 'wonder', 'went']
['burger', 'year', 'friend', 'charity', 'mcjordan'] ['bought', 'started', 'showed', 'dissuaded', 'wrapping']


In [10]:
body = preprocessing.get_body(1369, train_bodies)
processed = preprocessing.process_body(body, idf)
print(processed['first_sentence']['tokens'])
print(processed['significant_sentence']['tokens'])
print(processed['first_sentence']['adverbs'],processed['significant_sentence']['adverbs'])
print(processed['first_sentence']['adjectives'],processed['significant_sentence']['adjectives'])
print(processed['first_sentence']['verbs'],processed['significant_sentence']['verbs'])

['unconfirmed', 'report', 'circulating', 'social', 'medium', 'islamic', 'state', 'group', 'carried', 'chemical', 'attack', 'battling', 'kurdish', 'force', 'kobani']
['kurdish', 'affair', 'analyst', 'mutlu', 'civiroglu', 'spoke', 'remaining', 'doctor', 'inside', 'kobani', 'told', 'victim', 'civilian']
[] []
['unconfirmed', 'social', 'islamic', 'chemical', 'kurdish'] ['kurdish', 'civiroglu']
['circulating', 'carried', 'battling'] ['spoke', 'remaining', 'told']


In [11]:
#this takes a while!
body_info = preprocessing.process_bodies(train_bodies, idf)

processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
done! processed 1683


In [12]:
feats_list = [
    'shared_nouns',
    'shared_verbs',
    'shared_bigrams',
    'shared_trigrams',
    'shared_tokens',
#     'sentiment_pos',
#     'sentiment_neg',
#     'sentiment_neu',
#     'sentiment_compound',

    'shared_nouns_fst',
    'shared_verbs_fst',
    'shared_bigrams_fst',
    'shared_trigrams_fst',
    'shared_tokens_fst',
#     'sentiment_pos_fst',
#     'sentiment_neg_fst',
#     'sentiment_neu_fst',
#     'sentiment_compound_fst',

    'shared_nouns_sig',
    'shared_verbs_sig',
    'shared_bigrams_sig',
    'shared_trigrams_sig',
    'shared_tokens_sig',
#     'sentiment_pos_sig',
#     'sentiment_neg_sig',
#     'sentiment_neu_sig',
#     'sentiment_compound_sig',
    
    'cos_nouns_sig',
    'cos_bigrams_sig',
    'cos_trigrams_sig',
    'cos_tokens_sig',

    'cos_nouns_fst',
    'cos_bigrams_fst',
    'cos_trigrams_fst',
    'cos_tokens_fst',
]

In [13]:
# this one takes a while also!

#train data
data_feats = [preprocessing.get_feats(i, body_info) for i in stances_tr.values]
train_df = pd.DataFrame()
for i in feats_list:
    train_df[i] = [x[i] for x in data_feats]

#val data
val_feats = [preprocessing.get_feats(i, body_info) for i in stances_val.values]
val_df = pd.DataFrame()
for i in feats_list:
    val_df[i] = [x[i] for x in val_feats]

In [14]:
train_df.head()

Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_trigrams,shared_tokens,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,shared_trigrams_fst,shared_tokens_fst,...,shared_trigrams_sig,shared_tokens_sig,cos_nouns_sig,cos_bigrams_sig,cos_trigrams_sig,cos_tokens_sig,cos_nouns_fst,cos_bigrams_fst,cos_trigrams_fst,cos_tokens_fst
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1,0,0,0,3,0,0,0,0,0,...,0,1,0.666667,1.0,1.0,0.841886,1.0,1.0,1.0,1.0
3,4,0,0,0,12,3,0,2,1,8,...,6,9,0.087129,0.332576,0.367544,0.216651,0.53709,0.838835,0.912294,0.403715
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
train_df['label'] = [-1 if x == "unrelated" else 1 for x in list(stances_tr['Stance'])]
val_df['label'] = [-1 if x == "unrelated" else 1 for x in list(stances_val['Stance'])]
Counter(train_df['label'])

Counter({-1: 28461, 1: 10631})

In [16]:
scaler = StandardScaler()

for i in feats_list:
    train_df[i] = scaler.fit_transform(train_df[i].values.reshape(-1,1))
    val_df[i] = scaler.fit_transform(val_df[i].values.reshape(-1,1))
train_df.head()



Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_trigrams,shared_tokens,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,shared_trigrams_fst,shared_tokens_fst,...,shared_tokens_sig,cos_nouns_sig,cos_bigrams_sig,cos_trigrams_sig,cos_tokens_sig,cos_nouns_fst,cos_bigrams_fst,cos_trigrams_fst,cos_tokens_fst,label
0,-0.40347,-0.213539,0.0,0.0,-0.73469,-0.497478,-0.263532,-0.350397,-0.1648,-0.545021,...,-0.428456,0.33864,0.223402,0.082722,0.441556,0.462431,0.236175,0.169702,0.559241,-1
1,-0.40347,-0.213539,0.0,0.0,-0.73469,-0.497478,-0.263532,-0.350397,-0.1648,-0.545021,...,-0.428456,0.33864,0.223402,0.082722,0.441556,0.462431,0.236175,0.169702,0.559241,-1
2,1.169056,-0.213539,0.0,0.0,0.457675,-0.497478,-0.263532,-0.350397,-0.1648,-0.545021,...,0.560515,-1.826376,0.223402,0.082722,-1.000884,0.462431,0.236175,0.169702,0.559241,1
3,5.886634,-0.213539,0.0,0.0,4.034773,2.482053,-0.263532,2.256457,2.374555,4.119287,...,8.472286,-5.590501,-12.468218,-4.790421,-6.70478,-2.028037,-0.922746,-0.131111,-3.440219,1
4,-0.40347,-0.213539,0.0,0.0,-0.73469,-0.497478,-0.263532,-0.350397,-0.1648,-0.545021,...,-0.428456,0.33864,0.223402,0.082722,0.441556,0.462431,0.236175,0.169702,0.559241,-1


In [57]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 100, min_samples_split = 6, min_samples_leaf = 3, max_depth = 10)
# Lasso(alpha=0.01)
# train_df.iloc[:,-1].values.reshape(-1)
model.fit(train_df.iloc[:,:-1], train_df.iloc[:,-1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=6,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [58]:
tr_acc = model.score(train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% training accuracy'.format(tr_acc*100))
val_acc = model.score(val_df.iloc[:,:-1], val_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% validation accuracy'.format(val_acc*100))

96.82% training accuracy
96.38% validation accuracy


In [29]:
#get coefficients - logistic
#[(feats_list[i],model.coef_[0][i]) for i in list(range(len(feats_list)))]

#coefficients - lasso
#model.coef_

In [74]:
#usage example for json dump
utils.rf_json_dump(model, list(train_df), "test_rf_dump.json")

In [62]:
true_label = val_df.iloc[:,-1]
prediction = model.predict(val_df.iloc[:,:-1])
matrix = confusion_matrix(true_label,prediction)
print('confusion matrix: \n{}\n'.format(matrix))
tn1, fp1, fn1, tp1 = matrix.ravel()

confusion matrix: 
[[7891  193]
 [ 201 2595]]



In [63]:
#use FNC scorer to generate score report
label_prediction = ["discuss" if i == 1 else "unrelated" for i in prediction]
label_actual = pd.DataFrame(stances_val)['Stance']
score.report_score(label_actual, label_prediction)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |    730    |    50     |
-------------------------------------------------------------
| disagree  |     0     |     0     |    163    |    19     |
-------------------------------------------------------------
|  discuss  |     0     |     0     |   1702    |    132    |
-------------------------------------------------------------
| unrelated |     0     |     0     |    193    |   7891    |
-------------------------------------------------------------
Score: 3898.0 out of 4817.0	(80.92173552003321%)


80.92173552003321