
# Relevance Detection


In [1]:
import sys
sys.path.insert(0, './preprocessing/')
import pandas as pd
import nltk
import numpy as np
import preprocessing.main
import preprocessing.helpers
import preprocessing.utils
import preprocessing.feature_engineering
import preprocessing.word_embeddings
import utils
import importlib
from collections import Counter
from sklearn.metrics import confusion_matrix
import score
from sklearn.preprocessing import StandardScaler

  from numpy.core.umath_tests import inner1d


In [2]:
importlib.reload(preprocessing.main)
importlib.reload(preprocessing.utils)
importlib.reload(preprocessing.helpers)
importlib.reload(preprocessing.feature_engineering)
importlib.reload(preprocessing.word_embeddings)
preprocess = preprocessing.main.Preprocessing()

In [3]:
train_stances = pd.read_csv("fn_data/train_stances.csv")
print(train_stances.shape)
train_stances.head()

(49972, 3)


Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [4]:
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [6]:
stances_tr, stances_val = preprocess.train_test_split(train_bodies, train_stances)
stances_tr.shape, stances_val.shape

((40635, 3), (9337, 3))

In [6]:
stances_tr.to_csv("saved_data/stances_tr.csv")
stances_val.to_csv("saved_data/stances_val.csv")

In [7]:
# this one takes a while!
idf = preprocess.build_idf(train_bodies, stances_tr)

In [8]:
import json

with open('saved_data/idf.json', 'w') as fp:
    json.dump(idf, fp)

In [9]:
Counter(train_stances['Stance'])

Counter({'agree': 3678, 'disagree': 840, 'discuss': 8909, 'unrelated': 36545})

In [10]:
#this is just a comparison between using IDF score and not using IDF score - not related to the model
#change the body id to see
body = preprocess.get_body(5, train_bodies)
#no IDF
processed2 = preprocess.process_body(body)
print(processed2['common_nouns'],processed2['common_verbs'])

#with IDF
processed = preprocess.process_body(body, idf)
print(processed['common_nouns'],processed['common_verbs'])

['burger', 'year', 'friend', 'news', 'report'] ['say', 'bought', 'showed', 'started', 'wonder']
['burger', 'year', 'friend', 'charity', 'mcjordan'] ['bought', 'started', 'showed', 'dissuaded', 'sauce']


In [11]:
body = preprocess.get_body(1369, train_bodies)
processed = preprocess.process_body(body, idf)
print(processed['first_sentence']['tokens'])
print(processed['significant_sentence']['tokens'])
print(processed['first_sentence']['adverbs'],processed['significant_sentence']['adverbs'])
print(processed['first_sentence']['adjectives'],processed['significant_sentence']['adjectives'])
print(processed['first_sentence']['verbs'],processed['significant_sentence']['verbs'])

['unconfirmed', 'report', 'circulating', 'social', 'medium', 'islamic', 'state', 'group', 'carried', 'chemical', 'attack', 'battling', 'kurdish', 'force', 'kobani']
['symptom', 'chlorine', 'attack', 'include', 'teary', 'eye', 'burning', 'sensation', 'throat', 'sensation', 'suffocation', 'headache']
[] []
['unconfirmed', 'social', 'islamic', 'chemical', 'kurdish'] ['symptom', 'teary']
['circulating', 'carried', 'battling'] ['include', 'burning']


In [8]:
#this takes a while!
body_info = preprocess.process_bodies(train_bodies, idf)

processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
done! processed 1683


In [10]:
import json
json_body_info = {}
for k in body_info:
    body_info[k]['vocabulary'] = list(body_info[k]['vocabulary'])
    json_body_info[str(k)] = body_info[k]
with open('saved_data/body_info.json', 'w') as fp:
    json.dump(json_body_info, fp)

In [14]:
feats_list = [
    'shared_nouns',
    'shared_verbs',
    'shared_bigrams',
    'shared_tokens',

    'shared_nouns_fst',
    'shared_verbs_fst',
    'shared_bigrams_fst',
    'shared_tokens_fst',

    'shared_nouns_sig',
    'shared_verbs_sig',
    'shared_bigrams_sig',
    'shared_tokens_sig',

    'svo_s_fst',
    'svo_v_fst',
    'svo_o_fst',
    
    'svo_s_sig',
    'svo_v_sig',
    'svo_o_sig',
    
    'cos_nouns_sig',
    'cos_bigrams_sig',
    'cos_tokens_sig',

    'cos_nouns_fst',
    'cos_bigrams_fst',
    'cos_tokens_fst',
    
    'sentiment_pos',
    'sentiment_neg',
    'sentiment_neu',
    'sentiment_compound',
    
    'sentiment_pos_fst',
    'sentiment_neg_fst',
    'sentiment_neu_fst',
    'sentiment_compound_fst',
    
    'sentiment_pos_sig',
    'sentiment_neg_sig',
    'sentiment_neu_sig',
    'sentiment_compound_sig',
]

In [15]:
import time
# this one takes a while also! ~10 mins
start = time.time()
#train data
data_feats = [preprocess.get_feats(i, body_info) for i in stances_tr.values]
val_feats = [preprocess.get_feats(i, body_info) for i in stances_val.values]
end = time.time()
print(int(end-start))

641


In [16]:
with open('saved_data/train_feats.json', 'w') as fp:
    json.dump(data_feats, fp)
with open('saved_data/val_feats.json', 'w') as fp:
    json.dump(val_feats, fp)

In [18]:
#training data
train_df = pd.DataFrame()
for i in feats_list:
    print(i)
    train_df[i] = [x[i] for x in data_feats]

#val data
val_df = pd.DataFrame()
for i in feats_list:
    val_df[i] = [x[i] for x in val_feats]

shared_nouns
shared_verbs
shared_bigrams
shared_tokens
shared_nouns_fst
shared_verbs_fst
shared_bigrams_fst
shared_tokens_fst
shared_nouns_sig
shared_verbs_sig
shared_bigrams_sig
shared_tokens_sig
svo_s_fst
svo_v_fst
svo_o_fst
svo_s_sig
svo_v_sig
svo_o_sig
cos_nouns_sig
cos_bigrams_sig
cos_tokens_sig
cos_nouns_fst
cos_bigrams_fst
cos_tokens_fst
sentiment_pos
sentiment_neg
sentiment_neu
sentiment_compound
sentiment_pos_fst
sentiment_neg_fst
sentiment_neu_fst
sentiment_compound_fst
sentiment_pos_sig
sentiment_neg_sig
sentiment_neu_sig
sentiment_compound_sig


In [19]:
train_df.head()

Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_tokens,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,shared_tokens_fst,shared_nouns_sig,shared_verbs_sig,...,sentiment_neu,sentiment_compound,sentiment_pos_fst,sentiment_neg_fst,sentiment_neu_fst,sentiment_compound_fst,sentiment_pos_sig,sentiment_neg_sig,sentiment_neu_sig,sentiment_compound_sig
0,0,0,0,0,0,0,0,0,0,0,...,-0.125444,-0.703411,-0.098,0.194,-0.096,-0.8167,-0.124,0.194,-0.07,-0.8167
1,0,0,0,0,0,0,0,0,0,0,...,0.098,0.372425,0.0,-0.186,0.186,0.83,0.0,0.0,0.0,0.0
2,1,0,0,1,0,0,0,0,1,0,...,0.100029,-0.110026,0.0,-0.198,0.198,0.4939,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,-0.254083,-0.785625,0.0,0.246,-0.246,-0.232,0.0,0.343,-0.343,-0.6908
4,0,1,0,0,0,1,0,2,0,0,...,-0.029455,-0.634673,-0.069,0.102,-0.033,0.0,-0.313,0.198,0.115,-1.2855


In [20]:
train_df['label'] = [0 if x == "unrelated" else 1 for x in list(stances_tr['Stance'])]
val_df['label'] = [0 if x == "unrelated" else 1 for x in list(stances_val['Stance'])]
train_df.head()

Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_tokens,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,shared_tokens_fst,shared_nouns_sig,shared_verbs_sig,...,sentiment_compound,sentiment_pos_fst,sentiment_neg_fst,sentiment_neu_fst,sentiment_compound_fst,sentiment_pos_sig,sentiment_neg_sig,sentiment_neu_sig,sentiment_compound_sig,label
0,0,0,0,0,0,0,0,0,0,0,...,-0.703411,-0.098,0.194,-0.096,-0.8167,-0.124,0.194,-0.07,-0.8167,0
1,0,0,0,0,0,0,0,0,0,0,...,0.372425,0.0,-0.186,0.186,0.83,0.0,0.0,0.0,0.0,0
2,1,0,0,1,0,0,0,0,1,0,...,-0.110026,0.0,-0.198,0.198,0.4939,0.0,0.0,0.0,0.0,1
3,0,0,0,0,0,0,0,0,0,0,...,-0.785625,0.0,0.246,-0.246,-0.232,0.0,0.343,-0.343,-0.6908,0
4,0,1,0,0,0,1,0,2,0,0,...,-0.634673,-0.069,0.102,-0.033,0.0,-0.313,0.198,0.115,-1.2855,1


In [21]:
train_df.to_csv("saved_data/train_data.csv")
val_df.to_csv("saved_data/val_data.csv")

In [22]:
scaler = StandardScaler()

for i in feats_list:
    train_df[i] = scaler.fit_transform(train_df[i].values.reshape(-1,1))
    val_df[i] = scaler.fit_transform(val_df[i].values.reshape(-1,1))
train_df.head()



Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_tokens,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,shared_tokens_fst,shared_nouns_sig,shared_verbs_sig,...,sentiment_compound,sentiment_pos_fst,sentiment_neg_fst,sentiment_neu_fst,sentiment_compound_fst,sentiment_pos_sig,sentiment_neg_sig,sentiment_neu_sig,sentiment_compound_sig,label
0,-0.398302,-0.213156,0.0,-0.439242,-0.495243,-0.264359,-0.34473,-0.543312,-0.372422,-0.168255,...,-1.52742,-0.766033,0.771855,-0.28088,-1.510595,-0.307755,0.66907,-0.345443,-1.22438,0
1,-0.398302,-0.213156,0.0,-0.439242,-0.495243,-0.264359,-0.34473,-0.543312,-0.372422,-0.168255,...,1.298953,0.071125,-1.189612,1.042358,1.598287,0.387839,-0.226392,-0.077032,0.393939,0
2,1.180317,-0.213156,0.0,0.894055,-0.495243,-0.264359,-0.34473,-0.543312,1.119214,-0.168255,...,0.031487,0.071125,-1.251553,1.098666,0.963748,0.387839,-0.226392,-0.077032,0.393939,1
3,-0.398302,-0.213156,0.0,-0.439242,-0.495243,-0.264359,-0.34473,-0.543312,-0.372422,-0.168255,...,-1.743408,0.071125,1.040266,-0.98473,-0.406712,0.387839,1.356822,-1.392247,-0.974905,0
4,-0.398302,4.092797,0.0,-0.439242,-0.495243,2.884222,-0.34473,0.599421,-0.372422,-0.168255,...,-1.346835,-0.518303,0.296973,0.014737,0.031291,-1.367974,0.687534,0.36393,-2.153323,1


In [23]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#adjust params as you see fit
model = DecisionTreeClassifier(min_samples_split = 10, min_samples_leaf = 5, max_depth = 15)
model2 = RandomForestClassifier(n_estimators = 100, min_samples_split = 10, min_samples_leaf = 5, max_depth = 6)
# model2 = LogisticRegression()
model.fit(train_df.iloc[:,:-1], train_df.iloc[:,-1])
model2.fit(train_df.iloc[:,:-1], train_df.iloc[:,-1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
tr_acc = model.score(train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% training accuracy'.format(tr_acc*100))
val_acc = model.score(val_df.iloc[:,:-1], val_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% validation accuracy'.format(val_acc*100))

tr_acc = model2.score(train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% training accuracy'.format(tr_acc*100))
val_acc = model2.score(val_df.iloc[:,:-1], val_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% validation accuracy'.format(val_acc*100))

97.26% training accuracy
94.22% validation accuracy
95.35% training accuracy
95.44% validation accuracy


In [25]:
# get coefficients - logistic
# [(feats_list[i],model2.coef_[0][i]) for i in list(range(len(feats_list)))]

#coefficients - lasso
#model.coef_

In [26]:
# #usage example for json dump
utils.rf_json_dump(model2, list(train_df.iloc[:,:-1]), "saved_models/random_forest_dump.json")

In [27]:
#dump validation data to CSV
val_df.to_csv('saved_data/test_val_dump.csv')

In [28]:
def score_model(predictions):
    true_label = [(1 if x[-1] == "discuss" else 0) for x in stances_val.values]
    matrix = confusion_matrix(true_label,predictions)
    print('confusion matrix: \n{}\n'.format(matrix))
    #use FNC scorer to generate score report
    label_prediction = [("discuss" if x == 1 else "unrelated") for x in predictions]
    label_actual = pd.DataFrame(stances_val)['Stance']
    score.report_score(label_actual, label_prediction)

In [29]:
true_label = val_df.iloc[:,-1]
prediction = model.predict(val_df.iloc[:,:-1])
score_model(prediction)

confusion matrix: 
[[7437 1008]
 [ 219 1492]]

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |    604    |    73     |
-------------------------------------------------------------
| disagree  |     0     |     0     |    132    |    23     |
-------------------------------------------------------------
|  discuss  |     0     |     0     |   1492    |    219    |
-------------------------------------------------------------
| unrelated |     0     |     0     |    272    |   7341    |
-------------------------------------------------------------
Score: 3511.25 out of 4446.25	(78.97104301377566%)


In [30]:
from joblib import dump, load
dump(model, 'saved_models/rf_trained.joblib')

['saved_models/rf_trained.joblib']