In [1]:
"""
Relevance Detection
"""

'\nRelevance Detection\n'

In [2]:
import pandas as pd
import nltk
import numpy as np
import preprocessing
import importlib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

In [182]:
#run this cell to reload the preprocessing module
importlib.reload(preprocessing)

<module 'preprocessing' from '/Users/dannyyang/Documents/GitHub/Insights-FakeNews/preprocessing.py'>

In [219]:
train_stances = pd.read_csv("fn_data/train_stances.csv")
print(train_stances.shape)
train_stances.head()

(49972, 3)


Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [5]:
train_bodies = pd.read_csv("fn_data/train_bodies.csv")
print(train_bodies.shape)
train_bodies.head()

(1683, 2)


Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [220]:
idx = np.random.permutation(np.arange(len(train_stances))) 
stances = train_stances.values[idx]
train = int(len(stances)*0.8)
stances_tr = stances[:train]
stances_val = stances[train:]

In [264]:
idx = np.random.permutation(np.arange(len(train_bodies))) 
bodies = train_bodies.values[idx]
train = int(len(bodies)*0.8)
bodies_tr = set([i[0] for i in bodies[:train]])
bodies_val = set([i[0] for i in bodies[train:]])

In [283]:
stances_tr = train_stances.loc[train_stances["Body ID"].isin(bodies_tr), :]
stances_val = train_stances.loc[train_stances["Body ID"].isin(bodies_val), :]

In [284]:
stances_tr.shape, stances_val.shape

((40854, 3), (9118, 3))

In [288]:
#this one takes a while
corpus = [preprocessing.get_clean_tokens(preprocessing.get_body(x, train_bodies)) for x in list(stances_tr['Body ID'])]

In [289]:
idf = preprocessing.build_idf_tokens(corpus)

In [290]:
body = preprocessing.get_body(5, train_bodies)

#no IDF
processed2 = preprocessing.process_body(body)
print(processed2['common_nouns'],processed2['common_verbs'])

#with IDF
processed = preprocessing.process_body(body, idf)
print(processed['common_nouns'],processed['common_verbs'])

['burger', 'year', 'friend', 'australians', 'mcdonald', 's', 'news', 'report', 'charity', 'quarter'] ['say', 'bought', 'showed', 'started', 's', 'wonder', 'went', 'pretty', 'holding', 'add']
['burger', 'australians', 'mcdonald', 'charity', 'mickey', 'friend', 'depression', 'nitz', 'pounder', 'eduard'] ['bought', 'dissuaded', 'sauce', 'likes', 'wrapping', 'preserved', 'showed', 'started', 'blue', 'selling']


In [292]:
body_info = preprocessing.process_bodies(train_bodies, idf)

processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
done! processed 1683


In [311]:
def get_feats(data, body_dict):
    headline, body_id = data[0],int(data[1])
    headline_data = preprocessing.process_sentence(headline)
    shared_common_nouns = len(set(headline_data['nouns']).intersection(set(body_dict[body_id]['common_nouns'])))
    shared_common_verbs = len(set(headline_data['verbs']).intersection(set(body_dict[body_id]['common_verbs'])))
    shared_bigrams = len(set(headline_data['bigrams']).intersection(set(body_dict[body_id]['common_bigrams'])))
    sentiment_diff = {
        "pos": headline_data['sentiment']['pos']-body_dict[body_id]['sentiment']['pos'],
        "neg": headline_data['sentiment']['neg']-body_dict[body_id]['sentiment']['neg'],
        "neu": headline_data['sentiment']['neu']-body_dict[body_id]['sentiment']['neu'],
        "compound": headline_data['sentiment']['compound']-body_dict[body_id]['sentiment']['compound']
    }
    sentiment_diff_first = {
        "pos": headline_data['sentiment']['pos']-body_dict[body_id]['first_sentence']['sentiment']['pos'],
        "neg": headline_data['sentiment']['neg']-body_dict[body_id]['first_sentence']['sentiment']['neg'],
        "neu": headline_data['sentiment']['neu']-body_dict[body_id]['first_sentence']['sentiment']['neu'],
        "compound": headline_data['sentiment']['compound']-body_dict[body_id]['first_sentence']['sentiment']['compound']
    }
    shared_nouns_first = len(set(headline_data['nouns']).intersection(set(body_dict[body_id]['first_sentence']['nouns'])))
    shared_verbs_first = len(set(headline_data['verbs']).intersection(set(body_dict[body_id]['first_sentence']['verbs'])))
    shared_bigrams_first = len(set(headline_data['bigrams']).intersection(set(body_dict[body_id]['first_sentence']['bigrams'])))
    return {
        'shared_nouns': shared_common_nouns,
        'shared_verbs': shared_common_verbs,
        'shared_bigrams': shared_bigrams,
        'sentiment_pos': sentiment_diff['pos'],
        'sentiment_neg': sentiment_diff['neg'],
        'sentiment_neu': sentiment_diff['neu'],
        'sentiment_compound':sentiment_diff_first['compound'],
        'sentiment_pos_fst': sentiment_diff_first['pos'],
        'sentiment_neg_fst': sentiment_diff_first['neg'],
        'sentiment_neu_fst': sentiment_diff_first['neu'],
        'sentiment_compound_fst':sentiment_diff_first['compound'],
        'shared_nouns_fst':shared_nouns_first,
        'shared_verbs_fst':shared_verbs_first,
        'shared_bigrams_fst':shared_bigrams_first   
    }

In [325]:
# this one takes a while also
feats_list = ['shared_nouns',
        'shared_verbs',
        'shared_bigrams',
#         'sentiment_pos',
#         'sentiment_neg',
#         'sentiment_neu',
#         'sentiment_compound',
#         'sentiment_pos_fst',
#         'sentiment_neg_fst',
#         'sentiment_neu_fst',
#         'sentiment_compound_fst',
        'shared_nouns_fst',
        'shared_verbs_fst',
        'shared_bigrams_fst']
#train data
data_feats = [get_feats(i, body_info) for i in stances_tr.values]
train_df = pd.DataFrame()
for i in feats_list:
    train_df[i] = [x[i] for x in data_feats]

#val data
val_feats = [get_feats(i, body_info) for i in stances_val.values]
val_df = pd.DataFrame()
for i in feats_list:
    val_df[i] = [x[i] for x in val_feats]

In [347]:
train_df.head()

Unnamed: 0,shared_nouns,shared_verbs,shared_bigrams,shared_nouns_fst,shared_verbs_fst,shared_bigrams_fst,label
0,-0.472473,-0.288717,0.0,-0.495007,-0.266476,-0.34531,unrelated
1,2.852854,-0.288717,0.0,2.515265,-0.266476,0.985943,related
2,-0.472473,-0.288717,0.0,-0.495007,-0.266476,-0.34531,unrelated
3,-0.472473,-0.288717,0.0,-0.495007,-0.266476,-0.34531,unrelated
4,0.635969,-0.288717,0.0,-0.495007,-0.266476,-0.34531,related


In [344]:
train_df['label'] = [-1 if x == "unrelated" else 1 for x in list(stances_tr['Stance'])]
val_df['label'] = [-1 if x == "unrelated" else 1 for x in list(stances_val['Stance'])]
Counter(train_df['label'])

Counter({'related': 10946, 'unrelated': 29908})

In [345]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

for i in feats_list:
    train_df[i] = scaler.fit_transform(train_df[i].values.reshape(-1,1))
    val_df[i] = scaler.fit_transform(val_df[i].values.reshape(-1,1))

In [349]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
model = LogisticRegression()
model.fit(train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [350]:
tr_acc = model.score(train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% training accuracy'.format(tr_acc*100))

93.33% training accuracy


In [351]:
val_acc = model.score(val_df.iloc[:,:-1], val_df.iloc[:,-1].values.reshape(-1))
print('{0:.2f}% validation accuracy'.format(val_acc*100))

92.77% validation accuracy


In [352]:
[(feats_list[i],model.coef_[0][i]) for i in list(range(len(feats_list)))]

[('shared_nouns', -2.5250044479641383),
 ('shared_verbs', -0.86364673328689112),
 ('shared_bigrams', 0.0),
 ('shared_nouns_fst', -1.0776500591997549),
 ('shared_verbs_fst', -0.39168422241426637),
 ('shared_bigrams_fst', -1.4871903709783718)]

In [353]:
from sklearn.metrics import confusion_matrix
true_label = val_df.iloc[:,-1]
prediction = model.predict(val_df.iloc[:,:-1])
matrix = confusion_matrix(true_label,prediction)
print('confusion matrix: \n{}\n'.format(matrix))
tn1, fp1, fn1, tp1 = matrix.ravel()

confusion matrix: 
[[1944  537]
 [ 122 6515]]



In [354]:
val_df.shape

(9118, 7)

In [355]:
label_prediction = ["discuss" if i == -1 else "unrelated" for i in prediction] 

In [360]:
label_actual = pd.DataFrame(stances_val)['Stance']


In [361]:
import score
score.report_score(label_actual, label_prediction)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |     0     |    772    |
-------------------------------------------------------------
| disagree  |     0     |     0     |     0     |    147    |
-------------------------------------------------------------
|  discuss  |     0     |     0     |     0     |   1562    |
-------------------------------------------------------------
| unrelated |     0     |     0     |     0     |   6637    |
-------------------------------------------------------------
Score: 1659.25 out of 4140.25	(40.076082362176194%)


40.076082362176194