In [None]:
"""
Relevance Detection
"""

In [30]:
import pandas as pd
import nltk
import numpy as np
import preprocessing
import utils
import importlib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
from sklearn.metrics import confusion_matrix
import score
from sklearn.preprocessing import StandardScaler
import scipy

In [37]:
# Run this cell to reload the local `preprocessing` and `utils` modules so that
# edits to them are picked up without restarting the kernel.
importlib.reload(preprocessing)
importlib.reload(utils)

<module 'utils' from '/Users/dannyyang/Documents/GitHub/Insights-FakeNews/utils.py'>

In [None]:
# Read the stance table from disk, report its dimensions and preview the first rows.
stances_csv_path = "fn_data/train_stances.csv"
train_stances = pd.read_csv(stances_csv_path)
print(train_stances.shape)
train_stances.head()

In [None]:
# Read the article-body table from disk, report its dimensions and preview the first rows.
bodies_csv_path = "fn_data/train_bodies.csv"
train_bodies = pd.read_csv(bodies_csv_path)
print(train_bodies.shape)
train_bodies.head()

In [None]:
# Split the stance rows into a training and a validation subset with the
# project's own splitter (preprocessing.train_test_split, not scikit-learn's),
# then display the shape of each subset.
stances_tr, stances_val = preprocessing.train_test_split(train_bodies, train_stances)
stances_tr.shape, stances_val.shape

In [None]:
# this one takes a while!
# Build the IDF data from the article bodies and the *training* stances only;
# the validation stances (stances_val) are not passed in.
idf = preprocessing.build_idf(train_bodies, stances_tr)

In [None]:
# Exploratory check only (nothing here feeds the model): process one article
# body with and without IDF scores and compare the resulting common nouns/verbs.
# Change the id passed to get_body to look at a different article.
body = preprocessing.get_body(5, train_bodies)
summary_keys = ('common_nouns', 'common_verbs')

# without IDF
processed2 = preprocessing.process_body(body)
print(*[processed2[key] for key in summary_keys])

# with IDF
processed = preprocessing.process_body(body, idf)
print(*[processed[key] for key in summary_keys])

In [None]:
# Inspect what process_body extracts for body 1369: the tokens of the first and
# of the "significant" sentence, followed by their adverbs, adjectives and verbs
# printed side by side (first sentence, then significant sentence).
body = preprocessing.get_body(1369, train_bodies)
processed = preprocessing.process_body(body, idf)

first_sentence = processed['first_sentence']
significant_sentence = processed['significant_sentence']

print(first_sentence['tokens'])
print(significant_sentence['tokens'])
for pos_key in ('adverbs', 'adjectives', 'verbs'):
    print(first_sentence[pos_key], significant_sentence[pos_key])

In [None]:
#this takes a while!
# Process every article body once, using the IDF data built above; the result is
# passed to preprocessing.get_feats when the feature rows are computed.
body_info = preprocessing.process_bodies(train_bodies, idf)

In [None]:
# Names of the features fed to the models, generated as <measure>_<kind><scope>.
#   scope ''     -> whole body
#   scope '_fst' -> first sentence
#   scope '_sig' -> significant sentence
shared_kinds = ('nouns', 'verbs', 'bigrams', 'trigrams', 'tokens')
cos_kinds = ('nouns', 'bigrams', 'trigrams', 'tokens')

feats_list = (
    # overlap counts: whole body, then first sentence, then significant sentence
    ['shared_{}{}'.format(kind, scope)
     for scope in ('', '_fst', '_sig')
     for kind in shared_kinds]
    # cosine similarities: significant sentence first, then first sentence
    + ['cos_{}{}'.format(kind, scope)
       for scope in ('_sig', '_fst')
       for kind in cos_kinds]
)

# Sentiment features are currently switched off. The disabled names are
#   sentiment_pos, sentiment_neg, sentiment_neu, sentiment_compound
# plus the same four with the _fst and _sig suffixes.

In [None]:
# this one takes a while also!
# Compute one feature record per stance row, then lay the features named in
# feats_list out as one DataFrame column each (column order follows feats_list).

#train data
data_feats = [preprocessing.get_feats(row, body_info) for row in stances_tr.values]
train_df = pd.DataFrame(
    {name: [record[name] for record in data_feats] for name in feats_list}
)

#val data
val_feats = [preprocessing.get_feats(row, body_info) for row in stances_val.values]
val_df = pd.DataFrame(
    {name: [record[name] for record in val_feats] for name in feats_list}
)

In [None]:
# Preview the first rows of the training feature frame.
train_df.head()

In [None]:
# Binary relevance target: 0 for the "unrelated" stance, 1 for every other stance.
# A plain list is assigned so the values are placed by position, not by index.
for frame, stance_rows in ((train_df, stances_tr), (val_df, stances_val)):
    frame['label'] = [int(stance != "unrelated") for stance in stance_rows['Stance']]
train_df.head()

In [None]:
# Standardise every feature column to zero mean / unit variance.
#
# The scaler is fitted on the TRAINING split only and the very same statistics
# are then applied to the validation split. Calling fit_transform on the
# validation data (as was done before) rescales it with its own mean/variance,
# so the two splits end up on different scales and validation information leaks
# into preprocessing.
#
# Fitting all columns at once also leaves `scaler` reusable afterwards, e.g. to
# transform a held-out test set with the same feature columns.
scaler = StandardScaler()

train_df[feats_list] = scaler.fit_transform(train_df[feats_list])
val_df[feats_list] = scaler.transform(val_df[feats_list])
train_df.head()

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.ensemble import RandomForestClassifier

# Fixed seed: the random forest draws bootstrap samples and feature subsets at
# random, so without it every run gives slightly different scores.
RANDOM_SEED = 0

# Select inputs and target by name rather than by position, so a missing or
# re-ordered 'label' column fails loudly instead of silently training on the
# wrong column.
X_train = train_df[feats_list]
y_train = train_df['label']

#adjust params as you see fit
model = RandomForestClassifier(
    n_estimators = 250,
    min_samples_split = 10,
    min_samples_leaf = 5,
    max_depth = 10,
    random_state = RANDOM_SEED,
)
model2 = LogisticRegression()
# Lasso(alpha=0.01)
model.fit(X_train, y_train)
model2.fit(X_train, y_train)

In [None]:
# Training and validation accuracy, first for the random forest (model) and
# then for the logistic regression (model2). Features are all columns but the
# last; the last column is the label.
X_tr, y_tr = train_df.iloc[:,:-1], train_df.iloc[:,-1].values.reshape(-1)
X_val, y_val = val_df.iloc[:,:-1], val_df.iloc[:,-1].values.reshape(-1)

for clf in (model, model2):
    tr_acc = clf.score(X_tr, y_tr)
    print('{0:.2f}% training accuracy'.format(tr_acc*100))
    val_acc = clf.score(X_val, y_val)
    print('{0:.2f}% validation accuracy'.format(val_acc*100))

In [None]:
# get coefficients - logistic
# Pair each feature name with its weight in the fitted logistic regression.
list(zip(feats_list, model2.coef_[0]))

#coefficients - lasso
#model.coef_

In [None]:
# #usage example for json dump
# Passes the fitted random forest, the names of every train_df column except the
# last one (the label), and the target file name to utils.rf_json_dump
# (presumably this writes the forest to test_rf_dump.json - confirm in utils).
utils.rf_json_dump(model, list(train_df.iloc[:,:-1]), "test_rf_dump.json")

In [None]:
# Confusion matrix of the random forest on the validation split.
prediction = model.predict(val_df.iloc[:,:-1])
true_label = val_df.iloc[:,-1]

matrix = confusion_matrix(true_label, prediction)
print('confusion matrix: \n{}\n'.format(matrix))

# For the binary case the matrix rows are [[tn, fp], [fn, tp]].
(tn1, fp1), (fn1, tp1) = matrix

In [None]:
#use FNC scorer to generate score report
# The model only separates related from unrelated, so every "related"
# prediction (class 1) is reported to the scorer as "discuss".
label_prediction = np.where(prediction == 1, "discuss", "unrelated").tolist()
label_actual = stances_val['Stance']
score.report_score(label_actual, label_prediction)

In [11]:
# Run the project's embedding extraction step for the "glove.6B.50d" vectors
# (presumably has to happen once before get_glove_dict can load them - TODO confirm).
preprocessing.extract_word_embeddings("glove.6B.50d")

In [12]:
# Load the "glove.6B.50d" embeddings; the result is indexed by word in the next cell.
glove = preprocessing.get_glove_dict("glove.6B.50d")

In [53]:
# Cosine similarity between the GloVe vectors of two opposite words. The recorded
# result (~0.77) shows that antonyms still sit close together in embedding space.
preprocessing.cosine_similarity(glove['agree'], glove['disagree'])

0.7695979204587756

In [49]:
# Sentiment probe for a single stance-like word; the recorded output rates
# "confirm" as fully neutral.
preprocessing.get_sentiment("confirm")

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

In [50]:
# Sentiment probe; the recorded output rates "deny" as negative.
preprocessing.get_sentiment("deny")

{'compound': -0.34, 'neg': 1.0, 'neu': 0.0, 'pos': 0.0}

In [51]:
# Sentiment probe; the recorded output rates "agree" as positive.
preprocessing.get_sentiment("agree")

{'compound': 0.3612, 'neg': 0.0, 'neu': 0.0, 'pos': 1.0}

In [52]:
# Sentiment probe; the recorded output rates "disagree" as negative.
preprocessing.get_sentiment("disagree")

{'compound': -0.3818, 'neg': 1.0, 'neu': 0.0, 'pos': 0.0}