In [None]:
# DO NOT RUN THIS AGAIN
#!pip install -U textblob
#!python -m textblob.download_corpora

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import tree
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from textblob import Blobber
from textblob import TextBlob
from textblob.taggers import NLTKTagger
from textblob.wordnet import Synset
from textblob import Word
from textblob.wordnet import NOUN
from textblob.wordnet import VERB
#from textblob.wordnet import ADJ
#from textblob.wordnet import ADVERB

In [None]:
# Load the data into a pandas dataframe and split it up
SPLIT_SEED = 0  # fixed so the 80/20 split is identical on every Restart & Run All

df = pd.read_csv("train.csv")

# 80% of the rows are sampled for training; the rows left over form the test set
df_train = df.sample(frac=0.8, random_state=SPLIT_SEED)
df_test = df.drop(df_train.index)

# The last column is used as the target; all other columns stay on the X side
y_train = df_train.iloc[:, [-1]]
X_train = df_train.drop(y_train.columns, axis=1)

y_test = df_test.iloc[:, [-1]]
X_test = df_test.drop(y_test.columns, axis=1)

## Steps
1. Take fake news articles and create TF-IDF (might want to cut duplicate titles)
2. Strip out articles & prepositions
3. See what words are most "fake newsy"

In [None]:
# # Drop duplicates to prevent headlines that appear a lot from skewing the data
# fake_news = X_train["title1_en"].drop_duplicates().tolist()

# # Gross, disgusting regex to cut stop words and other silliness
# for i in range(len(fake_news)):
#     fake_news[i] = re.sub('(\s+)(a|an|and|the|of|from|to|by|in|is|#|aaa+(\.*)*|<\s+i\s+>)(\s+)', ' ', fake_news[i])
#     fake_news[i] = re.sub('A(\s+)', '', fake_news[i])

In [None]:
# # Cram this stuff into a vectorizer
# vectorizer = TfidfVectorizer()
# fake_news_tfidf = vectorizer.fit_transform(fake_news)
# words = vectorizer.get_feature_names()

In [None]:
# # We only need the first document, turn it into a dataframe so we can have a look
# first_doc_vector = fake_news_tfidf[0]
# df_fake_news_tfidf = pd.DataFrame(data=first_doc_vector.T.todense(), index=words, columns=["score"])
# df_fake_news_tfidf.sort_values(by="score", ascending=False)

## Now we do something with this: Jaccard Similarity

Now that we have a sense of which words look like fake news, let's measure how closely related pairs of articles are.

In [None]:
# Lower-case stop words and markup residue, removed only when they sit between
# whitespace. The trailing whitespace is a lookahead (not consumed) so that
# back-to-back stop words such as "of the" are both removed.
_STOP_WORD_RE = re.compile(r'\s+(?:a|an|and|the|of|from|to|by|in|is|#|aaa+\.*|<\s+i\s+>)(?=\s)')
# Capitalised articles; \b stops the pattern from eating the tail of words like "NASA".
_CAPITAL_ARTICLE_RE = re.compile(r'\b(?:A|The)\s+')
# Anything that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def _title_tokens(title):
    """Return the set of whitespace-separated tokens left after cleaning a title."""
    cleaned = _STOP_WORD_RE.sub(' ', title)
    cleaned = _CAPITAL_ARTICLE_RE.sub('', cleaned)
    cleaned = _PUNCTUATION_RE.sub('', cleaned)
    return set(cleaned.split())


def jaccard(string1, string2):
    """Jaccard similarity between the cleaned token sets of two titles.

    Both titles go through the same cleaning: stop words are dropped, the
    capitalised articles "A" / "The" are dropped, punctuation is stripped and
    the remainder is split on whitespace. Matching is case-sensitive. A stop
    word is only removed when it has whitespace on both sides, so one at the
    very start or end of a title is kept.

    Parameters
    ----------
    string1, string2 : str
        The two titles to compare.

    Returns
    -------
    float
        |intersection| / |union| of the two token sets, or 0.0 when both
        titles clean down to nothing (instead of raising ZeroDivisionError).
    """
    tokens1 = _title_tokens(string1)
    tokens2 = _title_tokens(string2)

    union = tokens1 | tokens2
    if not union:
        # Nothing to compare (e.g. two empty titles)
        return 0.0

    return float(len(tokens1 & tokens2)) / len(union)

In [None]:
def jaccard_sim(list1, list2):
    """Jaccard similarity of two collections, compared as sets.

    Duplicates inside either collection are ignored. Returns 0 when both
    collections are empty, otherwise |intersection| / |union| as a float.
    """
    first, second = set(list1), set(list2)
    combined = first | second

    # Guard clause: an empty union would mean dividing by zero
    if not combined:
        return 0

    shared = first & second
    return float(len(shared)) / len(combined)

Extract Jaccard score as a feature.

In [None]:
# total = []

# for index, x in X_train.iterrows():
#     total.append(jaccard(x["title1_en"], x["title2_en"]))

# X_train["jaccard"] = total

# X_objective = X_train[["id", "jaccard"]]

In [None]:
# clf = tree.DecisionTreeClassifier(max_depth=1)
# clf.fit(X_objective, y_train)

In [None]:
# y_train.value_counts() # So we can label things correctly

In [None]:
# listo = []

# for index, x in X_test.iterrows():
#     listo.append(jaccard(x["title1_en"], x["title2_en"]))
    
# X_test["jaccard"] = listo

# X_test_o = X_test[["id", "jaccard"]]

# y_pred = clf.predict(X_test_o)

# print(metrics.classification_report(y_test, y_pred, target_names=["agreed", "disagreed", "unrelated"]))

In [None]:
#tree.plot_tree(clf)

## On to the next thing

Title similarity alone gets us to 75% accuracy, but there is room to improve, and doing so will require more in-depth analysis. Notably, we don't detect *any* disagreed articles, and our detection of agreed articles is still weak.

## Let's diagram the sentences and give them a score

The idea is as follows:
1. Tokenize & diagram headline 1 and headline 2
2. Give each a score based on noun, verb, adj/adv similarity (normalized by length of sentence)
3. See where this takes us

In [None]:
# Order of the scores returned by score_pos_sims.
_POS_BUCKETS = ("pnoun", "noun", "verb", "adj", "adv")

# POS tag -> bucket the token is scored under. Tags not listed here are ignored.
# NOTE(review): plain "JJ" is absent from the adjective group while "CD"
# (cardinal number) is present; confirm this is intended.
_TAG_TO_BUCKET = {
    "NNP": "pnoun", "NNPS": "pnoun",
    "NN": "noun", "NNS": "noun", "PRP": "noun", "PRP$": "noun",
    "VB": "verb", "VBP": "verb", "VBZ": "verb", "VBD": "verb", "VBN": "verb", "VBG": "verb",
    "CD": "adj", "JJS": "adj", "JJR": "adj",
    "RBS": "adv", "RBR": "adv", "RB": "adv",
}

# Buckets whose tokens are lemmatized, with the POS code passed to Word.lemmatize.
_LEMMA_POS = {"noun": "n", "verb": "v"}


def _bucket_words_by_pos(tags):
    """Group (token, tag) pairs into lower-cased word lists, one list per bucket."""
    buckets = {name: [] for name in _POS_BUCKETS}
    for token, tag in tags:
        bucket = _TAG_TO_BUCKET.get(tag)
        if bucket is None:
            continue
        # Lower-case before lemmatizing so capitalised headline words are
        # looked up in their lower-case form.
        word = token.lower()
        lemma_pos = _LEMMA_POS.get(bucket)
        if lemma_pos is not None:
            # The lemma itself is stored (the verb lemma used to be computed
            # and then thrown away in favour of the raw token).
            word = Word(word).lemmatize(lemma_pos)
        buckets[bucket].append(word)
    return buckets


def score_pos_sims(string1, string2):
    """Score how similar two headlines are, one part of speech at a time.

    Both strings are POS-tagged, their tokens are grouped into proper nouns,
    nouns (including pronouns), verbs, adjectives and adverbs, nouns and verbs
    are lemmatized, and each group is compared with jaccard_sim.

    Parameters
    ----------
    string1, string2 : str
        The two headlines to compare.

    Returns
    -------
    list
        Five Jaccard scores in the order
        [proper nouns, nouns, verbs, adjectives, adverbs].
    """
    tagger = Blobber(pos_tagger=NLTKTagger())

    words1 = _bucket_words_by_pos(tagger(string1).tags)
    words2 = _bucket_words_by_pos(tagger(string2).tags)

    return [jaccard_sim(words1[name], words2[name]) for name in _POS_BUCKETS]

In [None]:
# One list of scores per new feature column, filled row by row for the training pairs
pnoun_sim, noun_sim, verb_sim, adj_sim, adv_sim, tot_sim = [], [], [], [], [], []
pos_score_lists = (pnoun_sim, noun_sim, verb_sim, adj_sim, adv_sim)

for row_label, pair in X_train.iterrows():
    first_title = pair["title1_en"]
    second_title = pair["title2_en"]

    # score_pos_sims returns [pnoun, noun, verb, adj, adv], the same order as pos_score_lists
    for score_list, score in zip(pos_score_lists, score_pos_sims(first_title, second_title)):
        score_list.append(score)

    # Whole-title Jaccard score, kept alongside the per-POS scores
    tot_sim.append(jaccard(first_title, second_title))

# Attach the collected scores to X_train as new columns
train_feature_columns = {
    "pnoun_sim": pnoun_sim,
    "noun_sim": noun_sim,
    "verb_sim": verb_sim,
    "adj_sim": adj_sim,
    "adv_sim": adv_sim,
    "tot_sim": tot_sim,
}
for column_name, scores in train_feature_columns.items():
    X_train[column_name] = scores

In [None]:
# Models that can be swapped in on these features (currently disabled):
# a Naive Bayes ...
#mnb = MultinomialNB()
# ... or a Decision Tree, whichever does better
#clf = tree.DecisionTreeClassifier(max_depth=5)

# Drop the id and raw-title columns to get the model input.
# DataFrame.drop returns a new frame, so X_train itself is left untouched.
non_feature_columns = ["id", "tid1", "tid2", "title1_en", "title2_en"]
X_train_objs = X_train.drop(columns=non_feature_columns)

#clf.fit(X_train_objs, y_train)

#mnb.fit(X_train_objs, y_train["label"])

In [None]:
# One list of scores per new feature column, filled row by row for the test pairs
pnoun_sim, noun_sim, verb_sim, adj_sim, adv_sim, tot_sim = [], [], [], [], [], []
pos_score_lists = (pnoun_sim, noun_sim, verb_sim, adj_sim, adv_sim)

for row_label, pair in X_test.iterrows():
    first_title = pair["title1_en"]
    second_title = pair["title2_en"]

    # score_pos_sims returns [pnoun, noun, verb, adj, adv], the same order as pos_score_lists
    for score_list, score in zip(pos_score_lists, score_pos_sims(first_title, second_title)):
        score_list.append(score)

    # Whole-title Jaccard score, kept alongside the per-POS scores
    tot_sim.append(jaccard(first_title, second_title))

# Attach the collected scores to X_test as new columns
test_feature_columns = {
    "pnoun_sim": pnoun_sim,
    "noun_sim": noun_sim,
    "verb_sim": verb_sim,
    "adj_sim": adj_sim,
    "adv_sim": adv_sim,
    "tot_sim": tot_sim,
}
for column_name, scores in test_feature_columns.items():
    X_test[column_name] = scores

In [None]:
# Drop the id and raw-title columns to get the model input for the test set.
# DataFrame.drop returns a new frame, so X_test itself is left untouched.
non_feature_columns = ["id", "tid1", "tid2", "title1_en", "title2_en"]
X_test_o = X_test.drop(columns=non_feature_columns)

In [None]:
# One-vs-rest wrapper around a depth-limited, seeded random forest
forest = RandomForestClassifier(max_depth=5, random_state=0)
ovr_model = OneVsRestClassifier(forest)

# Fit on the training inputs, then predict labels for the test inputs
ovr_model.fit(X_train_objs, y_train)
y_pred = ovr_model.predict(X_test_o)

# Precision / recall / F1 per class on the test set
report = metrics.classification_report(
    y_test,
    y_pred,
    target_names=["agreed", "disagreed", "unrelated"],
)
print(report)

NameError: ignored