In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs (presumably so edits to the secScraper package are
# picked up without restarting the kernel — confirm).
%load_ext autoreload
%autoreload 2

In [2]:
from secScraper import *

[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from __future__ import division
import string
import math
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
# Fetch the NLTK resources by name; per the captured output these calls are
# no-ops when the packages are already up to date.
# 'stopwords' backs the stopwords.words('english') call further down this cell.
nltk.download('stopwords')
# NOTE(review): presumably required by word_tokenize — confirm.
nltk.download('punkt')
# NOTE(review): presumably required by WordNetLemmatizer — confirm.
nltk.download('wordnet')

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Module-level NLTK helpers: a WordNet lemmatizer instance and the English
# stop-word list materialised as a set.
# NOTE(review): neither name is referenced by the cells visible in this
# notebook — confirm whether they are still needed.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Sample corpus: seven short English strings that the comparison cells
# below score pairwise with each similarity metric.
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin was found to be riding a horse, again, without a shirt on while hunting deer. Vladimir Putin always seems so serious about things - even riding horses."

# Ordered list of the corpus; a document's position here is its row/column
# index in the comparison matrices.
all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]

In [5]:
def jaccard_similarity(query, document):
    """Return the Jaccard index of two strings over their lower-cased tokens.

    Each string is lower-cased and split on single space characters. The
    score is the number of distinct tokens common to both strings divided by
    the number of distinct tokens found in either one.
    """
    query_tokens = set(query.lower().split(" "))
    document_tokens = set(document.lower().split(" "))
    shared = query_tokens & document_tokens
    combined = query_tokens | document_tokens
    return len(shared) / len(combined)

In [6]:
def term_frequency(term, tokenized_document):
    """Raw term frequency: how many times `term` occurs in `tokenized_document`."""
    occurrences = tokenized_document.count(term)
    return occurrences
 
def sublinear_term_frequency(term, tokenized_document):
    """Log-damped term frequency.

    Returns 0 when `term` does not occur in `tokenized_document`, otherwise
    1 + ln(count), where count is the number of occurrences of `term`.
    """
    occurrences = tokenized_document.count(term)
    return 1 + math.log(occurrences) if occurrences else 0
 
def augmented_term_frequency(term, tokenized_document):
    """Augmented term frequency: 0.5 + 0.5 * count(term) / max_count.

    `max_count` is the number of occurrences of the most frequent token in
    `tokenized_document`, so the result lies in [0.5, 1.0].

    Args:
        term: The token to score.
        tokenized_document: Sequence of hashable tokens (e.g. a list of str).

    Returns:
        The augmented frequency as a float. For an empty document there is
        no most-frequent token, so the baseline 0.5 is returned — the same
        value a term that is absent from a non-empty document receives.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # Tally every token in a single pass instead of calling .count() once
    # per token, which was quadratic in the document length.
    counts = Counter(tokenized_document)
    if not counts:
        return 0.5
    max_count = max(counts.values())
    # Counter returns 0 for a term that never occurs.
    return 0.5 + (0.5 * counts[term]) / max_count
 
def inverse_document_frequencies(tokenized_documents):
    """Compute an IDF weight for every token found in the corpus.

    The weight of a token is 1 + ln(N / df), where N is the number of
    documents and df is the number of documents containing the token.

    Args:
        tokenized_documents: List of documents, each a list of hashable
            tokens.

    Returns:
        Dict mapping each distinct token to its IDF weight. An empty corpus
        yields an empty dict.
    """
    # Build each document's vocabulary once so the membership tests below
    # are constant-time instead of a linear scan of the token list.
    document_vocabularies = [set(doc) for doc in tokenized_documents]
    document_count = len(tokenized_documents)
    all_tokens_set = set(item for sublist in tokenized_documents for item in sublist)

    idf_values = {}
    for tkn in all_tokens_set:
        # Every token comes from at least one document, so this count is
        # never zero and the division below is safe.
        containing = sum(1 for vocabulary in document_vocabularies if tkn in vocabulary)
        idf_values[tkn] = 1 + math.log(document_count / containing)
    return idf_values
 
def his_tfidf(str1, str2):
    """Cosine similarity of two strings under a hand-rolled TF-IDF weighting.

    The two strings are lower-cased, split on single spaces and treated as a
    two-document corpus: IDF weights come from
    `inverse_document_frequencies` on that pair, term frequencies from
    `sublinear_term_frequency`. The resulting vectors (one component per
    vocabulary token, in the same order for both documents) are compared
    with `cosine_similarity`.
    """
    tokenized_documents = [text.lower().split(" ") for text in (str1, str2)]
    idf = inverse_document_frequencies(tokenized_documents)

    # One TF-IDF vector per document, iterating the IDF table in a fixed order.
    tfidf_documents = [
        [sublinear_term_frequency(term, tokens) * weight for term, weight in idf.items()]
        for tokens in tokenized_documents
    ]
    return cosine_similarity(*tfidf_documents)

In [7]:
def cosine_similarity(vector1, vector2):
    """Cosine of the angle between two equal-length numeric vectors.

    Returns 0 when either vector has zero Euclidean norm, so the ratio is
    never evaluated with a zero denominator.
    """
    dot_product = sum(p * q for p, q in zip(vector1, vector2))
    norm1 = math.sqrt(sum([component**2 for component in vector1]))
    norm2 = math.sqrt(sum([component**2 for component in vector2]))
    magnitude = norm1 * norm2
    if magnitude:
        return dot_product / magnitude
    return 0

def sk_cosine_tf(str1, str2):
    """Cosine similarity of two strings using scikit-learn term-frequency vectors.

    A TfidfVectorizer with IDF weighting switched off and L2 normalisation is
    fitted on just the two strings; the two dense rows it produces are then
    compared with `cosine_similarity`.
    """
    vectorizer = TfidfVectorizer(use_idf=False, norm='l2')
    term_matrix = vectorizer.fit_transform([str1, str2])
    first_row, second_row = term_matrix.toarray()
    return cosine_similarity(first_row, second_row)

def sk_cosine_tf_idf(str1, str2):
    """Cosine similarity of two strings using scikit-learn TF-IDF vectors.

    A TfidfVectorizer with IDF weighting, sublinear term-frequency scaling
    and L2 normalisation is fitted on just the two strings; the two dense
    rows it produces are then compared with `cosine_similarity`.
    """
    vectorizer = TfidfVectorizer(use_idf=True, sublinear_tf=True, norm='l2')
    weighted_matrix = vectorizer.fit_transform([str1, str2])
    first_row, second_row = weighted_matrix.toarray()
    return cosine_similarity(first_row, second_row)

In [8]:
# Jaccard
my_jaccard = np.zeros((7, 7))
his_jaccard = np.zeros((7, 7))

# Cosine tf
my_cosine_tf = np.zeros((7, 7))
his_cosine_tf = np.zeros((7, 7))
sklearn_cosine_tf = np.zeros((7, 7))

# Cosine tf-idf
my_cosine_tf_idf = np.zeros((7, 7))
his_cosine_tf_idf = np.zeros((7, 7))
sklearn_cosine_tf_idf = np.zeros((7, 7))

# Min edits
my_minEdit = np.zeros((7, 7))
gfg_editDistDP = np.zeros((7, 7))

# simpleEdit
my_simpleEdit = np.zeros((7, 7))

In [12]:
"""COMPARISONS"""
# Value passed as the third argument to the secScraper cosine metrics.
sw = None

# Normalise each document once up front rather than re-normalising both
# documents on every inner-loop iteration (N calls instead of 2 * N**2).
# NOTE(review): assumes processing.normalize_text is deterministic and that
# the metrics below do not mutate their inputs — confirm.
normalized_documents = [processing.normalize_text(doc) for doc in all_documents]

# Fill entry (idx1, idx2) of every result matrix for each ordered pair of
# documents. `doc_1`/`doc_2` are the raw strings, `doc1`/`doc2` their
# normalised forms.
for idx1, doc_1 in enumerate(all_documents):
    doc1 = normalized_documents[idx1]
    for idx2, doc_2 in enumerate(all_documents):
        doc2 = normalized_documents[idx2]

        my_jaccard[idx1, idx2] = metrics.diff_jaccard(doc1, doc2)
        his_jaccard[idx1, idx2] = jaccard_similarity(doc_1, doc_2)

        my_cosine_tf[idx1, idx2] = metrics.diff_sk_cosine_tf(doc_1, doc_2, sw)
        # Placeholder: there is no reference TF implementation, so this
        # matrix stays all zeros and any correlation against it is nan.
        his_cosine_tf[idx1, idx2] = 0
        sklearn_cosine_tf[idx1, idx2] = sk_cosine_tf(doc_1, doc_2)

        my_cosine_tf_idf[idx1, idx2] = metrics.diff_sk_cosine_tf_idf(doc_1, doc_2, sw)
        his_cosine_tf_idf[idx1, idx2] = his_tfidf(doc_1, doc_2)
        sklearn_cosine_tf_idf[idx1, idx2] = sk_cosine_tf_idf(doc_1, doc_2)

        # Disabled: my_minEdit therefore keeps its initial zeros, which makes
        # any correlation against it nan.
        #my_minEdit[idx1, idx2] = metrics.diff_minEdit(doc1, doc2)
        gfg_editDistDP[idx1, idx2] = metrics.diff_gfg_editDistDP(doc1, doc2)

        my_simpleEdit[idx1, idx2] = metrics.diff_simple(doc1, doc2)

In [13]:
# Inter correlations - pretty much all the same
# For each metric family, print the Pearson correlation between the flattened
# result matrices of every pair of implementations.
correlation_groups = [
    ("Jaccard correlation", [
        (my_jaccard, his_jaccard),
    ]),
    ("TF correlation", [
        (my_cosine_tf, his_cosine_tf),
        (my_cosine_tf, sklearn_cosine_tf),
        (his_cosine_tf, sklearn_cosine_tf),
    ]),
    ("TF-IDF correlation", [
        (my_cosine_tf_idf, his_cosine_tf_idf),
        (my_cosine_tf_idf, sklearn_cosine_tf_idf),
        (his_cosine_tf_idf, sklearn_cosine_tf_idf),
    ]),
]

for group_index, (title, matrix_pairs) in enumerate(correlation_groups):
    if group_index:
        # Blank line between groups, none after the last one.
        print()
    print(title)
    for left, right in matrix_pairs:
        # corrcoef returns a 2x2 matrix; [0, 1] is the cross term.
        print(np.corrcoef(left.ravel(), right.ravel())[0, 1])

Jaccard correlation
0.9977940992259946

TF correlation
nan
0.9999999999999998
nan

TF-IDF correlation
0.9917110417585476
0.9996895393745222
0.9942233407477253


  c /= stddev[:, None]
  c /= stddev[None, :]


In [11]:
# Cross correlations
# Pearson correlation of the flattened Jaccard matrix against each of the
# other metric matrices, printed one per line in this order.
for other_metric in (my_cosine_tf, my_cosine_tf_idf, my_minEdit, gfg_editDistDP, my_simpleEdit):
    print(np.corrcoef(my_jaccard.ravel(), other_metric.ravel())[0, 1])


0.9701424599362335
0.9936895911613337
nan
0.9555059103135598
0.9911887436381733


In [12]:
# Repetition factor: both normalised documents are repeated n times with `*`
# to make the timing inputs longer.
n = 100
# Normalise two corpus documents with stop-word removal and lemmatisation
# switched on (the comparison loop above calls normalize_text without these
# keyword arguments).
doc1 = processing.normalize_text(document_1, rm_stop_words=True, lemmatize=True)
doc2 = processing.normalize_text(document_2, rm_stop_words=True, lemmatize=True)
# Time the edit-distance metric on the lengthened inputs.
%timeit metrics.diff_gfg_editDistDP(doc1*n, doc2*n)

1.01 s ± 15.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
