# Finding differences in similar news using doc2vec

In [11]:

import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')



import time
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from operator import itemgetter
from sklearn.neighbors import NearestNeighbors
import numpy as np
import multiprocessing

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

models_folder = 'models/'


In [None]:
cores = multiprocessing.cpu_count()
print('Number of cores=', cores)

cores_to_use = cores - 1


In [13]:
news_folder = 'data/'
news_file = 'articles1.csv'

news_df = pd.read_csv(news_folder+news_file)
news_df = news_df['content']
print(news_df.head(4))

0    WASHINGTON  —   Congressional Republicans have...
1    After the bullet shells get counted, the blood...
2    When Walt Disney’s “Bambi” opened in 1942, cri...
3    Death may be the great equalizer, but it isn’t...
Name: content, dtype: object


In [14]:
print('Total documents: %d' % news_df.shape[0])

Total documents: 50000


In [15]:
number_of_news_documents = 50000
short_news_df = news_df[0:number_of_news_documents]
print(short_news_df.shape)
print(short_news_df.head(6))

(50000,)
0    WASHINGTON  —   Congressional Republicans have...
1    After the bullet shells get counted, the blood...
2    When Walt Disney’s “Bambi” opened in 1942, cri...
3    Death may be the great equalizer, but it isn’t...
4    SEOUL, South Korea  —   North Korea’s leader, ...
5    LONDON  —   Queen Elizabeth II, who has been b...
Name: content, dtype: object


In [16]:
del news_df
duplicate = short_news_df[0]
duplicate_half = int(len(duplicate)/2)+1
duplicate = duplicate[duplicate_half:]

# Append the "duplicate" news to the end
short_news_df = short_news_df.append(pd.Series(duplicate), 
                                     ignore_index=True)

#print(short_news_df[5])

In [18]:
docs_folder = 'documents/'
texts = ['msn.txt', 'bloomberg.txt']

for text in texts:
    f = open(docs_folder+text, 'r')
    lines = [line for line in f.readlines() if line.strip()]
    # remove the first 2 (agency, title)
    f.close()
    lines = lines[2:]
    lines = ' '.join(lines)
    print(lines[:100])
    
    # Add the text as a new entry in the news_df
    short_news_df = short_news_df.append(pd.Series(lines), 
                                     ignore_index=True)
print(short_news_df.shape)

Supreme Court nominee Brett Kavanaugh angrily and "unequivocally" denied sexually assaulting Christi
Supreme Court nominee Brett Kavanaugh angrily, tearfully and "unequivocally" denied sexually assault
(50005,)


In [19]:
total_documents = short_news_df.shape[0]
print('The half document has the index', total_documents-3)
print('The msn article has the index', total_documents-2)
print('The bloomberg article has the index', total_documents-1)

The half document has the index 50002
The msn article has the index 50003
The bloomberg article has the index 50004


In [23]:
sentences_per_doc = {}
text_sentences_per_doc = {}

# convert series to dict
data = short_news_df.to_dict()

# Traverse each document and count how many sentences it has
# put in a dictionary the {document_id: num_of_sentences}
total_sentences = 0
for k,v in data.items():
    sentences = sent_tokenize(v, language='english')
    #print(k, len(sentences))
    sentences_per_doc[k]=len(sentences)
    text_sentences_per_doc[k] = sentences
    total_sentences = total_sentences + len(sentences)

#print('Sentences per document')
#print(sentences_per_doc)

print('Total sentences for the corpus: %d' % total_sentences)
print('Average number of sentences per document: %f' % 
      (total_sentences/number_of_news_documents))

print()
print('As an example, this document has only %d sentences' % sentences_per_doc[5])
for i, sent in enumerate(text_sentences_per_doc[5]):
    print(i+1,' ',sent)


Total sentences for the corpus: 1520898
Average number of sentences per document: 30.417960

As an example, this document has only 7 sentences
1   LONDON  —   Queen Elizabeth II, who has been battling a cold for more than a week, missed a New Year’s Day church service at her country estate in Sandringham, Buckingham Palace said on Sunday.
2   A week earlier, the queen, who is 90, missed a Christmas Day church service, for the first time since 1988, because of the illness.
3   “The Queen does not yet feel ready to attend church as she is still recuperating from a heavy cold,” the palace said in a statement.
4   The queen’s husband, Prince Philip, who had also been ill, was well enough to attend both services, in the church at Sandringham, which is in Norfolk, on the east coast of England.
5   The queen, who ascended to the throne in 1952, became the world’s   monarch following the death of King Bhumibol Adulyadej of Thailand in October.
6   She is also Britain’s   monarch, having last y

In [26]:
# Create a list with the labels for each sentence in each document
# with the form: docid_sentid


labels = []
for k,v in sentences_per_doc.items():
    initial_value = 0
    end_value = initial_value + v
    #print(k, initial_value, end_value, (v, end_value-initial_value))
    for sent_id in range(initial_value, end_value):
        #print(str(k) + '_' +str(sent_id))
        label = str(k) + '_' +str(sent_id)
        labels.append(label)
    
    initial_value = end_value

print(labels[:100])
##['0_0', '0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8',...]
print()


['0_0', '0_1', '0_2', '0_3', '0_4', '0_5', '0_6', '0_7', '0_8', '0_9', '0_10', '0_11', '0_12', '0_13', '0_14', '0_15', '0_16', '0_17', '0_18', '0_19', '0_20', '0_21', '0_22', '0_23', '0_24', '0_25', '1_0', '1_1', '1_2', '1_3', '1_4', '1_5', '1_6', '1_7', '1_8', '1_9', '1_10', '1_11', '1_12', '1_13', '1_14', '1_15', '1_16', '1_17', '1_18', '1_19', '1_20', '1_21', '1_22', '1_23', '1_24', '1_25', '1_26', '1_27', '1_28', '1_29', '1_30', '1_31', '1_32', '1_33', '1_34', '1_35', '1_36', '1_37', '1_38', '1_39', '1_40', '1_41', '1_42', '1_43', '1_44', '1_45', '1_46', '1_47', '1_48', '1_49', '1_50', '1_51', '1_52', '1_53', '1_54', '1_55', '1_56', '1_57', '1_58', '1_59', '1_60', '1_61', '1_62', '1_63', '1_64', '1_65', '1_66', '1_67', '1_68', '1_69', '1_70', '1_71', '1_72', '1_73']



In [27]:
from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')

def simple_preprocessing(sent):
    sent = sent.lower()
    sent = sent.replace("—","")
    sent = sent.replace('“', '')
    sent = sent.replace('”','')
    sent = sent.replace(":","")
    sent = sent.replace(",","")
    sent = sent.replace(".","")
    sent = sent.replace("(","")
    sent = sent.replace(")","")
    
     # remove stop words
    #sent_no_stop_words = [word for word in sent.split() if word not in STOPWORDS]
    #sent = ' '.join(sent_no_stop_words)
    return sent

def remove_stop_words(sent):
    # remove stop words
    sent_no_stop_words = [word for word in sent.split() if word not in STOPWORDS]
    sent = ' '.join(sent_no_stop_words)
    return sent


#print(text_sentences_per_doc[1])
#print(len(text_sentences_per_doc[1]))
all_sentences = []
for doc, text in text_sentences_per_doc.items():
    for sent in text:
        sent = simple_preprocessing(sent)
        sent = remove_stop_words(sent)
        all_sentences.append(sent)
#print(all_sentences[:10])

all_sentence_labels = zip(all_sentences, labels)
#x_list = list(all_sentence_labels)
#print(x_list[:5])

# Make tuples (sentence, label). 
#[TaggedDocument(words=['i', 'love', 'machine', 'learningits', 'awesome'], tags=['0']),...

# Tag the sentences 
tagged_data = [TaggedDocument(words=word_tokenize(sentence),
                             tags= [label]) for sentence, label in all_sentence_labels]

In [35]:
print("Show how tagged sentences look like")
print(tagged_data[25])
print()
print(all_sentences[25])
print()
print(short_news_df[0][-203:])

Show how tagged sentences look like
TaggedDocument(['complicated', 'set', 'dynamics', 'illustrating', 'quick', 'legal', 'victory', 'house', 'trump', 'era', 'might', 'come', 'costs', 'republicans', 'never', 'anticipated', 'took', 'obama', 'white', 'house'], ['0_25'])

complicated set dynamics illustrating quick legal victory house trump era might come costs republicans never anticipated took obama white house

It is a complicated set of dynamics illustrating how a quick legal victory for the House in the Trump era might come with costs that Republicans never anticipated when they took on the Obama White House.


In [37]:
'''
start = time.time()

max_epochs = 100
vec_size = 300
alpha = 0.025
seed = 42
context_window = 30

model = Doc2Vec(vector_size=vec_size, window=context_window, workers=cores_to_use, 
                alpha=alpha, min_alpha=0.00025, min_count=1, dm=1, seed=seed)
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save(models_folder+"d2v.model")
print("Model saved")

print(time.time() - start)
'''

'\nstart = time.time()\n\nmax_epochs = 100\nvec_size = 300\nalpha = 0.025\nseed = 42\ncontext_window = 30\n\nmodel = Doc2Vec(vector_size=vec_size, window=context_window, workers=cores_to_use, \n                alpha=alpha, min_alpha=0.00025, min_count=1, dm=1, seed=seed)\nmodel.build_vocab(tagged_data)\n\nfor epoch in range(max_epochs):\n    print(\'iteration {0}\'.format(epoch))\n    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)\n    # decrease the learning rate\n    model.alpha -= 0.0002\n    # fix the learning rate, no decay\n    model.min_alpha = model.alpha\n\nmodel.save(models_folder+"d2v.model")\nprint("Model saved")\n\nprint(time.time() - start)\n'

In [38]:
model = Doc2Vec.load(models_folder+"d2v.model")  # you can continue training with the loaded model!

In [40]:
print(model.docvecs['0_23'])


[ 1.84445035e+00  7.21494198e-01  6.03644699e-02 -1.38297930e-01
  5.91027141e-01  6.03812742e+00 -1.41148388e+00 -5.12961268e-01
 -2.42788410e+00  1.08142889e+00 -2.42075419e+00 -2.82275629e+00
  1.95072663e+00 -2.44059587e+00  1.06435668e+00 -1.75341618e+00
 -1.37170851e-01  5.03636956e-01 -2.80203128e+00  2.74401736e+00
 -5.19004427e-02 -1.53500748e+00 -1.52062607e+00  7.98701167e-01
  1.96566701e+00 -2.07759261e+00 -1.29905319e+00  7.58102119e-01
 -2.03614044e+00  6.26642704e-01 -3.06225002e-01 -5.98171532e-01
  8.70930180e-02  1.01052117e+00  6.85774267e-01 -3.04163873e-01
 -3.79774714e+00  1.17267084e+00 -2.41367638e-01 -3.29442835e+00
 -2.16527796e+00  2.75537181e+00  1.09594011e+00  2.26118398e+00
 -2.08677697e+00  2.86010098e+00 -2.54139185e+00 -3.34972739e+00
  6.14363074e-01  2.91762662e+00 -2.21987534e+00 -1.39930713e+00
  2.68269444e+00 -1.82928455e+00  1.80063570e+00 -2.25001082e-01
  2.74404502e+00 -6.63519919e-01  1.19418979e+00 -2.34047413e+00
  6.40871227e-01 -4.54237

Doc2vec uses word2vec. Do some test examples

In [41]:
word = 'obama'

print('Most similar words to', word)
print(model.most_similar(word))

print()

word = 'china'
print('Most similar words to', word)
print(model.most_similar(word))

print()

word = 'machine'
print('Most similar words to', word)
print(model.most_similar(word))

print()

word = 'good'
print('Most similar words to', word)
print(model.most_similar(word))

print()

word = 'innocent'
print('Most similar words to', word)
print(model.most_similar(word))
print()

word = 'guilty'
print('Most similar words to', word)
print(model.most_similar(word))

print()


Most similar words to obama


  after removing the cwd from sys.path.


[('trump', 0.509588897228241), ('incoming', 0.49790042638778687), ('bush', 0.4892093241214752), ('pence', 0.48554351925849915), ('netanyahu', 0.47789430618286133), ('kerry', 0.47299259901046753), ('congress', 0.46616506576538086), ('sisi', 0.4641965925693512), ('kaine', 0.45805624127388), ('ryan', 0.4576532244682312)]

Most similar words to china
[('beijing', 0.767208456993103), ('chinese', 0.761801540851593), ('japan', 0.6881505250930786), ('asia', 0.6422123908996582), ('pyongyang', 0.622802734375), ('nato', 0.6147975921630859), ('iran', 0.6020537614822388), ('india', 0.5979095101356506), ('europe', 0.5766202211380005), ('nuclear', 0.5762708783149719)]

Most similar words to machine
[('machines', 0.5711404085159302), ('computer', 0.4750906527042389), ('robot', 0.47467562556266785), ('cloud', 0.4681369364261627), ('computers', 0.4675503969192505), ('everything', 0.4641663730144501), ('stuff', 0.45496082305908203), ('camera', 0.45097923278808594), ('software', 0.4473637044429779), ('dev

  # Remove the CWD from sys.path while we load stuff.
  app.launch_new_instance()


In [46]:
sentence_to_check = '12_5'
original_doc, original_sentence = sentence_to_check.split("_")
print('From document %s, check sentence number %s:' % (original_doc, original_sentence))
sentence = text_sentences_per_doc[int(original_doc)][int(original_sentence)]
print(sentence)

similar_docs = model.docvecs.most_similar(sentence_to_check)
print()

max_similar_doc, probability = max(similar_docs, key=itemgetter(1))

similar_doc, similar_sentence = max_similar_doc.split("_")

#print(similar_doc, similar_sentence)
print('The most similar sentence is from document %s sentence number %s, with a probability of %f:' % 
                                    (similar_doc, similar_sentence, probability))

sentence = text_sentences_per_doc[int(similar_doc)][int(similar_sentence)]
print(sentence)

From document 12, check sentence number 5:
And that was always the intention of Marjorie Meriweather Post, the cereal heiress and the property’s original owner, who left    to the federal government when she died in 1973, hoping it would serve as a home for presidents.

The most similar sentence is from document 3372 sentence number 8, with a probability of 0.453919:
There’s Mad Max’s Thunderdome, where gladiators died.


In [50]:
total_documents = short_news_df.shape[0]
print('The half document has the index', total_documents-3)
print('The msn article has the index', total_documents-2)
print('The bloomberg article has the index', total_documents-1)


The half document has the index 50002
The msn article has the index 50003
The bloomberg article has the index 50004


In [49]:
sentence_to_check = '0_20'
original_doc, original_sentence = sentence_to_check.split("_")
print('From document %s, check sentence number %s:' % (original_doc, original_sentence))
sentence = text_sentences_per_doc[int(original_doc)][int(original_sentence)]
print(sentence)

similar_docs = model.docvecs.most_similar(sentence_to_check)
print()

max_similar_doc, probability = max(similar_docs, key=itemgetter(1))

similar_doc, similar_sentence = max_similar_doc.split("_")

#print(similar_doc, similar_sentence)
print('The most similar sentence is from document %s sentence number %s, with a probability of %f:' % 
                                    (similar_doc, similar_sentence, probability))

sentence = text_sentences_per_doc[int(similar_doc)][int(similar_sentence)]
print(sentence)


From document 0, check sentence number 20:
House Republicans contend that Congress never appropriated the money for the subsidies, as required by the Constitution.

The most similar sentence is from document 50000 sentence number 7, with a probability of 0.975474:
House Republicans contend that Congress never appropriated the money for the subsidies, as required by the Constitution.


In [51]:
doc_P = 50001    # 0 Complete document    <- This is read first
doc_Q = 50002    # number_of_news_documents <= ~Half of document 0 <- Should I read this later?


def similarity_between_sentences(model, vec_sent_q):
    return model.kneighbors(vec_sent_q.reshape(1, -1), return_distance=True)


def print_if_similar(doc, doc_Q, sent_q, sent, distance):
    d = 1 / (1+distance)
    if d > 0.65:
        ##print('***',text_sentences_per_doc[doc][sent])
        ##print()
        pass
    else:
        no_similars.append((doc_Q, sent_q))
        sentence = text_sentences_per_doc[doc_Q][sent_q]
        #sent_label = str(doc_Q) + "_" + str(sent_q)
        print('#' * 50)
        print('So, are new ideas in this sentence? [%d]=> %s' % (len(no_similars),sentence))
        print('#' * 50)
        print()
        
        
total_sentences_P = sentences_per_doc[doc_P]
total_sentences_Q = sentences_per_doc[doc_Q]

print('Document P has %d sentences' % total_sentences_P)
print('Document Q has %d sentences' % total_sentences_Q)
print()

# Need an array X with all the sentence vector for P
X = []

for id_sent in range(total_sentences_P):
    sent_label = str(doc_P) + "_" + str(id_sent)
    sent_vector = model.docvecs[sent_label]
    X.append(sent_vector)

X = np.array(X)
print('Size of X= ',X.shape)
knn = NearestNeighbors(n_neighbors=1, metric='cosine')
knn.fit(X)

print()
no_similars = []

print('Sentences in document Q not similar those in document P are:\n')
for sent_q in range(total_sentences_Q):
    sentence = text_sentences_per_doc[doc_Q][sent_q]
    sent_label = str(doc_Q) + "_" + str(sent_q)
    #print(sent_label, sentence)
    sent_vector = model.docvecs[sent_label]
    distance, nearest_sent = similarity_between_sentences(knn, sent_vector)
    
    print(sentence)
    print('\t\tThe most similar sentence is %d, with a similarity of %f' % (nearest_sent[0][0], 
                                                                        (1/(1+distance[0][0]))))    
    print(text_sentences_per_doc[doc_P][nearest_sent[0][0]])
    print()
    

    print_if_similar(doc_P, doc_Q, sent_q, nearest_sent[0][0], distance[0][0])
    
#print('%d sentences in document Q are different from document P' % len(no_similars))
#print('Document Q is %.2f %% similar to document P' % 
#                        ( ((total_sentences_P - len(no_similars))/total_sentences_P)*100 ))

Document P has 65 sentences
Document Q has 71 sentences

Size of X=  (65, 300)

Sentences in document Q not similar those in document P are:

Supreme Court nominee Brett Kavanaugh angrily, tearfully and "unequivocally" denied sexually assaulting Christine Blasey Ford, after she told senators at a dramatic hearing that she’s "one hundred percent" certain he is the one who attacked her when they were teenagers.
		The most similar sentence is 0, with a similarity of 0.953718
Supreme Court nominee Brett Kavanaugh angrily and "unequivocally" denied sexually assaulting Christine Blasey Ford, after she told senators at a dramatic hearing that she’s "one hundred percent" certain he is the one who attacked her when they were teenagers.

"I was not at the party described by Dr. Ford," Kavanaugh told the Senate Judiciary Committee Thursday as he tried to save his nomination in the face of public claims of sexual misconduct by three women.
		The most similar sentence is 1, with a similarity of 0.9

In [56]:
from rake_nltk import Rake

r = Rake()

#print(no_similars)

for no_sim in no_similars:
    doc, sent = no_sim
    text = text_sentences_per_doc[doc][sent]
    text = simple_preprocessing(text)
    print(text)
    
    r.extract_keywords_from_text(text)
    r.get_ranked_phrases()
    print(r.get_ranked_phrases_with_scores()[0])
    print()

the nominee was tearful through portions of his opening statement while expressing gratitude to his friends saying he had no sexual intercourse until well after high school and saying he drank beer in high school
(4.0, 'sexual intercourse')

he said his calendar for the summer of 1982 "shows all but definitely that i was not there"
(1.0, 'summer')

if an unproven allegation "is enough to destroy a person’s life and career we will have abandoned the basic principles of fairness and due process that define our legal system and our country" kavanaugh said
(4.0, 'unproven allegation')



In [57]:
from gensim.summarization import summarize
from gensim.summarization import keywords

In [58]:
for no_sim in no_similars:
    doc, sent = no_sim
    text = text_sentences_per_doc[doc][sent]
    text = simple_preprocessing(text)
    print(text)
    if keywords(text):
        print('Keywords')
        print(keywords(text))
    print()

the nominee was tearful through portions of his opening statement while expressing gratitude to his friends saying he had no sexual intercourse until well after high school and saying he drank beer in high school
Keywords
friends saying

he said his calendar for the summer of 1982 "shows all but definitely that i was not there"

if an unproven allegation "is enough to destroy a person’s life and career we will have abandoned the basic principles of fairness and due process that define our legal system and our country" kavanaugh said
Keywords
kavanaugh



In [59]:
from nltk import word_tokenize, pos_tag
#sentence = "At eight o'clock on Thursday film morning word line test best beautiful Ram Aaron design"

def extract_nouns_from_POS(sent):
    nouns = [token for token, pos in pos_tag(word_tokenize(sent)) if pos.startswith('N')]
    return nouns

for no_sim in no_similars:
    doc, sent = no_sim
    text = text_sentences_per_doc[doc][sent]
    text = simple_preprocessing(text)
    print(text)
    if extract_nouns_from_POS(text):
        print('Keywords')
        print(','.join(extract_nouns_from_POS(text)))
    print()

the nominee was tearful through portions of his opening statement while expressing gratitude to his friends saying he had no sexual intercourse until well after high school and saying he drank beer in high school
Keywords
nominee,portions,statement,gratitude,friends,intercourse,school,beer,school

he said his calendar for the summer of 1982 "shows all but definitely that i was not there"
Keywords
calendar,summer,i

if an unproven allegation "is enough to destroy a person’s life and career we will have abandoned the basic principles of fairness and due process that define our legal system and our country" kavanaugh said
Keywords
allegation,person,’,life,career,principles,fairness,process,system,country,kavanaugh



In [60]:
from nltk.chunk import RegexpParser

def extract_NN(sent):
    grammar = r"""
    NBAR:
        # Nouns and Adjectives, terminated with Nouns
        #{<NN.*>*<NN.*>}
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR>}
        # Above, connected with in/of/etc...
        {<NBAR><IN><NBAR>}
    """
    chunker = RegexpParser(grammar)
    ne = set()
    chunk = chunker.parse(pos_tag(word_tokenize(sent)))
    for tree in chunk.subtrees(filter=lambda t: t.label() == 'NP'):
        ne.add(' '.join([child[0] for child in tree.leaves()]))
    return ne

In [None]:
for no_sim in no_similars:
    doc, sent = no_sim
    text = text_sentences_per_doc[doc][sent]
    text = simple_preprocessing(text)
    print(text)
    if extract_NN(text):
        print('Keywords')
        print(extract_NN(text))
    print()

#text = 'the black dog is brave'
#print(extract_NN(text))

