# Keyword modeling with Word2Vec Neutral Network

**Potential Applications:** TODO
https://stackoverflow.com/questions/8897593/similarity-between-two-text-documents

In [3]:
import gensim
import pandas as pd
import json

In [4]:
import pandas as pd
import json

articles = pd.read_pickle('ArticleMetadata.pkl')
articles.DatePublished = pd.to_datetime(articles.DatePublished)
articles.Tags = articles.Tags.map(lambda x: str(x))
articles.TagArray = articles.Tags.map(lambda x: x.split(','))
articles.TagArray[0]
articles.head(1)

Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,2015-09-02 10:56:24,"""The show is a vibrant look at the early PC in...","""AMC's Halt and Catch Fire is a brilliant achi..."


In [5]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def StripHtml(html):
    return strip_tags(html)

print(StripHtml('<b>hello</b>'))
print(type(StripHtml('<b>hello</b>')))

hello
<class 'str'>


In [6]:
articles.fillna('',inplace=True)
articles.reset_index(inplace=True)

In [7]:
# prepare new field for LDA:    

articles["RawText"] = articles.FullText.map(lambda x: StripHtml(x))
articles["RawText"] = articles.RawText + ' ' +  articles.Title + ' ' + articles.Tags + ' ' + articles.Abstract  + ' ' + articles.Topic 
articles["RawText"].head()
articles.shape

(13835, 9)

In [8]:
articles.dropna(subset=['RawText'],inplace=True)
articles.shape

(13835, 9)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english',min_df=3).fit_transform(articles.RawText.dropna())
# no need to normalize, since Vectorizer will return normalized tf-idf
pairwise_similarity = tfidf * tfidf.T

In [8]:
# a way of comparing 2 articles
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]

cosine_sim('a little bird', 'a little bird chirps')
cosine_sim('a little bird', 'a little dog barks')
cosine_sim(articles.ix[10000].RawText,articles.ix[10634].RawText)

0.26055567105626237

In [10]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
print(cosine_similarities)
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
print(related_docs_indices)

[ 1.          0.02176245  0.01683795 ...,  0.01550311  0.01973996
  0.01401568]
[   0 4168 3370 3115]


In [11]:
cosine_similarities = linear_kernel(tfidf[1], tfidf).flatten()
cosine_similarities
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
related_docs_indices

array([12975,     1, 13144, 11421])

In [25]:
def FindSimiliarArticles(url, tfidf_matrix, articles):
    matches = articles.loc[articles['Url']  == url].index.tolist()
    originalArticleIndex = int(matches[0])
    print("original index: %s" % originalArticleIndex)    
    cosine_similarities = linear_kernel(tfidf_matrix[originalArticleIndex], tfidf_matrix).flatten()
    print("cosine_similarities: %s" % cosine_similarities)    
    related_docs_indices = cosine_similarities.argsort()[:-5:-1]
    print('related articles: ' % related_docs_indices)
    related_articles = []
    [related_articles.append(articles.iloc[index]) for index in related_docs_indices]    
    return related_articles

In [26]:
related = FindSimiliarArticles('/articles/how-america-can-keep-the-entrepreneurs-we-train/',tfidf,articles)
print(related)

original index: 13795
cosine_similarities: [ 0.05772269  0.04216487  0.04971372 ...,  0.00795316  0.03272186  0.024919  ]
related articles: 
[ArticleId                                                   139417
Url              /articles/how-america-can-keep-the-entrepreneu...
Title              How America Can Keep the Entrepreneurs We Train
Tags             Entrepreneurship,Immigration,Business,Higher E...
Topic                                                        World
DatePublished                                  2016-07-15 13:00:00
Abstract         "Brilliant, talented people are trained at Ame...
FullText         "\nAmerican colleges and universities attract ...
RawText          "\nAmerican colleges and universities attract ...
Name: 13795, dtype: object, ArticleId                                                   132805
Url              /articles/immigrants-are-twice-as-likely-to-st...
Title            Immigrants Are Twice as Likely to Start a Busi...
Tags             Immigrati

In [27]:
related[0]

ArticleId                                                   139417
Url              /articles/how-america-can-keep-the-entrepreneu...
Title              How America Can Keep the Entrepreneurs We Train
Tags             Entrepreneurship,Immigration,Business,Higher E...
Topic                                                        World
DatePublished                                  2016-07-15 13:00:00
Abstract         "Brilliant, talented people are trained at Ame...
FullText         "\nAmerican colleges and universities attract ...
RawText          "\nAmerican colleges and universities attract ...
Name: 13795, dtype: object