#Document Similarity - K Nearest Neighbors

In [77]:
import pandas as pd
import numpy as np

from pandas import DataFrame, Series

# Visualization
import seaborn as sns

# this allows plots to appear directly in the notebook
import matplotlib.pyplot as plt
%matplotlib inline

##Load some text data - from wikipedia, pages on people

In [36]:
people_data_df = pd.read_csv('people_wiki.csv')

Data contains:  link to wikipedia article, name of person, text of article.

In [75]:
people_data_df.head(10)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
5,<http://dbpedia.org/resource/Sam_Henderson>,Sam Henderson,sam henderson born october 18 1969 is an ameri...
6,<http://dbpedia.org/resource/Aaron_LaCrate>,Aaron LaCrate,aaron lacrate is an american music producer re...
7,<http://dbpedia.org/resource/Trevor_Ferguson>,Trevor Ferguson,trevor ferguson aka john farrow born 11 novemb...
8,<http://dbpedia.org/resource/Grant_Nelson>,Grant Nelson,grant nelson born 27 april 1971 in london also...
9,<http://dbpedia.org/resource/Cathy_Caruth>,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes pro...


### Preparing a corpus

Here we will be using here is called a **tf-idf** model. In this kind of model we simplify documents to a multi-set of tf-idf scores.

In [38]:
import re, nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [39]:
def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = 'word', tokenizer=tokenize, lowercase=True, stop_words='english', max_features=200)

In [45]:
# The method fit_transform does two functions: First, it fits the model and learns the vocabulary; 
# second, it transforms corpus data into tf-idf feature vectors.
people_text_tfidfm = vectorizer.fit_transform(people_data_df.text)

In [46]:
people_text_tfidfm_nd = people_text_tfidfm.toarray()
people_text_tfidfm_nd.shape

(59071, 200)

In [70]:
# Take a look at how the Document Tf-Idf Matrix looks
people_text_tfidfm_df = pd.DataFrame(people_text_tfidfm_nd, columns=vectorizer.get_feature_names())
people_text_tfidfm_df.head(5)

Unnamed: 0,academi,age,album,american,appear,appoint,april,art,artist,assist,...,went,win,women,won,work,world,write,writer,year,york
0,0,0.057292,0.0,0.0,0,0,0,0,0,0.061127,...,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0,0,0,0,0,0.0,...,0.0,0,0,0.0,0.318631,0,0,0.0,0.0,0.0
2,0,0.0,0.135355,0.0,0,0,0,0,0,0.0,...,0.0,0,0,0.097528,0.070843,0,0,0.0,0.137813,0.0
3,0,0.0,0.0,0.059951,0,0,0,0,0,0.0,...,0.0,0,0,0.0,0.244976,0,0,0.188993,0.190623,0.076131
4,0,0.0,0.573702,0.0,0,0,0,0,0,0.0,...,0.088211,0,0,0.0,0.0,0,0,0.0,0.097354,0.0


In [72]:
len(people_data_df)

59071

In [73]:
len(people_text_tfidfm_df)

59071

In [88]:
tf_idf = list()
for i in range(0,len(people_data_df)):
    tf_idf_dict = people_text_tfidfm_df.iloc[i].to_dict()
    tf_idf_dict = {key: value for key, value in word_count_dict.items() if value != 0}
    #print(d)
    tf_idf.append(word_count_dict)

In [95]:
people_data_df['tf-idf'] = Series(tf_idf)
people_data_df.head(10)

Unnamed: 0,URI,name,text,tf-idf
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
5,<http://dbpedia.org/resource/Sam_Henderson>,Sam Henderson,sam henderson born october 18 1969 is an ameri...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
6,<http://dbpedia.org/resource/Aaron_LaCrate>,Aaron LaCrate,aaron lacrate is an american music producer re...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
7,<http://dbpedia.org/resource/Trevor_Ferguson>,Trevor Ferguson,trevor ferguson aka john farrow born 11 novemb...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
8,<http://dbpedia.org/resource/Grant_Nelson>,Grant Nelson,grant nelson born 27 april 1971 in london also...,"{'committe': 0.155187364651, 'unit': 0.2286124..."
9,<http://dbpedia.org/resource/Cathy_Caruth>,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes pro...,"{'committe': 0.155187364651, 'unit': 0.2286124..."


In [58]:
from sklearn.neighbors import NearestNeighbors
import sklearn.metrics.pairwise as smp
# Create and train the Logistic NearestNeighbors Classifier

# euclidean distance metrics
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(people_text_tfidfm)

# cosine similarity does not work with sparse matrix, so use people_text_tfidfm.toarray()
#neigh = NearestNeighbors(n_neighbors=5, metric=smp.cosine_similarity)
#neigh.fit(people_text_tfidfm.toarray())

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_neighbors=5, p=2, radius=1.0)


###Exploring the entry for president Obama

In [59]:
obama = people_data_df[people_data_df['name'] == 'Barack Obama']
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

In [60]:
obama_text_tfidfm = people_text_tfidfm.getrow(obama.index.get_values()[0])

In [61]:
# euclidean distance metrics
closer_to_obama = neigh.kneighbors(obama_text_tfidfm, 5, return_distance=False)

# cosine similarity or ball tree does not woark with sparse data, so use obama_text_tfidfm.toarray()
#closer_to_obama = neigh.kneighbors(obama_text_tfidfm.toarray(), 5, return_distance=False) 

In [62]:
# people who are close to Barack Obama
people_data_df.iloc[closer_to_obama[0]]

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...
16212,<http://dbpedia.org/resource/Charles_Levin_(ju...,Charles Levin (judge),charles leonard levin born april 28 1926 in de...
58309,<http://dbpedia.org/resource/Mitch_Daniels>,Mitch Daniels,mitchell elias mitch daniels jr born april 7 1...
15199,<http://dbpedia.org/resource/Dan_Quayle>,Dan Quayle,james danforth dan quayle kwel born february 4...
45586,<http://dbpedia.org/resource/Robbie_Wills>,Robbie Wills,former state representative robert d robbie wi...


###Exploring the entry for singer Taylor Swift

In [63]:
swift = people_data_df[people_data_df['name'] == 'Taylor Swift']

In [64]:
swift_text_tfidfm = people_text_tfidfm.getrow(swift.index.get_values()[0])

In [65]:
# euclidean distance metrics
closer_to_swift = neigh.kneighbors(swift_text_tfidfm, 5, return_distance=False)

# cosine similarity or ball tree does not woark with sparse data
#closer_to_swift = neigh.kneighbors(swift_text_tfidfm.toarray(), 5, return_distance=False)

In [66]:
# people who are close to Taylor Swift
people_data_df.iloc[closer_to_swift[0]]

Unnamed: 0,URI,name,text
54264,<http://dbpedia.org/resource/Taylor_Swift>,Taylor Swift,taylor alison swift born december 13 1989 is a...
42211,<http://dbpedia.org/resource/LeAnn_Rimes>,LeAnn Rimes,margaret leann rimes cibrian born august 28 19...
17552,<http://dbpedia.org/resource/Ricky_Martin>,Ricky Martin,enrique martn morales born december 24 1971 co...
56098,<http://dbpedia.org/resource/Chely_Wright>,Chely Wright,richell rene chely wright li rat born october ...
6215,<http://dbpedia.org/resource/Amy_Grant>,Amy Grant,amy lee grant born november 25 1960 is an amer...
