# Document Retrieval from Wikipedia Data

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import distance as dist
import warnings
from collections import Counter
from itertools import chain, count
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
warnings.filterwarnings('ignore')

# Load some Text Data from Wikipedia

In [2]:
people = pd.read_csv('people_wiki.csv')

In [3]:
people

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...


# Explore Data

## Taking a Look at the Entry for President Obama

In [4]:
obama = people[people['name'] == 'Barack Obama']

In [5]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [6]:
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Explore the Entry for Actor George Clooney

In [7]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

# Word counts for Obama acticle

In [8]:
obama['word_count'] = obama['text'].apply(lambda x: Counter(str(x).split()))

In [9]:
obama

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ..."


# Compute TF-IDF for the entire corpus of articles

In [10]:
people['word_count'] = people['text'].apply(lambda x: Counter(str(x).split()))

In [11]:
people

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'alfred': 1, 'j': 1, 'lewy': 3, 'aka': 1, 'sa..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'harpdog': 2, 'brown': 2, 'is': 7, 'a': 7, 's..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'franz': 1, 'rottensteiner': 3, 'born': 1, 'i..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'henry': 1, 'krvits': 1, 'born': 1, '30': 1, ..."
...,...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...,"{'olari': 2, 'elts': 3, 'born': 1, 'april': 1,..."
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...,"{'scott': 1, 'francis': 1, 'crago': 5, 'born':..."
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...,"{'david': 1, 'william': 1, 'royce': 1, 'cass':..."
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...,"{'keith': 1, 'hector': 1, 'elias': 4, 'born': ..."


In [12]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
word_count_matrix = vectorizer.fit_transform(people['text'])

In [13]:
word_count_matrix

<59071x548465 sparse matrix of type '<class 'numpy.int64'>'
	with 10379588 stored elements in Compressed Sparse Row format>

In [14]:
tfid_vectorizer = TfidfVectorizer()
tf_idf_matrix = tfid_vectorizer.fit_transform(people['text'])

In [15]:
tf_idf_matrix

<59071x548429 sparse matrix of type '<class 'numpy.float64'>'
	with 10244028 stored elements in Compressed Sparse Row format>

In [16]:
people['tfidf'] = tf_idf_matrix
people

Unnamed: 0,URI,name,text,word_count,tfidf
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'alfred': 1, 'j': 1, 'lewy': 3, 'aka': 1, 'sa...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'harpdog': 2, 'brown': 2, 'is': 7, 'a': 7, 's...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'franz': 1, 'rottensteiner': 3, 'born': 1, 'i...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'henry': 1, 'krvits': 1, 'born': 1, '30': 1, ...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
...,...,...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...,"{'olari': 2, 'elts': 3, 'born': 1, 'april': 1,...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...,"{'scott': 1, 'francis': 1, 'crago': 5, 'born':...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...,"{'david': 1, 'william': 1, 'royce': 1, 'cass':...","(0, 323069)\t0.04943650649482413\n (0, 1309..."
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...,"{'keith': 1, 'hector': 1, 'elias': 4, 'born': ...","(0, 323069)\t0.04943650649482413\n (0, 1309..."


## Examine the TF-IDF for the Obama article

In [17]:
obama = people[people['name'] == 'Barack Obama']
obama

Unnamed: 0,URI,name,text,word_count,tfidf
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ...","(0, 323069)\t0.04943650649482413\n (0, 1309..."


## Examine the TF-IDF for Clooney

In [18]:
clooney = people[people['name'] == 'George Clooney']
clooney

Unnamed: 0,URI,name,text,word_count,tfidf
38514,<http://dbpedia.org/resource/George_Clooney>,George Clooney,george timothy clooney born may 6 1961 is an a...,"{'george': 1, 'timothy': 1, 'clooney': 4, 'bor...","(0, 323069)\t0.04943650649482413\n (0, 1309..."


In [19]:
clinton = people[people['name'] == 'Bill Clinton']
beckham = people[people['name'] == 'David Beckham']

# Manually evaluate the distance between certain people's articles

In [20]:
clinton = people[people['name'] == 'Bill Clinton']
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton or to Beckham?

In [21]:
obama_clinton = cosine_distances(tf_idf_matrix[obama.index], tf_idf_matrix[clinton.index])
obama_clinton

array([[0.67497775]])

In [22]:
obama_beckham = cosine_distances(tf_idf_matrix[obama.index], tf_idf_matrix[beckham.index])
obama_beckham

array([[0.8420454]])

In [23]:
if obama_clinton > obama_beckham :
    print('Obama is closer to Clinton')
else :
    print('Obama is closer to Beckham')

Obama is closer to Beckham


# Apply nearest neighbors for retrieval of Wikipedia articles

## Build the NN model

In [24]:
clf_tf_idf = NearestNeighbors(metric='cosine', algorithm='brute')
model_tf_idf = clf_tf_idf.fit(tf_idf_matrix)

clf_word_count = NearestNeighbors(metric='cosine', algorithm='brute')
model_word_count = clf_word_count.fit(word_count_matrix)

## Use model for retrieval... for example, who is closest to Obama?

In [25]:
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[obama.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
neighbors = pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)
neighbors['rank'] =  [i for i in range(1, len(neighbors)+1)]
neighbors

Unnamed: 0,id,name,distance,rank
1,35817,Barack Obama,0.0,1
0,24478,Joe Biden,0.570781,2
4,57108,Hillary Rodham Clinton,0.615934,3
2,38376,Samantha Power,0.624993,4
3,38714,Eric Stern (politician),0.649765,5


## Other examples of retrieval

In [26]:
swift = people[people['name'] == 'Taylor Swift']

In [27]:
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[swift.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
neighbors = pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)
neighbors['rank'] =  [i for i in range(1, len(neighbors)+1)]
neighbors

Unnamed: 0,id,name,distance,rank
4,54264,Taylor Swift,3.330669e-16,1
0,317,Carrie Underwood,0.6161387,2
2,27793,Adele,0.6247446,3
3,29297,Kelly Clarkson,0.6375446,4
1,1341,Dolly Parton,0.6487036,5


In [28]:
jolie = people[people['name'] == 'Angelina Jolie']

In [29]:
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[jolie.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
neighbors = pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)
neighbors['rank'] =  [i for i in range(1, len(neighbors)+1)]
neighbors

Unnamed: 0,id,name,distance,rank
2,39521,Angelina Jolie,2.220446e-16,1
0,29009,Barbara Hershey,0.627905,2
4,57434,Glenn Close,0.6337704,3
1,34756,Maggie Smith,0.6438354,4
3,44992,Julianne Moore,0.6499563,5


In [30]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [31]:
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[arnold.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
neighbors = pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)
neighbors['rank'] =  [i for i in range(1, len(neighbors)+1)]
neighbors

Unnamed: 0,id,name,distance,rank
1,16018,Arnold Schwarzenegger,1.110223e-16,1
2,35293,Paul Grant (bodybuilder),0.7397827,2
4,58965,Bonnie Garcia,0.7465629,3
3,36682,Abel Maldonado,0.7598034,4
0,10499,David Israel,0.7676966,5


## Assignments

In [32]:
def top_words(name):
    """
    Get a table of the most frequent words in the given person's peoplepedia page.
    """
    row = people[people['name'] == name]
    dic = row['word_count'].iloc[0]
    word_count_ = pd.DataFrame(dic.items(), columns=['word','count'])
    word_count_table = word_count_.sort_values(['count'], ascending=False)
    return word_count_table

In [33]:
elton = people[people['name'] == 'Elton John']
elton

Unnamed: 0,URI,name,text,word_count,tfidf,id
19923,<http://dbpedia.org/resource/Elton_John>,Elton John,sir elton hercules john cbe born reginald kenn...,"{'sir': 1, 'elton': 3, 'hercules': 1, 'john': ...","(0, 323069)\t0.04943650649482413\n (0, 1309...",19923


In [34]:
elton_top_words = top_words('Elton John')
elton_top_words.head(5)

Unnamed: 0,word,count
56,the,27
60,in,18
21,and,15
55,of,13
125,a,10


In [35]:
vic_beckham = people[people['name'] == 'Victoria Beckham']
vic_beckham

Unnamed: 0,URI,name,text,word_count,tfidf,id
50411,<http://dbpedia.org/resource/Victoria_Beckham>,Victoria Beckham,victoria caroline beckham ne adams born 17 apr...,"{'victoria': 4, 'caroline': 1, 'beckham': 8, '...","(0, 323069)\t0.04943650649482413\n (0, 1309...",50411


In [36]:
elton_victoria = cosine_distances(tf_idf_matrix[elton.index], tf_idf_matrix[vic_beckham.index])
elton_victoria

array([[0.85192118]])

In [37]:
mccartney = people[people['name'] == 'Paul McCartney']
elton_mccartney = cosine_distances(tf_idf_matrix[elton.index], tf_idf_matrix[mccartney.index])
elton_mccartney

array([[0.69231325]])

In [38]:
if elton_victoria > elton_mccartney :
    print('Victoria Beckham is closer to Elton John')
else :
    print('Paul McCartney is closer to Elton John')

Victoria Beckham is closer to Elton John


In [39]:
distances, indices = model_word_count.kneighbors(word_count_matrix[elton.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
neighbors = pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)
neighbors['rank'] =  [i for i in range(1, len(neighbors)+1)]
neighbors

Unnamed: 0,id,name,distance,rank
0,19923,Elton John,2.664535e-15,1
3,41668,Cliff Richard,0.1614242,2
1,25798,Sandro Petrone,0.1682254,3
2,28825,Rod Stewart,0.1683272,4
4,51884,Malachi O\'Doherty,0.1773155,5


In [40]:
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[elton.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
neighbors = pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)
neighbors['rank'] =  [i for i in range(1, len(neighbors)+1)]
neighbors

Unnamed: 0,id,name,distance,rank
0,19923,Elton John,0.0,1
3,28825,Rod Stewart,0.589361,2
4,31595,Phil Collins,0.633658,3
2,27793,Adele,0.636524,4
1,26049,Sting (musician),0.642397,5


In [41]:
distances, indices = model_word_count.kneighbors(word_count_matrix[vic_beckham.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
neighbors = pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)
neighbors['rank'] = [i for i in range(1, len(neighbors)+1)]
neighbors

Unnamed: 0,id,name,distance,rank
4,50411,Victoria Beckham,5.551115e-16,1
0,669,Mary Fitzgerald (artist),0.207307,2
3,45129,Adrienne Corri,0.2145098,3
2,39504,Beverly Jane Fry,0.2174665,4
1,13937,Raman Mundair,0.2176955,5


In [42]:
obama_words = top_words('Elton John')
print('Elton John Word Count')
display(obama_words.head(10))
print('\n')
barrio_words = top_words('Francisco Barrio')
print('Barrio Word Count')
display(barrio_words.head(10))

Elton John Word Count


Unnamed: 0,word,count
56,the,27
60,in,18
21,and,15
55,of,13
125,a,10
25,has,9
24,he,7
3,john,7
39,on,6
34,since,5




Barrio Word Count


Unnamed: 0,word,count
14,the,36
22,of,24
24,and,18
26,in,17
19,he,10
13,to,9
23,chihuahua,7
9,a,6
21,governor,6
106,his,5


In [43]:
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[vic_beckham.index])

neighbors = pd.DataFrame({'distance':distances.flatten(), 'id':indices.flatten()})
people['id'] = people.index
pd.merge(people, neighbors, how = 'inner', on='id')[['id','name','distance']].sort_values('distance', ascending = True)

Unnamed: 0,id,name,distance
4,50411,Victoria Beckham,0.0
2,23386,David Beckham,0.546477
1,17264,Mel B,0.718422
3,39144,Stephen Dow Beckham,0.745956
0,5385,Hilary Alexander,0.751848
