# Document Retrieval from Wikipedia Data

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import distance as dist
import warnings
from collections import Counter
from itertools import chain, count
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
warnings.filterwarnings('ignore')

# Load some Text Data from Wikipedia

In [2]:
# Read the Wikipedia people corpus from the working directory.
people = pd.read_csv(filepath_or_buffer='people_wiki.csv')

In [3]:
# Display the loaded corpus; the notebook renders a truncated preview (first/last rows).
people

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...


# Explore Data

## Taking a Look at the Entry for President Obama

In [4]:
# Select the row(s) whose name is exactly 'Barack Obama'.
obama = people.loc[people['name'] == 'Barack Obama']

In [5]:
# Show the matching row(s) for Barack Obama.
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [6]:
# Show only the article text of the Obama selection.
obama.loc[:, 'text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Explore the Entry for Actor George Clooney

In [7]:
# Select George Clooney's row and show its article text.
clooney = people.loc[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

# Word counts for Obama article

In [8]:
# Build a word -> frequency Counter for the Obama article (whitespace tokenisation).
# `obama` is a filtered selection of `people`; adding a column with
# `obama[...] = ...` can trigger pandas' SettingWithCopyWarning, which the global
# warnings filter in the import cell would silently hide. `assign` returns a new
# frame with the extra column instead of writing into the selection.
obama = obama.assign(
    word_count=obama['text'].apply(lambda text: Counter(str(text).split()))
)

In [9]:
# Show the Obama row again, now including the word_count column.
obama

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ..."


# Compute TF-IDF for the entire corpus of articles

In [None]:
# One Counter of whitespace-separated tokens per article, in row order.
people['word_count'] = [Counter(str(text).split()) for text in people['text']]

In [None]:
# Inspect the corpus with the added word_count column.
people

In [None]:
# Bag-of-words counts for every article; the token pattern keeps
# single-character words as well.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
word_count_matrix = vectorizer.fit_transform(raw_documents=people['text'])

In [None]:
# Show the summary repr of the word-count matrix.
word_count_matrix

In [None]:
# Use the same token pattern as the CountVectorizer above so that TF-IDF and raw
# word counts are computed over the same vocabulary. TfidfVectorizer's default
# pattern only keeps tokens of two or more word characters, which would make the
# two nearest-neighbour models compare different feature sets.
tfid_vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b')
tf_idf_matrix = tfid_vectorizer.fit_transform(people['text'])

In [None]:
# Show the summary repr of the TF-IDF matrix.
tf_idf_matrix

In [None]:
# Store one sparse TF-IDF row per article. pandas does not split a scipy sparse
# matrix into per-row values when it is assigned to a column (depending on the
# pandas version the assignment fails or repeats the whole matrix in every cell),
# so the rows are placed into an object array explicitly.
n_articles = tf_idf_matrix.shape[0]
tfidf_rows = np.empty(n_articles, dtype=object)
for position in range(n_articles):
    tfidf_rows[position] = tf_idf_matrix[position]
people['tfidf'] = tfidf_rows
people

## Examine the TF-IDF for the Obama article

In [None]:
# Re-select Obama so the selection carries the columns added to `people` since.
obama = people.loc[people['name'] == 'Barack Obama']
obama

## Examine the TF-IDF for Clooney

In [None]:
# Re-select Clooney so the selection carries the columns added to `people` since.
clooney = people.loc[people['name'] == 'George Clooney']
clooney

In [None]:
# Rows for the two comparison articles.
# NOTE(review): the following cell repeats exactly these two lookups.
clinton = people[people['name'].eq('Bill Clinton')]
beckham = people[people['name'].eq('David Beckham')]

# Manually evaluate the distance between certain people's articles

In [None]:
# Rows for the two articles Obama is compared against below.
clinton = people.loc[people['name'] == 'Bill Clinton']
beckham = people.loc[people['name'] == 'David Beckham']

## Is Obama closer to Clinton or to Beckham?

In [None]:
# Cosine distance between the Obama and Clinton TF-IDF rows
# (assumes the frame's index labels equal matrix row positions).
obama_clinton = cosine_distances(
    X=tf_idf_matrix[obama.index],
    Y=tf_idf_matrix[clinton.index],
)
obama_clinton

In [None]:
# Cosine distance between the Obama and Beckham TF-IDF rows
# (assumes the frame's index labels equal matrix row positions).
obama_beckham = cosine_distances(
    X=tf_idf_matrix[obama.index],
    Y=tf_idf_matrix[beckham.index],
)
obama_beckham

In [None]:
# Both values are cosine DISTANCES, so the smaller one marks the closer article.
# (The comparison was previously inverted and reported the farther person.)
if obama_clinton < obama_beckham:
    print('Obama is closer to Clinton')
else:
    print('Obama is closer to Beckham')

# Apply nearest neighbors for retrieval of Wikipedia articles

## Build the NN model

In [None]:
# Brute-force cosine nearest-neighbour index over the TF-IDF vectors.
clf_tf_idf = NearestNeighbors(algorithm='brute', metric='cosine')
model_tf_idf = clf_tf_idf.fit(X=tf_idf_matrix)

# Same setup over the raw word counts, for comparison.
clf_word_count = NearestNeighbors(algorithm='brute', metric='cosine')
model_word_count = clf_word_count.fit(X=word_count_matrix)

## Use model for retrieval... for example, who is closest to Obama?

In [None]:
# Query the TF-IDF model with Obama's row.
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[obama.index])

# Expose the index as a join key, attach names to the returned ids and rank by
# distance (assumes people's index labels equal matrix row positions).
people['id'] = people.index
neighbors = (
    people
    .merge(
        pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()}),
        how='inner',
        on='id',
    )[['id', 'name', 'distance']]
    .sort_values('distance', ascending=True)
)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors

## Other examples of retrieval

In [None]:
# Select the row(s) whose name is exactly 'Taylor Swift'.
swift = people.loc[people['name'] == 'Taylor Swift']

In [None]:
# Query the TF-IDF model with Taylor Swift's row.
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[swift.index])

# Expose the index as a join key, attach names to the returned ids and rank by
# distance (assumes people's index labels equal matrix row positions).
people['id'] = people.index
neighbors = (
    people
    .merge(
        pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()}),
        how='inner',
        on='id',
    )[['id', 'name', 'distance']]
    .sort_values('distance', ascending=True)
)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors

In [None]:
# Select the row(s) whose name is exactly 'Angelina Jolie'.
jolie = people.loc[people['name'] == 'Angelina Jolie']

In [None]:
# Query the TF-IDF model with Angelina Jolie's row.
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[jolie.index])

# Expose the index as a join key, attach names to the returned ids and rank by
# distance (assumes people's index labels equal matrix row positions).
people['id'] = people.index
neighbors = (
    people
    .merge(
        pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()}),
        how='inner',
        on='id',
    )[['id', 'name', 'distance']]
    .sort_values('distance', ascending=True)
)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors

In [None]:
# Select the row(s) whose name is exactly 'Arnold Schwarzenegger'.
arnold = people.loc[people['name'] == 'Arnold Schwarzenegger']

In [None]:
# Query the TF-IDF model with Arnold Schwarzenegger's row.
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[arnold.index])

# Expose the index as a join key, attach names to the returned ids and rank by
# distance (assumes people's index labels equal matrix row positions).
people['id'] = people.index
neighbors = (
    people
    .merge(
        pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()}),
        how='inner',
        on='id',
    )[['id', 'name', 'distance']]
    .sort_values('distance', ascending=True)
)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors

## Assignments

In [None]:
def top_words(name):
    """
    Get a table of the most frequent words in the given person's Wikipedia page.

    Parameters
    ----------
    name : str
        Value matched exactly against the ``name`` column of the global
        ``people`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``count``, sorted by ``count`` in descending order.
        If several rows share the name, the first one is used.

    Raises
    ------
    IndexError
        If no row of ``people`` has the given name.
    """
    matches = people.loc[people['name'] == name, 'word_count']
    if matches.empty:
        # Same exception type the bare ``.iloc[0]`` would raise, but with a
        # message that names the missing person.
        raise IndexError(f"no article named {name!r} in `people`")
    counts = matches.iloc[0]
    # Materialise the items so DataFrame receives a plain list of (word, count) pairs.
    table = pd.DataFrame(list(counts.items()), columns=['word', 'count'])
    return table.sort_values(['count'], ascending=False)

In [None]:
# Select and show the row(s) whose name is exactly 'Elton John'.
elton = people.loc[people['name'] == 'Elton John']
elton

In [None]:
# Five most frequent words in Elton John's article.
elton_top_words = top_words('Elton John')
elton_top_words.iloc[:5]

In [None]:
# Select and show the row(s) whose name is exactly 'Victoria Beckham'.
vic_beckham = people.loc[people['name'] == 'Victoria Beckham']
vic_beckham

In [None]:
# Cosine distance between the Elton John and Victoria Beckham TF-IDF rows.
elton_victoria = cosine_distances(
    X=tf_idf_matrix[elton.index],
    Y=tf_idf_matrix[vic_beckham.index],
)
elton_victoria

In [None]:
# Cosine distance between the Elton John and Paul McCartney TF-IDF rows.
mccartney = people.loc[people['name'] == 'Paul McCartney']
elton_mccartney = cosine_distances(
    X=tf_idf_matrix[elton.index],
    Y=tf_idf_matrix[mccartney.index],
)
elton_mccartney

In [None]:
# The smaller cosine distance marks the closer article; ties fall to McCartney.
message = (
    'Victoria Beckham is closer to Elton John'
    if elton_victoria < elton_mccartney
    else 'Paul McCartney is closer to Elton John'
)
print(message)

In [None]:
# Query the word-count model with Elton John's row.
distances, indices = model_word_count.kneighbors(word_count_matrix[elton.index])

# Expose the index as a join key, attach names to the returned ids and rank by
# distance (assumes people's index labels equal matrix row positions).
people['id'] = people.index
neighbors = (
    people
    .merge(
        pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()}),
        how='inner',
        on='id',
    )[['id', 'name', 'distance']]
    .sort_values('distance', ascending=True)
)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors

In [None]:
# Query the TF-IDF model with Elton John's row.
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[elton.index])

# Expose the index as a join key, attach names to the returned ids and rank by
# distance (assumes people's index labels equal matrix row positions).
people['id'] = people.index
neighbors = (
    people
    .merge(
        pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()}),
        how='inner',
        on='id',
    )[['id', 'name', 'distance']]
    .sort_values('distance', ascending=True)
)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors

In [None]:
# Query the word-count model with Victoria Beckham's row.
distances, indices = model_word_count.kneighbors(word_count_matrix[vic_beckham.index])

# Expose the index as a join key, attach names to the returned ids and rank by
# distance (assumes people's index labels equal matrix row positions).
people['id'] = people.index
neighbors = (
    people
    .merge(
        pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()}),
        how='inner',
        on='id',
    )[['id', 'name', 'distance']]
    .sort_values('distance', ascending=True)
)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors

In [None]:
# Ten most frequent words for Elton John, then for Francisco Barrio.
elton_words = top_words('Elton John')
print('Elton John Word Count')
display(elton_words.iloc[:10])
print('\n')

barrio_words = top_words('Francisco Barrio')
print('Barrio Word Count')
display(barrio_words.iloc[:10])

In [None]:
# Query the TF-IDF model with Victoria Beckham's row.
distances, indices = model_tf_idf.kneighbors(tf_idf_matrix[vic_beckham.index])

neighbors = pd.DataFrame({'distance': distances.flatten(), 'id': indices.flatten()})
# Expose the index as a join key (assumes people's index labels equal matrix row positions).
people['id'] = people.index
# Keep the named, sorted result in `neighbors` and add the rank column, matching
# the other retrieval cells; previously the merged table was only displayed and
# `neighbors` was left holding the raw distance/id pairs.
neighbors = pd.merge(people, neighbors, how='inner', on='id')[['id', 'name', 'distance']].sort_values('distance', ascending=True)
neighbors['rank'] = list(range(1, len(neighbors) + 1))
neighbors