In [55]:
import graphlab
# Load the dataset stored in the local 'people_wiki.gl/' directory as an SFrame.
# The rows displayed further down show URI, name and text columns.
people = graphlab.SFrame('people_wiki.gl/')

In [56]:
# people.head()

In [57]:
# Number of rows in the dataset.
len(people)

59071

In [58]:
# Boolean-mask selection: keep only the row(s) whose name is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [59]:
# Display the selected row(s).
obama

URI,name,text
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...


In [60]:
# obama['text']

In [61]:
# Add a 'word_count' column computed from the article text.
# NOTE(review): presumably each value is a word -> count dictionary, since the
# column is stacked into (word, count) rows later on -- confirm against the API docs.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])

In [62]:
# print obama['word_count']

## Sort the word counts for the Obama article

In [63]:
# Unpack the 'word_count' column into a two-column table named ('word', 'count').
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count']
)

### Sorting the word counts to show most common words at the top

In [64]:
# obama_word_count_table.head()

In [65]:
# obama_word_count_table.sort('count',ascending=False)

Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weight them by their TF-IDF scores.

In [66]:
# Compute word counts for every article in the corpus and store them as a new
# 'word_count' column; the TF-IDF computation below takes this column as input.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])

In [67]:
# TF-IDF weights for every article, computed from the per-article word counts.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])

# Earlier versions of GraphLab Create returned an SFrame rather than a single SArray.
# This notebook was created using GraphLab Create version 1.7.1.
#
# The version is compared as a tuple of integers, not as a string: strings are
# ordered character by character, so '1.10.0' <= '1.6.1' is True and a newer
# release would wrongly be indexed with ['docs'].
installed_version = tuple(
    int(part) for part in graphlab.version.split('.') if part.isdigit()
)
if installed_version <= (1, 6, 1):
    tfidf = tfidf['docs']

In [68]:
# print(people.head())

In [69]:
# Attach the TF-IDF values to the corpus as a new 'tfidf' column.
people['tfidf'] = tfidf

## Examine the TF-IDF for the Obama article

In [140]:
# Re-select the Obama row: the earlier `obama` selection was made before the
# 'tfidf' column was added to `people`, so it is taken again from the updated frame.
obama = people[people['name'] == 'Barack Obama']

In [142]:
# Unpack the Obama TF-IDF dictionary into (word, tfidf) rows and show the
# highest-weighted words first.
(
    obama[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

word,tfidf
obama,43.2956530721
act,27.678222623
iraq,17.747378588
control,14.8870608452
law,14.7229357618
ordered,14.5333739509
military,13.1159327785
involvement,12.7843852412
response,12.7843852412
democratic,12.4106886973


Words with the highest TF-IDF scores are much more informative.

# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [72]:
# Row(s) whose name is exactly 'Bill Clinton'.
clinton = people[people['name'] == 'Bill Clinton']

In [73]:
# Row(s) whose name is exactly 'David Beckham'.
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by

$$1 - \text{cosine\_similarity}$$

(a smaller value means the articles are closer), and find that the article about President Obama is closer to the one about former President Clinton than to the one about footballer David Beckham.

In [74]:
# Cosine distance between the TF-IDF entries of the Obama and Clinton rows.
graphlab.distances.cosine(
    obama['tfidf'][0],
    clinton['tfidf'][0]
)

0.8339854936884276

In [75]:
# Cosine distance between the TF-IDF entries of the Obama and Beckham rows.
graphlab.distances.cosine(
    obama['tfidf'][0],
    beckham['tfidf'][0]
)

0.9791305844747478

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [76]:
# Build a nearest-neighbours model over the 'tfidf' column, labelling rows by name.
# NOTE(review): no `distance` argument is passed, and the Obama-Clinton distance
# this model reports differs from the cosine distance computed manually in this
# notebook, so the library default is apparently not cosine -- confirm whether
# cosine was intended here.
knn_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name'
)

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [77]:
# Query the model with the Obama row; the result lists reference labels with
# their distance and rank.
knn_model.query(obama)

query_label,reference_label,distance,rank
0,Barack Obama,0.0,1
0,Joe Biden,0.794117647059,2
0,Joe Lieberman,0.794685990338,3
0,Kelly Ayotte,0.811989100817,4
0,Bill Clinton,0.813852813853,5


In [78]:
# Row(s) whose name is exactly 'Taylor Swift'.
swift = people[people['name'] == 'Taylor Swift']

In [79]:
# Nearest neighbours of the Taylor Swift article under the TF-IDF model.
knn_model.query(swift)

query_label,reference_label,distance,rank
0,Taylor Swift,0.0,1
0,Carrie Underwood,0.76231884058,2
0,Alicia Keys,0.764705882353,3
0,Jordin Sparks,0.769633507853,4
0,Leona Lewis,0.776119402985,5


In [80]:
# Row(s) whose name is exactly 'Angelina Jolie'.
jolie = people[people['name'] == 'Angelina Jolie']

In [81]:
# Nearest neighbours of the Angelina Jolie article under the TF-IDF model.
knn_model.query(jolie)

query_label,reference_label,distance,rank
0,Angelina Jolie,0.0,1
0,Brad Pitt,0.784023668639,2
0,Julianne Moore,0.795857988166,3
0,Billy Bob Thornton,0.803069053708,4
0,George Clooney,0.8046875,5


In [82]:
# Row(s) whose name is exactly 'Arnold Schwarzenegger'.
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [83]:
# Nearest neighbours of the Arnold Schwarzenegger article under the TF-IDF model.
knn_model.query(arnold)

query_label,reference_label,distance,rank
0,Arnold Schwarzenegger,0.0,1
0,Jesse Ventura,0.818918918919,2
0,John Kitzhaber,0.824615384615,3
0,Lincoln Chafee,0.833876221498,4
0,Anthony Foxx,0.833910034602,5


In [84]:
# Assignment 

In [170]:
# Keep only the 'tfidf' column of the Elton John row.
a = people[people['name'] == 'Elton John'][['tfidf']]
print(a)

# Unpack the TF-IDF dictionary into (word, tfidf) rows, highest weights first.
print(
    a.stack('tfidf', new_column_name=['word', 'tfidf'])
     .sort('tfidf', ascending=False)
)

+-------------------------------+
|             tfidf             |
+-------------------------------+
| {'all': 1.6431112434912472... |
+-------------------------------+
[? rows x 1 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.
+---------------+---------------+
|      word     |     tfidf     |
+---------------+---------------+
|    furnish    |  18.38947184  |
|     elton     |  17.48232027  |
|   billboard   | 17.3036809575 |
|      john     | 13.9393127924 |
|  songwriters  |  11.250406447 |
|  overallelton | 10.9864953892 |
| tonightcandle | 10.9864953892 |
|    19702000   | 10.2933482087 |
|   fivedecade  | 10.2933482087 |
|      aids     |  10.262846934 |
+---------------+---------------+
[255 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



print people.head()

In [171]:
# Nearest-neighbours model on the raw word counts (not TF-IDF), using cosine
# distance explicitly and labelling rows by name.
my_model = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    distance='cosine',
    label='name'
)

In [172]:
# Nearest neighbours of the Elton John article under the word-count / cosine model.
my_model.query(people[people['name']=='Elton John'])

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [205]:
# Select the Elton John and Paul McCartney rows once and reuse them below,
# instead of filtering `people` again inside the distance call.
a = people[people['name'] == 'Elton John']
b = people[people['name'] == 'Paul McCartney']

# Show the Python type of a single 'tfidf' entry (the value handed to the distance function).
print(type(a['tfidf'][0]))

# Cosine distance between the two TF-IDF entries; as the last expression of the
# cell, its value is displayed.
graphlab.distances.cosine(a['tfidf'][0], b['tfidf'][0])

<type 'dict'>


0.8250310029221779