In [37]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier

In [2]:
people = pd.read_csv('people_wiki.csv')

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')

In [5]:
obama = people[people['name'] == 'Barack Obama'].index[0]
beckham = people[people['name'] == 'David Beckham'].index[0]
clinton = people[people['name'] == 'Bill Clinton'].index[0]

In [6]:
sorted(cosine_similarity(docs[obama:obama+1], docs[clinton:clinton+1]), reverse=True)

NameError: name 'docs' is not defined

In [7]:
sorted(cosine_similarity(docs[obama:obama+1], docs[beckham:beckham+1]), reverse=True)

NameError: name 'docs' is not defined

In [8]:
obama = people[people['name'] == 'Barack Obama']
beckham = people[people['name'] == 'David Beckham']
clinton = people[people['name'] == 'Bill Clinton']

In [31]:
tfidf = TfidfVectorizer(analyzer='word')

In [32]:
docs = tfidf.fit_transform(people['text'])

In [33]:
obama_tfidf = tfidf.transform(obama['text'])
beckham_tfidf = tfidf.transform(beckham['text'])
clinton_tfidf = tfidf.transform(clinton['text'])

In [34]:
1 - cosine_similarity(obama_tfidf, clinton_tfidf)

array([[0.67497775]])

In [35]:
1 - cosine_similarity(beckham_tfidf, obama_tfidf)

array([[0.8420454]])

In [36]:
1 - cosine_similarity(obama_tfidf, obama_tfidf)

array([[-4.4408921e-16]])

## K-nearest neighbours

In [219]:
knn = KNeighborsClassifier(weights=lambda x: (1-cosine_similarity(x)), n_neighbors=6)

In [220]:
knn.fit(docs, people['name'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights=<function <lambda> at 0x7f96732dee60>)

In [221]:
def find_k_near(tfidf_req, n = 5):
    for i in range(n):
        print(people.iloc[knn.kneighbors(tfidf_req, return_distance=False)[0][i]]['name'])
find_k_near(obama_tfidf, 6)

Barack Obama
Joe Biden
Hillary Rodham Clinton
Samantha Power
Eric Stern (politician)
George W. Bush


In [222]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']
find_k_near(tfidf.transform(arnold['text']), 6)

Arnold Schwarzenegger
Paul Grant (bodybuilder)
Bonnie Garcia
Abel Maldonado
David Israel
John Garamendi


In [223]:
victoria = people[people['name'] == 'Victoria Beckham']
find_k_near(tfidf.transform(victoria['text']), 6)

Victoria Beckham
David Beckham
Mel B
Stephen Dow Beckham
Hilary Alexander
Mona al Mansouri


In [224]:
rooney = people[people['name'] == 'Wayne Rooney']
find_k_near(tfidf.transform(rooney['text']))

Wayne Rooney
Art Rooney II
Francis Rooney
Steven Gerrard
Michael Owen


In [225]:
knn.predict(tfidf.transform(rooney['text']))

array(['Art Rooney II'], dtype=object)

In [226]:
john = people[people['name'] == 'Elton John']
find_k_near(tfidf.transform(john['text']))

Elton John
Rod Stewart
Phil Collins
Adele
Sting (musician)
