# Import scikit-learn, pandas and NumPy

In [1]:
import sklearn
import numpy as np
import pandas as pd

# 1. Read the Dataset using Pandas

In [2]:
# Load the people_wiki CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("data/people_wiki.csv")

In [3]:
# Show the frame; pandas elides the middle rows of a long frame in the rendered output.
data

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...
...,...,...,...
59066,<http://dbpedia.org/resource/Olari_Elts>,Olari Elts,olari elts born april 27 1971 in tallinn eston...
59067,<http://dbpedia.org/resource/Scott_F._Crago>,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
59068,<http://dbpedia.org/resource/David_Cass_(footb...,David Cass (footballer),david william royce cass born 27 march 1962 in...
59069,<http://dbpedia.org/resource/Keith_Elias>,Keith Elias,keith hector elias born february 3 1972 in lac...


# 2. Exploratory Data Analysis

In [4]:
# Peek at the first five rows.
data.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [5]:
# Column dtypes, non-null counts and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59071 entries, 0 to 59070
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URI     59071 non-null  object
 1   name    59071 non-null  object
 2   text    59071 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [6]:
# Summary statistics; for object columns this reports count / unique / top / freq.
data.describe()

Unnamed: 0,URI,name,text
count,59071,59071,59071
unique,59071,59070,59071
top,<http://dbpedia.org/resource/Peter_Emmerich>,author),leslie michael bethell born 12 february 1937 i...
freq,1,2,1


# 3. Data Preprocessing
## Build word count vectors

In [None]:
import pandas as pd
import numpy as np    
from sklearn.feature_extraction.text import CountVectorizer

def dictionarize(row):
    """Attach a per-document word-count dictionary to a DataFrame row.

    A fresh ``CountVectorizer`` is fitted on this row's ``'text'`` value only,
    so the resulting vocabulary contains exactly the tokens of that document.

    Parameters
    ----------
    row : pandas.Series
        A row that has a ``'text'`` entry.

    Returns
    -------
    pandas.Series
        The same row with a new ``'word_count'`` entry mapping each token to
        the number of times it occurs in the text.
    """
    cv = CountVectorizer(
        analyzer="word",
    )
    text = [row.loc['text']]
    cv_fit = cv.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    word_list = get_names()
    # Only one document was fitted, so the column sums are that document's counts.
    count_list = cv_fit.toarray().sum(axis=0)
    row['word_count'] = dict(zip(word_list, count_list))
    return row

# Row-wise apply: every row gains a 'word_count' dictionary. `data` is rebound to the result.
data = data.apply(dictionarize, axis=1)

In [None]:
# Inspect the first rows after the word-count step.
data.head()

## Build TF-IDF vector

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# Corpus-wide TF-IDF model with English stop words removed.
tfv = TfidfVectorizer(stop_words = 'english')
tfv.fit(data['text'])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older installations.
_get_feature_names = getattr(tfv, 'get_feature_names_out', None) or tfv.get_feature_names
# Keep a plain list so feature_names[i] behaves as it did with the old API.
feature_names = list(_get_feature_names())

In [None]:
def dictionarize_TFIDF(row):
    """Attach a ``{word: tf-idf weight}`` dictionary to a DataFrame row.

    Uses the module-level fitted vectorizer ``tfv`` and its ``feature_names``.
    The transformed row is kept sparse: only the stored non-zero entries are
    visited instead of materialising a dense vector as wide as the vocabulary.

    Parameters
    ----------
    row : pandas.Series
        A row that has a ``'text'`` entry.

    Returns
    -------
    pandas.Series
        The same row with a new ``'word_TFIDF'`` entry.
    """
    row_vector = tfv.transform([row['text']]).tocsr()
    # Ascending column order, matching what a dense nonzero() scan would yield.
    row_vector.sort_indices()
    dictionary = {
        feature_names[col]: weight
        for col, weight in zip(row_vector.indices, row_vector.data)
        if weight != 0  # skip explicitly stored zeros, as nonzero() would
    }
    row['word_TFIDF'] = dictionary
    return row

# Row-wise apply: every row gains a 'word_TFIDF' dictionary. `data` is rebound to the result.
data = data.apply(dictionarize_TFIDF, axis=1)
data.head()

## Explore some examples

In [None]:
def explore_by_name(name, sort_type='word_TFIDF'):
    """Return a person's words with their TF-IDF weight and raw count.

    Parameters
    ----------
    name : str
        Value to look up in the ``'name'`` column of ``data``. If several rows
        share the name, the first one is used.
    sort_type : str, default 'word_TFIDF'
        Column to sort by in descending order: ``'word_TFIDF'`` or ``'word_count'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with columns ``'word_TFIDF'`` and ``'word_count'``.
        Words present in only one of the two dictionaries get NaN in the other.

    Raises
    ------
    IndexError
        If no row has the requested name.
    """
    # Filter once and reuse the row instead of scanning the frame per column.
    matches = data.loc[data['name'] == name]
    if matches.empty:
        raise IndexError(f"no row with name {name!r} in data")
    person = matches.iloc[0]
    result = pd.DataFrame({
        'word_TFIDF': pd.Series(person['word_TFIDF']),
        'word_count': pd.Series(person['word_count']),
    }).sort_values(by=[sort_type], ascending=False)
    return result

In [None]:
# Top 10 words for Barack Obama, ranked by TF-IDF weight (the default sort_type).
obama = explore_by_name('Barack Obama')
obama.head(10)

In [None]:
# Top 10 words for George Clooney, ranked by TF-IDF weight (the default sort_type).
clooney = explore_by_name('George Clooney')
clooney.head(10)

In [None]:
# Top 10 words for Elton John, ranked by raw word count.
john = explore_by_name('Elton John', 'word_count')
john.head(10)

In [None]:
# Same person ranked by TF-IDF weight; note this rebinds `john` from the count-sorted result.
john = explore_by_name('Elton John')
john.head(10)

## Manual Cosine distance

In [None]:
from sklearn.metrics.pairwise import cosine_distances
import seaborn as sns
import matplotlib.pyplot as plt

def cosine_distance_by_text(names):
    """Compute and plot pairwise cosine distances between people's articles.

    Each name is looked up in the ``'name'`` column of ``data`` (first match
    wins), its text is vectorised with the fitted ``tfv``, and the pairwise
    cosine distances are drawn as a seaborn heatmap.

    Parameters
    ----------
    names : list of str
        Names to compare; also used as the row and column labels.

    Returns
    -------
    pandas.DataFrame
        Square distance matrix indexed and labelled by ``names``.

    Raises
    ------
    IndexError
        If one of the names has no matching row.
    """
    texts = [data.loc[data['name'] == name, 'text'].iloc[0] for name in names]

    # Keep the matrix sparse: .todense() yields np.matrix, which scikit-learn >= 1.2
    # rejects with a TypeError, and cosine_distances accepts sparse input directly.
    tfv_matrix = tfv.transform(texts)
    distance = cosine_distances(tfv_matrix, tfv_matrix)
    distance = pd.DataFrame(distance, index=names, columns=names)
    sns.heatmap(distance, center=0)
    return distance

In [None]:
# NOTE: despite the variable name, these are person names (values of the 'name' column),
# which is what cosine_distance_by_text expects; they are not article texts.
texts = ['Bill Clinton', 'Barack Obama', 'David Beckham', 'Victoria Beckham', 'Elton John','Paul McCartney']

cosine_distance_by_text(texts)

# 4. Unsupervised Learning Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# TfidfVectorizer already tokenises, counts and applies TF-IDF weighting, so no
# TfidfTransformer step follows it: chaining the two would apply the IDF
# weighting a second time on top of the already-weighted matrix.
preprocessor = Pipeline(
    [
        ('TFIDF_Vectorizer', TfidfVectorizer()),
    ],
    verbose=True
)

# Text preprocessing followed by KMeans with its default settings; this instance
# is only displayed here, not fitted.
kmeans_model = Pipeline(
    [
        ('Preprocessor', preprocessor),
        ('KMeans', KMeans())
    ],
    verbose=True
)

# Render estimators as an HTML diagram when they are the cell's last expression.
from sklearn import set_config
set_config(display='diagram')
kmeans_model

# 5. Pipeline Tuning

In [None]:
from sklearn.metrics import silhouette_score

silhouette_coefficients = []

# The text features do not depend on k, so fit the preprocessor and vectorise the
# corpus once instead of refitting and re-transforming on every iteration.
preprocessed_data = preprocessor.fit_transform(data['text'])

for k in range(2, 11):
    print("Starting with n_clusters=", k)
    # Fixed seed so the scores (and therefore the chosen k) are reproducible.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(preprocessed_data)
    # Keep `kmeans_model` bound to a pipeline of the already-fitted steps, as before.
    kmeans_model = Pipeline(
        [
            ('Preprocessor', preprocessor),
            ('KMeans', kmeans)
        ],
        verbose=True
    )
    print("Pipeline fit finished")
    score = silhouette_score(preprocessed_data, kmeans_model["KMeans"].labels_)
    print("silhouette score finished")
    print()
    silhouette_coefficients.append(score)

In [None]:
# Silhouette coefficient for each candidate number of clusters (k = 2..10).
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots()
ax.plot(range(2, 11), silhouette_coefficients)
ax.set_xticks(range(2, 11))
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

## Retrain the model with the best scoring number of clusters

In [None]:
# silhouette_coefficients[i] belongs to k = i + 2 because the search started at k = 2.
# int() turns the NumPy scalar into a plain Python int before it reaches KMeans.
k_best = int(np.argmax(silhouette_coefficients)) + 2

kmeans_model = Pipeline(
    [
        ('Preprocessor', preprocessor),
        # Fixed seed so the final clustering is reproducible across runs.
        ('KMeans', KMeans(n_clusters=k_best, max_iter=500, random_state=42))
    ],
    verbose=True
)

kmeans_model.fit(data['text'])

# Assignment
### Build Nearest Neighbor pipelines to measure the distance between people using CountVectorizer and TF-IDF features

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest neighbours on raw word counts, compared with cosine distance.
distance_knn_count = Pipeline(
    [
        ('Count_Vectorizer', CountVectorizer()),
        ('NN', NearestNeighbors(metric='cosine'))
    ],
    verbose=True
)

# Nearest neighbours on TF-IDF features. TfidfVectorizer already applies the
# TF-IDF weighting, so no TfidfTransformer step follows it (that would weight by
# IDF twice). With a single feature step, transforming a query through
# 'TFIDF_Vectorizer' yields exactly the features the 'NN' step was fitted on.
distance_knn_tfidf = Pipeline(
    [
        ('TFIDF_Vectorizer', TfidfVectorizer()),
        ('NN', NearestNeighbors(metric='cosine'))
    ],
    verbose=True
)

In [None]:
# Fit the count-based pipeline on the article texts.
distance_knn_count.fit(data['text'])

In [None]:
# Fit the TF-IDF-based pipeline on the article texts.
distance_knn_tfidf.fit(data['text'])

In [None]:
def _print_nearest_names(pipeline, name):
    """Print the names of the people nearest to ``name`` under a fitted pipeline.

    The query text is pushed through every step before the final ``'NN'`` step
    (``pipeline[:-1]``), so it is represented exactly like the data the
    neighbour index was fitted on, however many feature steps the pipeline has.
    """
    text = data.loc[data['name'] == name, 'text'].iloc[0]
    features = pipeline[:-1].transform([text])
    _, neighbor_rows = pipeline['NN'].kneighbors(features)
    # kneighbors returns one row of indices per query; there is a single query here.
    # The indices are positions in the fitted data, hence iloc.
    for position in neighbor_rows[0]:
        print(data['name'].iloc[position])


def query_distance_count(name):
    """Print the nearest neighbours of ``name`` using word-count features.

    Raises ``IndexError`` if no row of ``data`` has that name.
    """
    _print_nearest_names(distance_knn_count, name)


def query_distance_tfidf(name):
    """Print the nearest neighbours of ``name`` using TF-IDF features.

    Raises ``IndexError`` if no row of ``data`` has that name.
    """
    _print_nearest_names(distance_knn_tfidf, name)

In [None]:
# Nearest neighbours of Elton John using the count-based pipeline.
name = 'Elton John'
query_distance_count(name)

In [None]:
# Nearest neighbours of Elton John using the TF-IDF-based pipeline.
name = 'Elton John'
query_distance_tfidf(name)

In [None]:
# Nearest neighbours of Victoria Beckham using the count-based pipeline.
name = 'Victoria Beckham'
query_distance_count(name)

In [None]:
# Nearest neighbours of Victoria Beckham using the TF-IDF-based pipeline.
name = 'Victoria Beckham'
query_distance_tfidf(name)