<a href="https://colab.research.google.com/github/AKSeavey/5_NLP-and-Clustering/blob/main/project_5_Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### set up environment

In [None]:
import random
from urllib.parse import unquote

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline

In [None]:
# Install textblob
%%capture
!pip3 install -U textblob

In [None]:
from textblob import TextBlob

In [None]:
# Download corpora
%%capture
!python -m textblob.download_corpora


In [None]:
# Fetch the 'omw-1.4' data package through NLTK's downloader
import nltk
nltk.download('omw-1.4')


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### import csv and look around

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV read below lives there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset (URI, name, text) from the mounted Drive folder
csv_path = "/content/drive/MyDrive/Colab Notebooks/project_5/NLP.csv"
nlp = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the full dataset
nlp.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [None]:
# Random row selector: draw one row position uniformly from the whole
# dataset and display that row
num_rows = nlp.shape[0]
random_index = random.randrange(num_rows)

random_row = nlp.iloc[random_index]
random_row

URI        <http://dbpedia.org/resource/Bronwyn_Bancroft>
name                                     Bronwyn Bancroft
text    bronwyn bancroft born 1958 is an australian ar...
Name: 29625, dtype: object

In [None]:
# (rows, columns) of the full dataset
nlp.shape

(42786, 3)

In [None]:
# Column dtypes, non-null counts and memory usage
nlp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42786 entries, 0 to 42785
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URI     42786 non-null  object
 1   name    42786 non-null  object
 2   text    42786 non-null  object
dtypes: object(3)
memory usage: 1002.9+ KB


### text preprocessing

In [None]:
# nlp_clean = nlp.copy()

In [None]:
# The full dataset was crashing Colab, so build an MVP on the first 2000 rows.
# The slice is copied so that columns added later never touch `nlp` itself.
nlp_clean = nlp.iloc[:2000].copy()

In [None]:
# Preview the first five rows of the 2000-row sample
nlp_clean.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [None]:
# Wrap every document's text in a TextBlob and keep the result in a new column
nlp_clean['text_blobs'] = nlp_clean['text'].apply(TextBlob)

In [None]:
# Preview the sample again, now including the 'text_blobs' column
nlp_clean.head()

Unnamed: 0,URI,name,text,text_blobs
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"(d, i, g, b, y, , m, o, r, r, e, l, l, , b, ..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"(a, l, f, r, e, d, , j, , l, e, w, y, , a, ..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"(h, a, r, p, d, o, g, , b, r, o, w, n, , i, ..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"(f, r, a, n, z, , r, o, t, t, e, n, s, t, e, ..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"(h, e, n, r, y, , k, r, v, i, t, s, , b, o, ..."


In [None]:
# Inspect the TextBlob stored under row label 1200
nlp_clean['text_blobs'][1200]

TextBlob("lithang tulku tenzin delek rinpoche or tenzing deleg tibetan wylie bstan dzin bde legs born 1950 in lithang tibet is a tibetan buddhist leader from garze sichuan he was arrested on april 7 2002 during a raid on jamyang choekhorling in garze sichuan china he was accused of being involved in a bomb attack on april 3 2002 on the central square of sichuans provincial capital chengduhe was convicted for alleged involvement in a series of unsolved bombings in his region by the chinese authorities and sentenced to death in december 2002 along with lobsang dhondup a 28year old assistant of his lobsang was executed almost immediately in late january 2003 marking the first execution of a tibetan for political crimes in 20 years tenzin deleks trial began on november 29 2002 before the local court in garze and was sentenced to death with a twoyear execution adjournment overseas human rights groups and united nations human rights experts protested that the case against him was seriously f

In [None]:
# tokenize the text blobs: take each blob's `.words`, giving a Series with
# one WordList of tokens per document
tokenized = nlp_clean['text_blobs'].apply(lambda blob: blob.words)

In [None]:
# Inspect the tokens of the document under row label 1200
tokenized[1200]

WordList(['lithang', 'tulku', 'tenzin', 'delek', 'rinpoche', 'or', 'tenzing', 'deleg', 'tibetan', 'wylie', 'bstan', 'dzin', 'bde', 'legs', 'born', '1950', 'in', 'lithang', 'tibet', 'is', 'a', 'tibetan', 'buddhist', 'leader', 'from', 'garze', 'sichuan', 'he', 'was', 'arrested', 'on', 'april', '7', '2002', 'during', 'a', 'raid', 'on', 'jamyang', 'choekhorling', 'in', 'garze', 'sichuan', 'china', 'he', 'was', 'accused', 'of', 'being', 'involved', 'in', 'a', 'bomb', 'attack', 'on', 'april', '3', '2002', 'on', 'the', 'central', 'square', 'of', 'sichuans', 'provincial', 'capital', 'chengduhe', 'was', 'convicted', 'for', 'alleged', 'involvement', 'in', 'a', 'series', 'of', 'unsolved', 'bombings', 'in', 'his', 'region', 'by', 'the', 'chinese', 'authorities', 'and', 'sentenced', 'to', 'death', 'in', 'december', '2002', 'along', 'with', 'lobsang', 'dhondup', 'a', '28year', 'old', 'assistant', 'of', 'his', 'lobsang', 'was', 'executed', 'almost', 'immediately', 'in', 'late', 'january', '2003', 'ma

In [None]:
# Confirm 'tokenized' is a pandas Series
type(tokenized)

pandas.core.series.Series

In [None]:
# this doesn't work on the full set (crashes)
# but works on the 2000 row sample
# Calls `.stem()` on each document's WordList.
# NOTE(review): no later cell in this notebook reads `stemmed`; the bag of
# words is built from `tokenized`. Confirm whether the stemmed tokens were
# meant to feed the vectorizer, or drop this (expensive) step.
stemmed = tokenized.apply(lambda blob: blob.stem())

In [None]:
# may not be necessary and eats a lot of ram...
# singularized = tokenized.apply(lambda blob: blob.singularize())

In [None]:
# singularized_texts = []

In [None]:
# for tokens in tokenized_texts[0:5]:
#   singularized_tokens = tokens.singularize()
#   singularized_texts.append(singularized_tokens)

### text representation

#### bag of words

In [None]:
# Re-assemble each document's tokens into a single space-separated string.
# 'sentences' is a pandas Series holding one such string per document.

sentences = tokenized.apply(' '.join)

In [None]:
# Bag-of-words vectorizer: converts each document into a vector of word counts.
# stop_words = 'english' selects scikit-learn's built-in English stop-word list, so those words are excluded.

vectorizer = CountVectorizer(stop_words = 'english')

In [None]:
# Learn the vocabulary from 'sentences' and count the words of every document
# 'bow_vec' is a sparse matrix holding the Bag of Words representation
# Each row corresponds to one document, and each column holds the frequency of one vocabulary word in that document

bow_vec = vectorizer.fit_transform(sentences)

In [None]:
# Convert 'bow_vec' to a dense array for better visibility
# NOTE(review): this materialises the whole matrix densely only to display it; a small slice would be much cheaper.

bow_vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### TF_IDF

In [None]:
# vec_tfidf = pd.DataFrame(bow_vec.toarray(), columns = vectorizer.get_feature_names_out())
# vec_tfidf

In [None]:
# Initialize the TfidfTransformer, which re-weights raw counts by
# Term Frequency-Inverse Document Frequency (TF-IDF)

tf_idf_tran = TfidfTransformer()

# Fit the transformer on the count matrix 'bow_vec' and transform it in one step
# 'tf_idf_matrix' is a new sparse matrix with TF-IDF values (one row per document)

tf_idf_matrix = tf_idf_tran.fit_transform(bow_vec)

In [None]:
# Dense, labelled view of the TF-IDF matrix for inspection:
# one row per document, one column per vocabulary word (named after the
# word), each cell holding that word's TF-IDF weight in that document

feature_names = vectorizer.get_feature_names_out()
tf_df = pd.DataFrame(data = tf_idf_matrix.toarray(), columns = feature_names)
tf_df

Unnamed: 0,00,000,000577,0006,003,005,007,008,009,0091857112,...,zvonimir,zwany,zwart,zwarts,zwierciado,zwigoff,zwines,zyl,zylberman,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Confirm the TF-IDF table is a pandas DataFrame
type(tf_df)

pandas.core.frame.DataFrame

### nearest neighbors

In [None]:
# Initialize the NearestNeighbors model (default settings)
neighbors = NearestNeighbors()

# Fit on the sparse TF-IDF matrix itself rather than on the dense DataFrame
# 'tf_df': the model is queried with rows of 'tf_idf_matrix', so fitting on
# the same object keeps fit and query inputs consistent (no feature-name
# mismatch warning) and does not need a dense copy of the data.
neighbors.fit(tf_idf_matrix)

In [None]:
# Query the fitted model with row 1200 of 'tf_idf_matrix' (one document's TF-IDF vector)
# 'n_neighbors = 11' returns the 11 closest rows; the query row is part of the fitted data, so it is returned as well
# 'dist' receives the distances and 'row' the matching row positions

dist, row = neighbors.kneighbors(tf_idf_matrix[1200], n_neighbors = 11)



In [None]:
# Distances of the 11 hits (the ~0 entry is the query document itself)
dist

array([[2.35608046e-08, 1.29526819e+00, 1.35416830e+00, 1.35504313e+00,
        1.35812230e+00, 1.36443514e+00, 1.36699966e+00, 1.36739562e+00,
        1.37002601e+00, 1.37206434e+00, 1.37467355e+00]])

In [None]:
# Row positions of the 11 hits, in the same order as 'dist'
row

array([[1200, 1229, 1048,  248,  918, 1663,  615,  284, 1658, 1173, 1686]])

In [None]:
# Look up the matched rows of 'nlp_clean' by position.
# Despite its name, 'closest_10' holds every hit returned by the query,
# i.e. the query row itself as well as its neighbours.

neighbor_positions = row[0]
closest_10 = nlp_clean.iloc[neighbor_positions]
closest_10

Unnamed: 0,URI,name,text,text_blobs
1200,<http://dbpedia.org/resource/Tenzin_Delek_Rinp...,Tenzin Delek Rinpoche,lithang tulku tenzin delek rinpoche or tenzing...,"(l, i, t, h, a, n, g, , t, u, l, k, u, , t, ..."
1229,<http://dbpedia.org/resource/Philip_Denwood>,Philip Denwood,philip denwood is a british tibetologist noted...,"(p, h, i, l, i, p, , d, e, n, w, o, o, d, , ..."
1048,<http://dbpedia.org/resource/Marek_Antoni_Nowi...,Marek Antoni Nowicki,marek a nowicki is a human rights lawyer he wa...,"(m, a, r, e, k, , a, , n, o, w, i, c, k, i, ..."
248,<http://dbpedia.org/resource/Guy_Sorman>,Guy Sorman,guy sorman born march 10 1944 paris france is ...,"(g, u, y, , s, o, r, m, a, n, , b, o, r, n, ..."
918,<http://dbpedia.org/resource/Habibollah_Latifi>,Habibollah Latifi,habibollah latifi is a kurdish iranian law stu...,"(h, a, b, i, b, o, l, l, a, h, , l, a, t, i, ..."
1663,<http://dbpedia.org/resource/Manfred_Nowak>,Manfred Nowak,manfred nowak born 26 june 1950 in bad aussee ...,"(m, a, n, f, r, e, d, , n, o, w, a, k, , b, ..."
615,<http://dbpedia.org/resource/Phan_Thanh_H%E1%B...,Phan Thanh H%E1%BA%A3i,phan thanh hi born c 1969 is a vietnamese diss...,"(p, h, a, n, , t, h, a, n, h, , h, i, , b, ..."
284,<http://dbpedia.org/resource/Hadi_Ghabel>,Hadi Ghabel,hadi ghabel is an iranian cleric and member of...,"(h, a, d, i, , g, h, a, b, e, l, , i, s, , ..."
1658,<http://dbpedia.org/resource/Martin_O'Brien_(h...,Martin O'Brien (humanitarian),martin obrien born in belfast northern ireland...,"(m, a, r, t, i, n, , o, b, r, i, e, n, , b, ..."
1173,<http://dbpedia.org/resource/Cong_Thanh_Do>,Cong Thanh Do,cong thanh do vietnamese thnh cng born ca 1959...,"(c, o, n, g, , t, h, a, n, h, , d, o, , v, ..."


In [None]:
# The query document (row 1200)
nlp_clean['text_blobs'][1200]

TextBlob("lithang tulku tenzin delek rinpoche or tenzing deleg tibetan wylie bstan dzin bde legs born 1950 in lithang tibet is a tibetan buddhist leader from garze sichuan he was arrested on april 7 2002 during a raid on jamyang choekhorling in garze sichuan china he was accused of being involved in a bomb attack on april 3 2002 on the central square of sichuans provincial capital chengduhe was convicted for alleged involvement in a series of unsolved bombings in his region by the chinese authorities and sentenced to death in december 2002 along with lobsang dhondup a 28year old assistant of his lobsang was executed almost immediately in late january 2003 marking the first execution of a tibetan for political crimes in 20 years tenzin deleks trial began on november 29 2002 before the local court in garze and was sentenced to death with a twoyear execution adjournment overseas human rights groups and united nations human rights experts protested that the case against him was seriously f

In [None]:
# The last of the returned neighbours (row 1686)
nlp_clean['text_blobs'][1686]

TextBlob("abdulwahab hussain ali ahmed esmael arabic is a bahraini political activist writer religious figure and philosopher he was one of the most prominent opposition leaders in the 1990s uprising when he was arrested twice for a total length of five years in which he was allegedly subjected to solitary confinement and torture after his release in 2001 he supported government reform plansin 2001 hussain chaired the committee which founded al wefaq bahrains main opposition party he urged opposition leaders to boycott 2002 parliamentary election after the king issued constitution of 2002 which he found going back on reform plans in 2003 hussain announced he would leave politics and stop issuing public statements and in 2005 he resigned from al wefaq in 2009 he returned to politics becoming the cofounder and official spokesman of al wafa islamic movementduring the bahraini uprising 2011present hussain played an important role leading protests calling for the downfall of regime and esta

In [None]:
# The first neighbour after the query itself (row 1229)
nlp_clean['text_blobs'][1229]

TextBlob("philip denwood is a british tibetologist noted for his work on traditional tibetan arts and handicrafts himalayan architecture and tibetan linguisticshe was born on august 29 in 1941 in workington cumberland where he also grew up in 1962 he gained his ba in geography and history at university college london ucl and later in 1964 an ma in architecture and town planning from liverpool university he became a research fellow in tibetan at soas in 1965 and became lecturer in tibetan in 1973 denwood describes his interest in himalayan cultures as purely accidental having completed his study at liverpool and based in england denwood was living next door to a friend of his from college david snellgrove snellgrove had not long returned from a trip to the himalayas and had brought five tibetan refugees back with him around the time of the first wave of exiles post 1959 denwoods intrigue about the situation with his friends house guests coupled with his interest in oriental architecture

In [None]:
# Another of the returned neighbours (row 615)
nlp_clean['text_blobs'][615]

TextBlob("phan thanh hi born c 1969 is a vietnamese dissident blogger blogging as anhbasg or anh ba si gn at the website independent journalists club phan discussed controversial topics including other dissidents vietnams maritime border with china and government corruption scandalsa lawyer by training phan was denied permission to practice due to his blogging and his involvement in protests in 2007 he was arrested after a protest against the beijing olympics and later placed under police surveillancein october 2010 he was arrested in ho chi minh city and charged with disseminating antistate information and materials carrying a maximum sentence of twenty years imprisonment he later confessed to the charges fellow independent journalists club posters nguyen van hai and ta phong tan were also arrested though their trial was scheduled to begin in august it was delayed indefinitely after tas mother dang thi kim lieng immolated herself in front of the government offices in bac lieu province

In [None]:
# Another of the returned neighbours (row 918)
nlp_clean['text_blobs'][918]

TextBlob("habibollah latifi is a kurdish iranian law student at azad university and a kurdish activist who has been charged with moharebeh waging war against god and sentenced to death by an islamic revolutionary court in iran he is charged with committing acts of violence a charge he denies in cooperation with the party of free life of kurdistan pjak in 2007latifi has been described as a straight a student and an athlete who loves nature and his countrys mountains he was arrested in october 2007 and sentenced to death in 2008 in a court session that lasted a few minutes and that followed four months of interrogation and torture his sentence was upheld by an appeals court in 2009 his lawyer was informed on thursday december 22 2010 that latifi was scheduled to be executed on sunday december 26 2010 in sanandaj prison in kurdistan province iran his execution was halted after his lawyer requested further investigation into his case in a letter to judiciary chief sadeq larijani latifi met

### Sentiment and Subjectivity

In [None]:
# investigate one of our random samples
# (row label 6562 of the full dataset 'nlp', which lies outside the 2000-row sample)
nlp.loc[6562]

URI                 <http://dbpedia.org/resource/Ping_Li>
name                                              Ping Li
text    ping li chinese pinyin l png is a professor of...
Name: 6562, dtype: object

In [None]:
# Text of row 1200, the document used for the sentiment/subjectivity analysis
sent_subj = nlp.loc[1200, 'text']

In [None]:
# Name of the person described in row 1200
nlp_clean['name'][1200]

'Tenzin Delek Rinpoche'

In [None]:
# Show the raw text selected above
sent_subj

'lithang tulku tenzin delek rinpoche or tenzing deleg tibetan wylie bstan dzin bde legs born 1950 in lithang tibet is a tibetan buddhist leader from garze sichuan he was arrested on april 7 2002 during a raid on jamyang choekhorling in garze sichuan china he was accused of being involved in a bomb attack on april 3 2002 on the central square of sichuans provincial capital chengduhe was convicted for alleged involvement in a series of unsolved bombings in his region by the chinese authorities and sentenced to death in december 2002 along with lobsang dhondup a 28year old assistant of his lobsang was executed almost immediately in late january 2003 marking the first execution of a tibetan for political crimes in 20 years tenzin deleks trial began on november 29 2002 before the local court in garze and was sentenced to death with a twoyear execution adjournment overseas human rights groups and united nations human rights experts protested that the case against him was seriously flawed tha

In [None]:
# Wrap the selected text in a TextBlob for the tagging and sentiment cells below
blob_ss = TextBlob(sent_subj)

In [None]:
# Show the blob
blob_ss

TextBlob("lithang tulku tenzin delek rinpoche or tenzing deleg tibetan wylie bstan dzin bde legs born 1950 in lithang tibet is a tibetan buddhist leader from garze sichuan he was arrested on april 7 2002 during a raid on jamyang choekhorling in garze sichuan china he was accused of being involved in a bomb attack on april 3 2002 on the central square of sichuans provincial capital chengduhe was convicted for alleged involvement in a series of unsolved bombings in his region by the chinese authorities and sentenced to death in december 2002 along with lobsang dhondup a 28year old assistant of his lobsang was executed almost immediately in late january 2003 marking the first execution of a tibetan for political crimes in 20 years tenzin deleks trial began on november 29 2002 before the local court in garze and was sentenced to death with a twoyear execution adjournment overseas human rights groups and united nations human rights experts protested that the case against him was seriously f

In [None]:
# Part-of-speech tags, as (word, tag) pairs
blob_ss.tags

[('lithang', 'NN'),
 ('tulku', 'NN'),
 ('tenzin', 'NN'),
 ('delek', 'NN'),
 ('rinpoche', 'NN'),
 ('or', 'CC'),
 ('tenzing', 'VBG'),
 ('deleg', 'JJ'),
 ('tibetan', 'JJ'),
 ('wylie', 'NN'),
 ('bstan', 'NN'),
 ('dzin', 'NN'),
 ('bde', 'NN'),
 ('legs', 'VBZ'),
 ('born', 'JJ'),
 ('1950', 'CD'),
 ('in', 'IN'),
 ('lithang', 'NN'),
 ('tibet', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('tibetan', 'JJ'),
 ('buddhist', 'NN'),
 ('leader', 'NN'),
 ('from', 'IN'),
 ('garze', 'JJ'),
 ('sichuan', 'NN'),
 ('he', 'PRP'),
 ('was', 'VBD'),
 ('arrested', 'VBN'),
 ('on', 'IN'),
 ('april', 'NN'),
 ('7', 'CD'),
 ('2002', 'CD'),
 ('during', 'IN'),
 ('a', 'DT'),
 ('raid', 'NN'),
 ('on', 'IN'),
 ('jamyang', 'NN'),
 ('choekhorling', 'NN'),
 ('in', 'IN'),
 ('garze', 'JJ'),
 ('sichuan', 'JJ'),
 ('china', 'NN'),
 ('he', 'PRP'),
 ('was', 'VBD'),
 ('accused', 'VBN'),
 ('of', 'IN'),
 ('being', 'VBG'),
 ('involved', 'VBN'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('bomb', 'NN'),
 ('attack', 'NN'),
 ('on', 'IN'),
 ('april', 'NN'),
 ('

In [None]:
# Sentiment of the dataset text as (polarity, subjectivity)
blob_ss.sentiment

Sentiment(polarity=0.029166666666666664, subjectivity=0.23749999999999996)

In [None]:
# Sentences TextBlob detects in the text
blob_ss.sentences

[Sentence("lithang tulku tenzin delek rinpoche or tenzing deleg tibetan wylie bstan dzin bde legs born 1950 in lithang tibet is a tibetan buddhist leader from garze sichuan he was arrested on april 7 2002 during a raid on jamyang choekhorling in garze sichuan china he was accused of being involved in a bomb attack on april 3 2002 on the central square of sichuans provincial capital chengduhe was convicted for alleged involvement in a series of unsolved bombings in his region by the chinese authorities and sentenced to death in december 2002 along with lobsang dhondup a 28year old assistant of his lobsang was executed almost immediately in late january 2003 marking the first execution of a tibetan for political crimes in 20 years tenzin deleks trial began on november 29 2002 before the local court in garze and was sentenced to death with a twoyear execution adjournment overseas human rights groups and united nations human rights experts protested that the case against him was seriously 

In [None]:
# Length of the blob -- NOTE(review): presumably the character count of the text; confirm
len(blob_ss)

1741

### Wikipedia API

In [None]:
#install Wikipedia API
%%capture
!pip3 install wikipedia-api

In [None]:
import wikipediaapi

In [None]:
# Pull out the target person's page from the English Wikipedia
topic = 'Tenzin Delek Rinpoche'

# Pass the arguments by keyword: in wikipedia-api >= 0.6 the first positional
# argument is the user agent, not the language. Use a descriptive user agent
# instead of a placeholder string.
wikip = wikipediaapi.Wikipedia(
    user_agent='project_5_NLP/1.0 (https://github.com/AKSeavey/5_NLP-and-Clustering)',
    language='en',
)
page_ex = wikip.page(topic)

# Plain text of the article; shown as the cell output
wiki_text = page_ex.text
wiki_text

'Lithang Tulku Tenzin Delek Rinpoche or Tenzing Deleg (Tibetan: བསྟན་འཛིན་བདེ་ལེགས་, Wylie: Bstan \'dzin bde legs) (1950 – 2015) was a Tibetan Buddhist leader from Garze, Sichuan. He is also known for working to develop social, medical, educational and religious institutions for Tibetan nomads in eastern Tibet, as an advocate for environmental conservation in the face of indiscriminate logging and mining projects, and as a mediator between Tibetans and Chinese.\n\nPersonal life\nRinpoche was born in Lithang, Tibet.\nOn 7 April 2002 he was arrested during a raid on Jamyang Choekhorling in Garze, Sichuan, China, and accused of being involved in a bomb attack on 3 April 2002 on the central square of Sichuan\'s provincial capital, Chengdu.He was convicted for alleged involvement in a series of unsolved bombings in his region by the Chinese authorities and sentenced to death in December 2002 along with Lobsang Dhondup, a 28-year-old assistant of his. Lobsang was executed almost immediately 

In [None]:
# Normalise the article text before any processing: drop the possessive "'s",
# turn newlines into spaces, and remove the remaining apostrophes,
# parentheses and double quotes
char_map = str.maketrans({"\n": " ", "'": None, "(": None, ")": None, '"': None})
wiki_text_clean = wiki_text.replace("'s", "").translate(char_map)
wiki_text_clean

'Lithang Tulku Tenzin Delek Rinpoche or Tenzing Deleg Tibetan: བསྟན་འཛིན་བདེ་ལེགས་, Wylie: Bstan dzin bde legs 1950 – 2015 was a Tibetan Buddhist leader from Garze, Sichuan. He is also known for working to develop social, medical, educational and religious institutions for Tibetan nomads in eastern Tibet, as an advocate for environmental conservation in the face of indiscriminate logging and mining projects, and as a mediator between Tibetans and Chinese.  Personal life Rinpoche was born in Lithang, Tibet. On 7 April 2002 he was arrested during a raid on Jamyang Choekhorling in Garze, Sichuan, China, and accused of being involved in a bomb attack on 3 April 2002 on the central square of Sichuan provincial capital, Chengdu.He was convicted for alleged involvement in a series of unsolved bombings in his region by the Chinese authorities and sentenced to death in December 2002 along with Lobsang Dhondup, a 28-year-old assistant of his. Lobsang was executed almost immediately in late Janua

In [None]:
# Convert the cleaned Wikipedia text to a TextBlob
wiki_blob = TextBlob(wiki_text_clean)

In [None]:
# Sentiment of the cleaned Wikipedia article as (polarity, subjectivity)
wiki_blob.sentiment

Sentiment(polarity=0.09829931972789115, subjectivity=0.28775510204081634)

In [None]:
# Names of all matched rows, in ranking order
# (the first entry is the query person, followed by the neighbours)
name_list = list(closest_10['name'])
name_list

['Tenzin Delek Rinpoche',
 'Philip Denwood',
 'Marek Antoni Nowicki',
 'Guy Sorman',
 'Habibollah Latifi',
 'Manfred Nowak',
 'Phan Thanh H%E1%BA%A3i',
 'Hadi Ghabel',
 "Martin O'Brien (humanitarian)",
 'Cong Thanh Do',
 'Abdulwahab Hussain']

In [None]:
# Wikipedia page text for each person, keyed by the name as it appears in 'name_list'
wiki_pages = {}

# Initialize the Wikipedia API for the English Wikipedia.
# Arguments are passed by keyword: in wikipedia-api >= 0.6 the first positional
# argument is the user agent, so a positional 'english' would NOT select the language.
wiki = wikipediaapi.Wikipedia(
    user_agent='project_5_NLP/1.0 (https://github.com/AKSeavey/5_NLP-and-Clustering)',
    language='en',
)

# Iterate over each person in the 'name_list'
for person in name_list:
  # Some dataset names are percent-encoded (e.g. 'Phan Thanh H%E1%BA%A3i');
  # decode them so the lookup uses the real page title
  get_page = wiki.page(unquote(person))
  wiki_pages[person] = get_page.text

In [None]:
# Number of entries in 'wiki_pages' (one per distinct name, whatever text the lookup returned)
len(wiki_pages)

11

In [None]:
# Corpus for the Wikipedia comparison: the target's page goes first (index 0),
# followed by every page stored in 'wiki_pages', in insertion order.
# NOTE(review): 'name_list' starts with the target person, so the target's page
# presumably appears a second time at index 1 -- confirm this is intended.
combined_pages = [wiki_text, *wiki_pages.values()]

In [None]:
# Create a TF-IDF vectorizer with default settings to convert the page texts into numerical vectors
tfidf_wiki = TfidfVectorizer()

# Fit on the combined pages and transform them: 'wiki_vectors' has one row per entry of 'combined_pages'
wiki_vectors = tfidf_wiki.fit_transform(combined_pages)

In [None]:
# Number of nearest neighbors (k) to find.
# Use the corpus size instead of a fixed 10: the corpus holds the target's page
# plus one page per name, so a smaller k would silently drop the most distant
# pages from the ranking.
k = wiki_vectors.shape[0]

# Create a KNN model using the 'cosine' distance metric
knn = NearestNeighbors(n_neighbors=k, metric='cosine')

# Fit the KNN model on the TF-IDF vectors of the Wikipedia pages
knn.fit(wiki_vectors)

In [None]:
# Find the k-nearest neighbors for the target 'wiki_text' (row 0 of 'wiki_vectors')
# 'nn_dist' receives the cosine distances and 'nn_indices' the matching row numbers of 'combined_pages'
nn_dist, nn_indices = knn.kneighbors(wiki_vectors[0])

In [None]:
# Print the nearness ranking of the nearest neighbors.
#
# Index 0 of the fitted corpus is the target page, so 'neighbor_index - 1'
# maps a neighbor index back to 'name_list'. 'name_list' also contains the
# target person ('topic'), so that second copy is skipped as well; otherwise
# the target would be listed as its own nearest neighbor.
# A separate counter keeps the printed ranks contiguous (1, 2, 3, ...) even
# though some neighbors are skipped.

print("Nearness Ranking:")
rank = 0
for neighbor_index, distance in zip(nn_indices[0], nn_dist[0]):
    if neighbor_index == 0:  # Skip target
        continue
    person = name_list[neighbor_index - 1]  # Subtract 1 to adjust for target being at index 0
    if person == topic:  # Skip the target's duplicate entry
        continue
    rank += 1
    print(f"{rank}. {person}: {distance:.3f}")

Nearness Ranking:
1. Tenzin Delek Rinpoche: 0.000
3. Abdulwahab Hussain: 0.566
4. Manfred Nowak: 0.594
5. Marek Antoni Nowicki: 0.596
6. Guy Sorman: 0.616
7. Philip Denwood: 0.633
8. Hadi Ghabel: 0.645
9. Habibollah Latifi: 0.659
10. Cong Thanh Do: 0.663


### ~archive

In [None]:
# blob_tfidf = blob_eda.words[0:1402].stem()

In [None]:
# count_vectorizer = CountVectorizer()

In [None]:
# word_counts = count_vectorizer.fit_transform(blob_tfidf)

In [None]:
# tf_idf_tran = TfidfTransformer()

In [None]:
# tf_idf_matrix = tf_idf_tran.fit_transform(word_counts)

In [None]:
# tf_df = pd.DataFrame(tf_idf_matrix.toarray(), columns = count_vectorizer.get_feature_names_out())
# tf_df.describe()