# Building Text Similarity

# Initialization

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import wikipedia

In [2]:
import pandas as pd

In [3]:
import numpy as np

# Read Data

In [4]:
import sqlite3

# Location of the local sdow SQLite database file (relative to the notebook's
# working directory).
sdow_database = './sdow/sdow/sdow.sqlite'

# One connection and one cursor are shared by every query cell below.
# check_same_thread=False turns off sqlite3's check that the connection is only
# used from the thread that created it.
sdow_conn = sqlite3.connect(
    sdow_database,
    check_same_thread=False,
)
sdow_cursor = sdow_conn.cursor()

## Local SQLite3 -- Get PageLink Data

SQLite Commands : https://www.sqlitetutorial.net/sqlite-tutorial/sqlite-show-tables/

`sqlite3 ./sdow/sdow.sqlite`

```text
sqlite> .tables
links      pages      redirects
```

### How to Execute Raw Query

In [5]:
# Minimal raw-SQL example: run one statement on the shared cursor and collect
# every returned row as a list of tuples.
query = 'SELECT * FROM pages LIMIT 5;'
results = sdow_cursor.execute(query).fetchall()

In [6]:
# Show the rows fetched by the query above; each tuple is one record of `pages`.
results

[(10, 'AccessibleComputing', 1),
 (12, 'Anarchism', 0),
 (13, 'AfghanistanHistory', 1),
 (14, 'AfghanistanGeography', 1),
 (15, 'AfghanistanPeople', 1)]

### Read SQLite3 PageLink Data into Pandas

In [7]:
# Maximum number of joined pages/links rows to load from SQLite; used as the
# LIMIT of the page-link query in the next cell.
NUM_ROWS = 100

In [8]:
# Join every page with its link record and load at most NUM_ROWS rows.
# The row cap is bound as a query parameter ("?") instead of being spliced into
# the SQL text with str.format, so the statement text stays constant and the
# value is passed to SQLite separately.
# NOTE: SELECT * keeps the `id` column of both tables, so the resulting frame
# contains two columns named `id` (8 columns in total).
dfPageLinks = pd.read_sql_query(
    "SELECT * FROM pages JOIN links ON pages.id = links.id LIMIT ?",
    sdow_conn,
    params=(NUM_ROWS,),
)
display(dfPageLinks.shape)
dfPageLinks

(100, 8)

Unnamed: 0,id,title,is_redirect,id.1,outgoing_links_count,incoming_links_count,outgoing_links,incoming_links
0,12,Anarchism,0,12,1361,4722,10005|10030|10039756|1007026|10113|1011693|101...,10005|10025826|10030|100314|10031794|100390|10...
1,25,Autism,0,25,645,2261,10007751|10008223|10008586|10048|10070413|1007...,10007280|10007751|1002348|1002689|10044711|100...
2,39,Albedo,0,39,292,3172,10176541|10180397|1029711|1038280|1067415|1077...,1000088|1000106|1000115|1000133|1000141|100016...
3,290,A,0,290,240,870,10081|100824|103973|104433|1048610|10831|10983...,1009486|1017421|10216|1028188|10309501|1032859...
4,303,Alabama,0,303,1518,15461,10021774|100416|100580|100582|1006137|10072406...,10003335|10003629|10007041|10009034|10009060|1...
...,...,...,...,...,...,...,...,...
95,746,Azerbaijan,0,746,1196,23307,1000530|1006199|1009423|10132427|1016958|10186...,1000167|10005198|1000530|10006281|10025195|100...
96,748,Amateur_astronomy,0,748,207,524,1000626|10134|1018868|10490149|10648303|108111...,10017597|10101941|10150544|10160082|1018868|10...
97,751,Aikido,0,751,513,1020,10050914|100542|1013936|102067|10243888|102920...,10029743|10050914|1007709|1013936|10140691|101...
98,752,Art,0,752,458,4125,1010808|102036|1024345|102495|102958|1038052|1...,10004|10005|10027934|100375|10050914|10051110|...


## Fetch Page Summaries (API or Nate's CSV Dump)

### How to fetch from Wikipedia API

Fetching article summaries manually from the Wikipedia API (kept for reference; not executed in this notebook):

```python
article_ids = [
    'Adam_Smith','Bill_Clinton','Bill_Gates','Economics','Free_market'
]
article_summaries = [wikipedia.summary(artId) for artId in dfPageLinks.title]
```

How to read the CSV file into pandas manually (kept for reference).
A better way to read the subset file into pandas is shown in the "Read from CSV file" section below.

```python
f = open("sdow/database/subset_summaries.csv", "r")
arrLines = []
for line in f.readlines()[:20]:
    print(line)
    arrLines.append(re.split("(^\d+) ", line))
f.close()
```

## Read from CSV file

In [9]:
# Peek at the first two lines of the raw file. Despite the .csv extension the
# fields are space-separated: <page id> <title> "<quoted summary>".
!head -n2 'sdow/database/subset_summaries.csv'

1814 Adam_Smith "Adam Smith  (16 June [O.S. 5 June] 1723 – 17 July 1790) was a Scottish economist, philosopher and author as well as a moral philosopher, a pioneer of political economy and a key figure during the Scottish Enlightenment, also known as ''The Father of Economics'' or ''The Father of Capitalism''. Smith wrote two classic works, The Theory of Moral Sentiments (1759) and An Inquiry into the Nature and Causes of the Wealth of Nations (1776). The latter, often abbreviated as The Wealth of Nations, is considered his magnum opus and the first modern work of economics. In his work, Adam Smith introduced his theory of absolute advantage.Smith studied social philosophy at the University of Glasgow and at Balliol College, Oxford, where he was one of the first students to benefit from scholarships set up by fellow Scot John Snell. After graduating, he delivered a successful series of public lectures at the University of Edinburgh, leading him to collaborate with David Hume during the

In [10]:
# Load the first 20 summaries. The file has no header row and separates its
# three fields (page id, title, quoted summary text) with single spaces, so the
# column names are supplied here.
dfSummaries = pd.read_csv(
    'sdow/database/subset_summaries.csv',
    sep=" ",
    header=None,
    names=['id', 'title', 'summary'],
    nrows=20,
)
display(dfSummaries.shape)
dfSummaries

(20, 3)

Unnamed: 0,id,title,summary
0,1814,Adam_Smith,Adam Smith (16 June [O.S. 5 June] 1723 – 17 J...
1,3356,Bill_Clinton,William Jefferson Clinton (born William Jeffer...
2,3747,Bill_Gates,"William Henry Gates III (born October 28, 195..."
3,8182,Dwight_D._Eisenhower,"Dwight David ""Ike"" Eisenhower ( EYE-zən-how-ər..."
4,9223,Economics,Economics () is the social science that studie...
5,10979,Franklin_D._Roosevelt,"Franklin Delano Roosevelt (, ; January 30, 188..."
6,11826,Free_market,"In economics, a free market is a system in whi..."
7,11955,George_H._W._Bush,"George Herbert Walker Bush (June 12, 1924 – No..."
8,13765,Henry_Kissinger,Henry Alfred Kissinger (; German: [ˈkɪsɪŋɐ]; b...
9,14567,International_trade,International trade is the exchange of capital...


# Train PageLink 'Recommender'

In [11]:
# Turn every summary into one TF-IDF row vector (one row per dfSummaries row,
# in the same order).
matTfidf = TfidfVectorizer().fit_transform(dfSummaries.summary)

# Row-by-row dot products give the (n_summaries x n_summaries) similarity
# matrix. TfidfVectorizer L2-normalises rows by default, so each entry is the
# cosine similarity of two summaries and the diagonal is 1.0.
# `@` is used instead of `*` because `*` is only a matrix product for the
# legacy scipy sparse *matrix* classes; for sparse arrays / ndarrays it is an
# elementwise product. `@` always means matrix multiplication.
pairwise_similarity = matTfidf @ matTfidf.T


In [12]:
# Densify the sparse similarity matrix and render it as a colour-graded table
# (row/column i corresponds to row i of dfSummaries).
(
    pd.DataFrame(pairwise_similarity.toarray())
    .style
    .background_gradient()
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.34416,0.271927,0.389718,0.273005,0.377223,0.280306,0.293742,0.232377,0.146762,0.267232,0.301807,0.262899,0.240014,0.372009,0.277152,0.365021,0.227119,0.368709,0.354048
1,0.34416,1.0,0.329413,0.504073,0.210774,0.506194,0.260145,0.501848,0.32029,0.175846,0.312133,0.381544,0.299127,0.309711,0.3772,0.354789,0.450059,0.32966,0.510689,0.50367
2,0.271927,0.329413,1.0,0.365515,0.183373,0.359764,0.209474,0.282992,0.229086,0.128425,0.25174,0.289654,0.254819,0.240729,0.287812,0.278545,0.331999,0.225256,0.342778,0.353151
3,0.389718,0.504073,0.365515,1.0,0.219089,0.57046,0.273966,0.440179,0.354007,0.161464,0.321094,0.47779,0.314985,0.373578,0.441833,0.414741,0.534626,0.324216,0.566373,0.576079
4,0.273005,0.210774,0.183373,0.219089,1.0,0.251007,0.25585,0.177629,0.165073,0.173107,0.224713,0.221122,0.212593,0.163481,0.247715,0.177866,0.212059,0.193024,0.210859,0.208263
5,0.377223,0.506194,0.359764,0.57046,0.251007,1.0,0.289578,0.458538,0.359169,0.170779,0.345809,0.482108,0.340094,0.342193,0.42806,0.40463,0.514934,0.37732,0.554859,0.548552
6,0.280306,0.260145,0.209474,0.273966,0.25585,0.289578,1.0,0.227267,0.198183,0.185446,0.243499,0.248488,0.240958,0.211013,0.28761,0.207103,0.27188,0.23398,0.257704,0.256958
7,0.293742,0.501848,0.282992,0.440179,0.177629,0.458538,0.227267,1.0,0.280034,0.15085,0.272428,0.354627,0.249623,0.293214,0.317097,0.356442,0.409524,0.369978,0.503509,0.520729
8,0.232377,0.32029,0.229086,0.354007,0.165073,0.359169,0.198183,0.280034,1.0,0.149878,0.227787,0.319667,0.239999,0.217946,0.273532,0.260097,0.337883,0.234454,0.322831,0.36865
9,0.146762,0.175846,0.128425,0.161464,0.173107,0.170779,0.185446,0.15085,0.149878,1.0,0.280062,0.156029,0.139715,0.142691,0.154545,0.125651,0.163434,0.275634,0.167364,0.147703


In [None]:
target = 20979  # page id (dfSummaries "id" column) to look up similarities for

# Row/column i of pairwise_similarity corresponds to row i of dfSummaries,
# because the TF-IDF matrix was fitted on dfSummaries.summary in that order.
# getcol() needs that integer position -- not the DataFrame itself -- so the
# page id is first translated into its row position.
target_positions = np.flatnonzero(dfSummaries["id"].to_numpy() == target)
if target_positions.size == 0:
    # Only a subset of summaries is loaded, so the id may simply be absent.
    raise KeyError(
        "page id {} is not among the loaded summaries".format(target)
    )

# Similarity of the target page to every loaded page, labelled by title and
# ordered from most to least similar (the target itself comes first at 1.0).
target_similarity = pd.Series(
    pairwise_similarity.getcol(int(target_positions[0])).toarray().ravel(),
    index=dfSummaries["title"],
    name="similarity",
).sort_values(ascending=False)
target_similarity