In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

from data.texts import TITLES, CORPUS, remove_stopwords

In [2]:
from config import get_sessionmaker
from models import Page, PageLink, PageTalk, PageQuality

In [3]:
DATABASE_URI = "postgresql://postgres:postgres@localhost:5432/complete_wikipedia"
Session = get_sessionmaker(db_uri=DATABASE_URI)
s = Session()

In [7]:
s.query(
         PageLink, Page
    ).filter(
         PageQuality.page_id == PageTalk.page_id,
    ).filter(
         PageTalk.page_title == Page.page_title,
    ).filter(
        Page.page_id == PageLink.pl_from,
    ).filter(PageLink.pl_from == 25202).first()

(<PageLink: (
	pl_from=25202,
	pl_titles[:10]='["'Zero-point_energy'", "'Zeeman_effect'", "'Young\\'s_interference_experiment'", "'Yoichiro_Nambu'", "'Yang_Chen-Ning'", "'Yakir_Aharonov'", "'Work_(physics)'", "'Wolfgang_Pauli'", "'Willis_Lamb'", "'William_Shockley'"]',
	)>, <Page: (
	page_id=25202,
	page_title='Quantum_mechanics',
	)>)

In [10]:
tfidf_vectorizer = TfidfVectorizer()
corpus_no_stop = []
for doc in CORPUS:
    print(len(remove_stopwords(doc).split()))
    corpus_no_stop.append(remove_stopwords(doc))

681
770
571
710


In [23]:
tfidf_vector = tfidf_vectorizer.fit_transform(corpus_no_stop)
tfidf_df = pd.DataFrame(tfidf_vector.toarray(), index=TITLES, columns=tfidf_vectorizer.get_feature_names())

In [24]:
tfidf_df.head

<bound method NDFrame.head of                         000        10       100       102       109        11  \
Quantum Mechanics  0.000000  0.013336  0.000000  0.020893  0.020893  0.013336   
Quantum Computing  0.000000  0.008142  0.010057  0.000000  0.000000  0.008142   
Dog                0.068542  0.027745  0.017136  0.000000  0.000000  0.013873   
War of the Worlds  0.083505  0.000000  0.000000  0.000000  0.000000  0.000000   

                        111        12        13        14  ...  wozencraft  \
Quantum Mechanics  0.020893  0.026671  0.013336  0.013336  ...    0.000000   
Quantum Computing  0.000000  0.008142  0.016284  0.008142  ...    0.000000   
Dog                0.000000  0.013873  0.027745  0.041618  ...    0.065203   
War of the Worlds  0.000000  0.000000  0.000000  0.000000  ...    0.000000   

                     writer   wrought     years       yet     yield      yuri  \
Quantum Mechanics  0.000000  0.000000  0.000000  0.000000  0.020893  0.000000   
Quantum Com

In [25]:
compare_idxs = [[0, 1], [0, 2], [0, 3],
                [1, 2], [1, 3],
                [2, 3]]
for idx in compare_idxs:
    sim = cosine_similarity(tfidf_vector[idx[0]], tfidf_vector[idx[1]])
    print(f"- {TITLES[idx[0]]} -> {TITLES[idx[1]]}:")
    print(sim[0][0])

- Quantum Mechanics -> Quantum Computing:
0.5155829281224974
- Quantum Mechanics -> Dog:
0.023219291839305763
- Quantum Mechanics -> War of the Worlds:
0.03701071939759966
- Quantum Computing -> Dog:
0.019036721589325723
- Quantum Computing -> War of the Worlds:
0.0186194339439958
- Dog -> War of the Worlds:
0.04480068990958205


## Without stopword removal:
### Similarities:
- Quantum Mechanics -> Quantum Computing:
0.6766603700523258
- Quantum Mechanics -> Dog:
0.5514199048054604
- Quantum Mechanics -> War of the Worlds:
0.6617423653602914
- Quantum Computing -> Dog:
0.3806258895317568
- Quantum Computing -> War of the Worlds:
0.4400298452166757
- Dog -> War of the Worlds:
0.6512871505741027

### Top 5 words:

document	                term     	tfidf
- 3420	Dog	                 dog	    0.586827
- 4461	Dog	                 wolf	    0.347749
- 3423	Dog	                 domestic	0.217343
- 3217	Dog	                 canis	    0.173875
- 3064	Dog	                 ago	    0.171356
- 2579	Quantum Computing	quantum	    0.754287
- 1801	Quantum Computing	computer	0.191344
- 1716	Quantum Computing	 can	    0.181029
- 2581	Quantum Computing	qubit	    0.178587
- 1766	Quantum Computing	classical	0.170972
- 1086	Quantum Mechanics	quantum	    0.560050
- 841	Quantum Mechanics	mechanics	0.280025
- 981	Quantum Mechanics	particle	0.188035
- 1341	Quantum Mechanics	theory	    0.181193
- 1018	Quantum Mechanics	physics	    0.167142
- 5502	War of the Worlds	planet	    0.291268
- 5885	War of the Worlds	upon	    0.208763
- 5299	War of the Worlds	mars	    0.185352
- 5368	War of the Worlds	must	    0.185352
- 4931	War of the Worlds	earth	    0.158873

## With stopword removal:
### Similarities:
- Quantum Mechanics -> Quantum Computing:
0.5155829281224974
- Quantum Mechanics -> Dog:
0.023219291839305763
- Quantum Mechanics -> War of the Worlds:
0.03701071939759966
- Quantum Computing -> Dog:
0.019036721589325723
- Quantum Computing -> War of the Worlds:
0.0186194339439958
- Dog -> War of the Worlds:
0.04480068990958205

### Top 5 words:
	document	            term    	tfidf
- 3566	Dog	                 dog	   0.371542
- 4499	Dog	                 the	   0.631924
- 4097	Dog	                  of	   0.244152
- 3233	Dog	                 and	   0.229791
- 4658	Dog	                 wolf	   0.220173
- 2686	Quantum Computing	quantum	   0.631184
- 2940	Quantum Computing	 the	   0.323079
- 2538	Quantum Computing	  of	   0.222813
- 1878	Quantum Computing	computer   0.160115
- 2324	Quantum Computing	  is	   0.155969
- 1381	Quantum Mechanics	  the	   0.491110
- 979	Quantum Mechanics	  of	   0.371841
- 1127	Quantum Mechanics	quantum	   0.360391
- 1415	Quantum Mechanics	  to	   0.189428
- 115	Quantum Mechanics	 and	   0.182412
- 6058	War of the Worlds	 the	   0.584745
- 5656	War of the Worlds	  of	   0.422637
- 4792	War of the Worlds	 and	   0.277899
- 6056	War of the Worlds	 that	   0.179476
- 5446	War of the Worlds	 it	       0.155792

In [20]:
tfidf_df = tfidf_df.stack().reset_index()
tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'document','level_1': 'term', 'level_2': 'term'})


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe7730ff910>

In [27]:
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(5)

Unnamed: 0,document,term,tfidf
3420,Dog,dog,0.586827
4461,Dog,wolf,0.347749
3423,Dog,domestic,0.217343
3217,Dog,canis,0.173875
3064,Dog,ago,0.171356
2579,Quantum Computing,quantum,0.754287
1801,Quantum Computing,computer,0.191344
1716,Quantum Computing,can,0.181029
2581,Quantum Computing,qubit,0.178587
1766,Quantum Computing,classical,0.170972
