### Objectives

- Brief Intro to MongoDB in python with pymongo
- Utilize TFIDF with SVD - Singular Value Decomposition - to implement Latent Semantic Search
- Implement Latend Dirchlet Allocation

In [1]:
!pip install pymongo

[31mthinc 6.10.2 requires pathlib<2.0.0,>=1.0.0, which is not installed.[0m
[31mspacy 2.0.11 requires pathlib, which is not installed.[0m
[31mmkl-random 1.0.1 requires cython, which is not installed.[0m
[31mmkl-fft 1.0.0 requires cython, which is not installed.[0m
[31mspacy 2.0.11 has requirement regex==2017.4.5, but you'll have regex 2017.11.9 which is incompatible.[0m
[31mipywidgets 7.1.0 has requirement widgetsnbextension~=3.1.0, but you'll have widgetsnbextension 3.0.2 which is incompatible.[0m


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pymongo 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
%matplotlib inline

In [10]:
cli = pymongo.MongoClient('54.202.16.122', 27016)

In [11]:
cli.database_names()

['admin', 'local', 'my_database', 'test', 'wikipedia']

In [12]:
cli.wikipedia.collection_names()

['Machine Learning 2', 'Business Software 2']

In [29]:
wikidb = cli.wikipedia

In [30]:
def get_list_pages(col_name, ip='54.202.16.122'):
    cli = pymongo.MongoClient(ip, 27016)
    wikidb = cli.wikipedia
    col_pages = wikidb.get_collection(col_name)
    
    cursor = col_pages.find()
    
    text_list = []
    for entry in list(cursor):
        text_list.append(entry)
    return(text_list)

In [31]:
ml_df = pd.DataFrame(get_list_pages('Machine Learning 2'))
ml_df[:1]

Unnamed: 0,Category,_id,categories,page_id,text,title
0,Machine Learning,5a14b44b8423e10105ebeeac,"[Category:Artificial intelligence, Category:Ar...",9583985,A '''committee machine''' is a type of [[artif...,Committee machine


In [32]:
bizsoft_df = pd.DataFrame(get_list_pages('Business Software 2'))
bizsoft_df.head()

Unnamed: 0,Category,_id,categories,page_id,text,title
0,Business Software,5a14b4c08423e10105ebf234,"[Category:All orphaned articles, Category:Busi...",32797209,{{multiple issues|\n{{COI|date=September 2011}...,BRFplus
1,Business Software,5a14b4c08423e10105ebf235,"[Category:Emergency management software, Categ...",39584994,{{Infobox software\n| name = '''NHS Pathways''...,NHS Pathways
2,Business Software,5a14b4c08423e10105ebf236,"[Category:2002 video games, Category:All artic...",9101010,{{unreferenced|date=June 2015}}\n{{Infobox vid...,Industry Giant II
3,Business Software,5a14b4c08423e10105ebf237,"[Category:Business software companies, Categor...",32094284,\n{{Infobox company\n| name = Phor...,Phorest
4,Business Software,5a14b4c08423e10105ebf238,"[Category:Business software companies, Categor...",42590340,{{Infobox company\n| name = ObjectSecurity...,ObjectSecurity


In [33]:
wiki_df = pd.concat([ml_df, bizsoft_df], axis=0)

In [34]:
wiki_df.head()

Unnamed: 0,Category,_id,categories,page_id,text,title
0,Machine Learning,5a14b44b8423e10105ebeeac,"[Category:Artificial intelligence, Category:Ar...",9583985,A '''committee machine''' is a type of [[artif...,Committee machine
1,Machine Learning,5a14b44b8423e10105ebeead,"[Category:Algorithmic trading, Category:Machin...",37787103,The '''universal portfolio algorithm''' is a p...,Universal portfolio algorithm
2,Machine Learning,5a14b44b8423e10105ebeeae,[Category:All articles with dead external link...,8220913,{{About|the neural network|other uses|Adaline ...,ADALINE
3,Machine Learning,5a14b44b8423e10105ebeeaf,"[Category:All stub articles, Category:Artifici...",2070605,'''Stochastic neural networks''' are a type of...,Stochastic neural network
4,Machine Learning,5a14b44b8423e10105ebeeb0,[Category:Data mining and machine learning sof...,32867182,{{Infobox software\n| name =...,Waffles (machine learning)


In [35]:
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('\\ufeff', '', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]','',text)
    
    return text

In [36]:
wiki_df['text'] = wiki_df['text'].map(str)
wiki_df['text'] = wiki_df['text'].apply(cleaner)

In [37]:
wiki_df['text'].iloc[1]

'the universal portfolio algorithm is a portfolio selection algorithm from the field of machine learning and information theory the algorithm learns adaptively from historical data and maximizes the logoptimal growth rate in the long run it was introduced by the late stanford university information theorist thomas m coverrefcite journal  urlhttponlinelibrarywileycomdoijtbxabstract  titleuniversal portfolios  firstthomas mlastcover  journalmathematical finance  volume  issue  pages  year  doijtbxrefthe algorithm rebalances the portfolio at the beginning of each trading period at the beginning of the first trading period it starts with a naive diversification in the following trading periods the portfolio composition depends on the historical total return of all possible constantrebalanced portfoliosreferencesreflistcategorymachine learningcategoryalgorithmic tradingcategoryportfolio theories'

In [38]:
tfidif_vector = TfidfVectorizer(min_df = 5, stop_words='english')

In [39]:
wiki_pages_matrix_sparse = tfidif_vector.fit_transform(wiki_df['text'])
wiki_pages_matrix_sparse

<2296x12143 sparse matrix of type '<class 'numpy.float64'>'
	with 449638 stored elements in Compressed Sparse Row format>

In [41]:
wiki_pages_df_tfdf = pd.DataFrame(wiki_pages_matrix_sparse.toarray(),
                                  index = wiki_df.index,
                                  columns = tfidif_vector.get_feature_names())

In [42]:
wiki_pages_df_tfdf.head()

Unnamed: 0,aa,aaai,aalst,aaron,ab,abacus,abandoned,abbreviated,abbreviation,abbreviations,...,zoho,zone,zones,zoo,zoom,zos,zoubin,zur,zurich,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
n_components = 100
SVD = TruncatedSVD(n_components)
component_names = ['component_' + str(i) for i in range(n_components)]

In [44]:
wiki_svd_matrix = SVD.fit_transform(wiki_pages_df_tfdf)
wiki_svd_matrix

array([[ 7.51758446e-02,  1.11286901e-01,  2.34817646e-03, ...,
        -1.11538713e-02, -1.97799844e-02, -6.09279517e-02],
       [ 1.10386761e-01,  1.18631511e-01,  1.53684463e-04, ...,
         4.16306891e-03, -2.85932059e-02,  2.28807921e-02],
       [ 1.60841511e-01,  2.96246648e-01, -5.06564991e-04, ...,
         2.24719128e-02, -1.46926411e-02,  2.71341969e-02],
       ...,
       [ 2.53373018e-01, -6.47309271e-02,  3.87804186e-02, ...,
        -1.05548918e-02, -6.99394301e-03,  5.76261292e-02],
       [ 1.16079353e-01, -3.19340695e-02, -5.24356469e-02, ...,
         3.21583258e-02, -3.15847901e-02,  2.78358575e-02],
       [ 2.09529616e-01, -5.71150431e-02,  3.97719171e-01, ...,
        -6.50617066e-03, -1.88342216e-02, -5.97654778e-02]])

In [46]:
wiki_svd_matrix.shape

(2296, 100)

In [49]:
search_term = 'Artificial Intelligence'
search_term_vec = tfidif_vector.transform([search_term])

In [50]:
search_term_vec

<1x12143 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [51]:
search_term_svd = SVD.transform(search_term_vec)

In [52]:
search_term_svd

array([[ 0.06273467,  0.08473824,  0.00596795, -0.04397423, -0.07190256,
         0.18031957,  0.03783672, -0.13339734, -0.06885053, -0.00293547,
        -0.00919125,  0.04777775, -0.03522506,  0.0284836 , -0.03807505,
        -0.0585073 ,  0.0704974 ,  0.07445803,  0.02325281,  0.04112419,
        -0.01232889, -0.06714794, -0.03924552,  0.00838057,  0.01209309,
        -0.00701758, -0.013238  ,  0.05380962,  0.04096959, -0.09997561,
        -0.0591631 , -0.02020825,  0.05967524, -0.01391657, -0.01190068,
        -0.05477126, -0.06542732, -0.02310838, -0.01522233,  0.10012421,
        -0.07792672,  0.17150863,  0.13956576,  0.01036835,  0.05137271,
        -0.15928634, -0.18011845, -0.05656377,  0.08565318,  0.04656801,
         0.10836968,  0.07390866, -0.00942073,  0.03089432, -0.02821143,
         0.01162539, -0.02905168, -0.25988805, -0.07233992,  0.10415012,
        -0.02577432, -0.0075142 ,  0.12153732, -0.04160252,  0.08156998,
         0.00355035,  0.02846494,  0.06651899,  0.1

In [53]:
cosine_similarities = wiki_svd_matrix.dot(search_term_svd.T).ravel()

In [54]:
cosine_similarities

array([ 0.05573261, -0.00492982,  0.01494922, ...,  0.03182179,
       -0.00856272, -0.00627889])

In [55]:
cosine_similarities.shape

(2296,)

In [56]:
cosine_similarities.argsort()[:-2:-1]

array([884])

In [57]:
wiki_df.iloc[884]

Category                                       Machine Learning
_id                                    5a14b44b8423e10105ebf220
categories    [Category:Artificial intelligence conferences,...
page_id                                                24714635
text          redirect association for the advancement of ar...
title                AAAI Conference on Artificial Intelligence
Name: 884, dtype: object

### LDA

In [58]:
from gensim import corpora, models
from nltk.corpus import stopwords
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [60]:
def split_into_words(documents):
    docs = []
    for doc in documents:
        text_processed = doc.split(' ')
        text_processed = list(filter(lambda a: a != '', text_processed))
        text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
        docs.append(text_processed)
    return docs

In [61]:
texts = split_into_words(wiki_df['text'])

In [None]:
dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
ldamodel = models.ldamodel.LdaModel(corpus,
                                    id2word = dictionary,
                                    num_topics = 50,
                                    passes = 5,
                                    minimum_probability = 0.01)

In [None]:
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)