# Recommender System based on Doc2Vec

#### Libraries

In [1]:
import multiprocessing
from pprint import pprint
from urllib.parse import unquote, urlparse

from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#### Wikipedia data

In [2]:
# Wikipedia dump chunk (bz2-compressed XML) used as the training corpus.
WIKI_DUMP_PATH = "enwiki-20210101-pages-articles-multistream12.xml-p8554860p9172788.bz2"

# When no dictionary is supplied, WikiCorpus scans the entire dump once just to
# build a vocabulary. Doc2Vec builds its own vocabulary in build_vocab(), so an
# explicit empty dictionary is passed to skip that extra (slow) pass.
wiki = WikiCorpus(WIKI_DUMP_PATH, dictionary={})



In [38]:
class TaggedWikiDocument(object):
    """Iterable adapter that turns a wiki corpus into tagged documents.

    Each item produced by ``wiki.get_texts()`` is expected to be a pair
    ``(tokens, (page_id, title))``; it is re-emitted as a ``TaggedDocument``
    whose single tag is the article title.
    """

    def __init__(self, wiki):
        # Metadata must be switched on so get_texts() also yields (page_id, title).
        wiki.metadata = True
        self.wiki = wiki

    def __iter__(self):
        for tokens, metadata in self.wiki.get_texts():
            _page_id, title = metadata
            yield TaggedDocument(words=tokens, tags=[title])

In [39]:
# Re-iterable stream of tagged articles: every pass over `documents` calls
# wiki.get_texts() again, so it can be consumed once per vocabulary/training pass.
documents = TaggedWikiDocument(wiki)

#### Statistics

In [41]:
#pre = Doc2Vec(min_count=0)
#pre.build_vocab(documents)

In [42]:
#for num in range(0, 20):
    #print('min_count: {}, size of vocab: '.format(num), pre.scale_vocab(min_count=num, dry_run=True)['memory']['vocab']/700)

#### Models

In [43]:
cores = multiprocessing.cpu_count()

# Hyperparameters shared by both architectures, kept in one place so the two
# models stay comparable.
shared_params = dict(vector_size=200, window=8, min_count=19, epochs=10, workers=cores)

models = [
    # PV-DBOW
    Doc2Vec(dm=0, dbow_words=1, **shared_params),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, **shared_params),
]

In [44]:
# Scan the corpus once with the first model, then let the second model borrow
# the resulting vocabulary structures instead of scanning the corpus again.
dbow_model, dm_model = models

dbow_model.build_vocab(documents)
print(dbow_model)

dm_model.reset_from(dbow_model)
print(dm_model)

Doc2Vec(dbow+w,d200,n5,w8,mc19,s0.001,t4)
Doc2Vec(dm/m,d200,n5,w8,mc19,s0.001,t4)


In [45]:
for model in models:
    %time model.train(documents, total_examples=model.corpus_count, epochs=10)

CPU times: user 2h 15min 9s, sys: 3min 40s, total: 2h 18min 50s
Wall time: 1h 34min 12s
CPU times: user 33min 38s, sys: 2min 22s, total: 36min
Wall time: 48min 56s


#### Results

In [148]:
#for model in models:
    #print(str(model))
    #pprint(model.docvecs.most_similar(positive=["Impractical joker"], topn=10))

In [149]:
# Free-text query: infer a vector for the tokens with each model and list the
# ten document tags closest to it.
query_tokens = "impractical jokers".split()

for model in models:
    print(query_tokens)
    inferred_vector = model.infer_vector(query_tokens)
    nearest_docs = model.docvecs.most_similar(positive=[inferred_vector], topn=10)
    pprint(nearest_docs)

['impractical', 'jokers']
[('New Synagogue (Tarnów)', 0.6353446245193481),
 ('Marver', 0.5988513231277466),
 ('Soil Stockpile', 0.5934687852859497),
 ('Sedreh', 0.5814381241798401),
 ('Siparium', 0.5809149742126465),
 ('Type 500 training mine', 0.5795101523399353),
 ('Non-communications signals', 0.5748095512390137),
 ('Internal flow', 0.5740222930908203),
 ('Pallet inverter', 0.5732473134994507),
 ('Athermalization', 0.5713787078857422)]
['impractical', 'jokers']
[('List of Åland municipalities by area', 0.8507674932479858),
 ('Geology of the English counties', 0.8480256795883179),
 ('Zuyevka, Kirov Oblast', 0.8464033603668213),
 ('List of foliage plant diseases (Bromeliaceae)', 0.8453917503356934),
 ('Leetonia High School', 0.844794750213623),
 ('Ernest Davies (Stretford MP)', 0.8422294855117798),
 ('Shirland Township, Winnebago County, Illinois', 0.838624119758606),
 ('Sepia bidhaia', 0.8375841379165649),
 ('Alexandre Bonnet', 0.8370817303657532),
 ('Bible Christian Mission', 0.8368

#### Other function

In [150]:
#def process_query(query):
    #words = []
    #words = query.split()
    #return words

In [151]:
#query = "Impractical_Joker"
#l = process_query(query)
#for model in models:
    #sim = model.wv.most_similar(positive=l,topn=10)
    #print(sim)

## Recommender system interface

Example link: https://en.wikipedia.org/wiki/Impractical_Jokers

In [152]:
def title_tokens_from_link(link):
    """Return the lower-cased title tokens of a Wikipedia article link.

    The article title is taken from the URL path (the part after ``/wiki/``),
    so the scheme, host, query string and fragment do not matter. A bare title
    such as ``Impractical_Jokers`` is accepted too. Percent-escapes are decoded
    and the title is split on underscores; empty tokens are dropped.
    """
    path = urlparse(link.strip()).path
    wiki_prefix = '/wiki/'
    if path.startswith(wiki_prefix):
        # Keep everything after /wiki/ so titles containing '/' stay intact.
        title = path[len(wiki_prefix):]
    else:
        title = path.rsplit('/', 1)[-1]
    return [token.lower() for token in unquote(title).split('_') if token]


link = input("Enter a wiki link: ")
query_tokens = title_tokens_from_link(link)

if not query_tokens:
    print("Could not extract an article title from: {!r}".format(link))
else:
    for model in models:
        doc_vector = model.infer_vector(query_tokens)
        # Query the similarity index once per model instead of once per printed row.
        recommendations = model.docvecs.most_similar(positive=[doc_vector], topn=10)
        for title, _similarity in recommendations:
            print('https://en.wikipedia.org/wiki/' + title.replace(" ", "_"))

Enter a wiki link: https://en.wikipedia.org/wiki/Impractical_Jokers
https://en.wikipedia.org/wiki/New_Synagogue_(Tarnów)
https://en.wikipedia.org/wiki/Quantum_satis
https://en.wikipedia.org/wiki/Marver
https://en.wikipedia.org/wiki/Dribbleware
https://en.wikipedia.org/wiki/Wolei-class_minelayer
https://en.wikipedia.org/wiki/Internal_flow
https://en.wikipedia.org/wiki/Pallet_inverter
https://en.wikipedia.org/wiki/Athermalization
https://en.wikipedia.org/wiki/Non-communications_signals
https://en.wikipedia.org/wiki/Intrinsic_hyperpolarizability
https://en.wikipedia.org/wiki/Geology_of_the_English_counties
https://en.wikipedia.org/wiki/List_of_Sindhi-language_poets
https://en.wikipedia.org/wiki/Shirland_Township,_Winnebago_County,_Illinois
https://en.wikipedia.org/wiki/List_of_museums_in_Upper_Normandy
https://en.wikipedia.org/wiki/Pine_Creek_Township,_Ogle_County,_Illinois
https://en.wikipedia.org/wiki/KFBX
https://en.wikipedia.org/wiki/Diving_at_the_1920_Summer_Olympics_–_Men's_plain_hi