# Recommender System based on Doc2Vec

#### Libraries

In [1]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import multiprocessing

#### Wikipedia data

In [2]:
# Open the bz2-compressed Wikipedia dump chunk through gensim's WikiCorpus reader.
# NOTE(review): no `dictionary` argument is passed; per the gensim documentation
# WikiCorpus then scans the whole dump once to build a vocabulary, which is slow.
# Confirm whether that dictionary is actually needed by this notebook.
wiki = WikiCorpus("enwiki-20210101-pages-articles-multistream12.xml-p8554860p9172788.bz2")



In [3]:
class TaggedWikiDocument:
    """Re-iterable view of a wiki corpus as ``TaggedDocument`` objects.

    Each article yielded by ``wiki.get_texts()`` becomes one ``TaggedDocument``
    whose words are the article tokens and whose only tag is the article title.
    """

    def __init__(self, wiki):
        # Metadata must be switched on so get_texts() also yields (page_id, title).
        wiki.metadata = True
        self.wiki = wiki

    def __iter__(self):
        # A fresh pass over the corpus is started on every iteration.
        yield from (
            TaggedDocument(tokens, [page_title])
            for tokens, (_page_id, page_title) in self.wiki.get_texts()
        )

In [4]:
# Wrap the corpus so each article is served as a TaggedDocument tagged with its title.
documents = TaggedWikiDocument(wiki)

#### Statistics

In [5]:
#pre = Doc2Vec(min_count=0)
#pre.build_vocab(documents)

In [6]:
#for num in range(0, 20):
    #print('min_count: {}, size of vocab: '.format(num), pre.scale_vocab(min_count=num, dry_run=True)['memory']['vocab']/700)

#### Models

In [7]:
# One training worker per available CPU core.
cores = multiprocessing.cpu_count()

# Hyper-parameters shared by both architectures, kept in one place.
shared_params = dict(vector_size=200, window=8, min_count=19, epochs=10, workers=cores)

models = [
    # PV-DBOW (dm=0) with dbow_words=1
    Doc2Vec(dm=0, dbow_words=1, **shared_params),
    # PV-DM (dm=1) w/average (dm_mean=1)
    Doc2Vec(dm=1, dm_mean=1, **shared_params),
]

In [8]:
# Build the vocabulary once on the first model, then copy it into the second
# model with reset_from() instead of scanning the corpus a second time.
# NOTE(review): reset_from() and .docvecs look like the gensim 3.x API -
# confirm the pinned gensim version before upgrading.
dbow_model, dm_model = models
dbow_model.build_vocab(documents)
print(dbow_model)
dm_model.reset_from(dbow_model)
print(dm_model)

Doc2Vec(dbow+w,d200,n5,w8,mc19,s0.001,t4)
Doc2Vec(dm/m,d200,n5,w8,mc19,s0.001,t4)


In [9]:
for model in models:
    %time model.train(documents, total_examples=model.corpus_count, epochs=10)

CPU times: user 2h 8min 50s, sys: 3min 18s, total: 2h 12min 8s
Wall time: 1h 19min 49s
CPU times: user 33min 7s, sys: 1min 55s, total: 35min 3s
Wall time: 42min 45s


#### Results

In [10]:
#for model in models:
    #print(str(model))
    #pprint(model.docvecs.most_similar(positive=["Impractical joker"], topn=10))

In [134]:
# Smoke test: infer a vector for a known title with each model and list the
# 10 documents closest to it.
for model in models:
    string = "love to love".split()
    print(string)
    doc_vector = model.infer_vector(string)
    nearest_docs = model.docvecs.most_similar(positive=[doc_vector], topn=10)
    pprint(nearest_docs)

['love', 'to', 'love']
[('Love to Love', 0.7007774114608765),
 ('The Romance of Kenny G', 0.6808857917785645),
 ('Just Like Heaven', 0.6676369905471802),
 ('Comfort and Joy', 0.6651524901390076),
 ('Sour (album)', 0.6620450019836426),
 ('Talking in Your Sleep', 0.6607742309570312),
 ('The Very Best of Kenny G', 0.6546471118927002),
 ('The Collection (Kenny G album)', 0.6523045301437378),
 ('Meet You There (album)', 0.6512295603752136),
 ('Soldier (Neil Young song)', 0.6511427760124207)]
['love', 'to', 'love']
[('Bible Christian Mission', 0.8978371024131775),
 ('Nuchatlaht First Nation', 0.8909875750541687),
 ('Saskatchewan Highway 953', 0.8902007341384888),
 ('Ernest Davies (Stretford MP)', 0.8901641368865967),
 ('Oudkarspel', 0.8898391723632812),
 ('Salvador Salguero', 0.8893624544143677),
 ('List of foliage plant diseases (Bromeliaceae)', 0.8885855674743652),
 ("Diving at the 1920 Summer Olympics – Men's plain high diving",
  0.8865423202514648),
 ('Sepia bidhaia', 0.8862035274505615

In [135]:
# Ask for a full article link and turn its title into query tokens.
link = input("Enter a wiki link: ")
# 30 == len("https://en.wikipedia.org/wiki/"): everything after it is the title.
print(link[30:])
print(link)
# Underscores in the URL stand for spaces; tokens are lower-cased words.
string = link[30:].replace("_", " ").lower().split(" ")
print(string)
for model in models:
    doc_vector = model.infer_vector(string)
    print("======Model======")
    # Query the nearest documents once per model instead of once per printed line.
    for similar_title, _similarity in model.docvecs.most_similar(positive=[doc_vector], topn=10):
        new_string = similar_title.replace(" ", "_")
        print('https://en.wikipedia.org/wiki/'+new_string)

Enter a wiki link: https://en.wikipedia.org/wiki/Love_to_Love
Love_to_Love
https://en.wikipedia.org/wiki/Love_to_Love
['love', 'to', 'love']
https://en.wikipedia.org/wiki/The_Romance_of_Kenny_G
https://en.wikipedia.org/wiki/Heart_and_Crime
https://en.wikipedia.org/wiki/Love_to_Love
https://en.wikipedia.org/wiki/Stop_the_Machine
https://en.wikipedia.org/wiki/Lost_and_Gone
https://en.wikipedia.org/wiki/Back_for_My_Life
https://en.wikipedia.org/wiki/Marti_Pellow_Sings_the_Hits_of_Wet_Wet_Wet_&_Smile
https://en.wikipedia.org/wiki/The_Very_Best_of_Kenny_G
https://en.wikipedia.org/wiki/Will_You_Still_Love_Me?_(EP)
https://en.wikipedia.org/wiki/Talking_in_Your_Sleep
https://en.wikipedia.org/wiki/Nuchatlaht_First_Nation
https://en.wikipedia.org/wiki/Owen_Township,_Winnebago_County,_Illinois
https://en.wikipedia.org/wiki/KCMX-FM
https://en.wikipedia.org/wiki/List_of_foliage_plant_diseases_(Bromeliaceae)
https://en.wikipedia.org/wiki/Banatska_Dubica
https://en.wikipedia.org/wiki/Sepia_bidhaia
ht

#### Other function

In [136]:
#def process_query(query):
    #words = []
    #words = query.split()
    #return words

In [137]:
#query = "Impractical_Joker"
#l = process_query(query)
#for model in models:
    #sim = model.wv.most_similar(positive=l,topn=10)
    #print(sim)

## Recommender system interface <br>
Link examples: <br>
- Invalid = https://en.wikipedia.org/wiki/Impractical_Jokers <br>
- Valid = https://en.wikipedia.org/wiki/Love_to_Love

In [138]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
import pandas as pd
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
import nltk
# Rebinds `stopwords` from the nltk corpus reader imported above to a plain
# list of English stop words.
stopwords = nltk.corpus.stopwords.words('english')
# '&' is treated as a stop word as well.
stopwords.append('&')
import operator
from nltk.stem import WordNetLemmatizer 
# Single lemmatizer instance shared by the functions below.
lemmatizer = WordNetLemmatizer()

In [139]:
# Parse the decompressed dump chunk; ET.parse builds the whole tree in memory.
tree = ET.parse('enwiki-20210101-pages-articles-multistream12.xml-p8554860p9172788')
root = tree.getroot()

# Parallel lists: one entry per <page> element.
titles = []
texts = []
ids = []

ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.10/'}
for child in root.findall('mediawiki:page', ns):
    title = child.find('mediawiki:title', ns)
    identifier = child.find('mediawiki:id', ns)
    titles.append(title.text)
    ids.append(identifier.text)
    # Record exactly one text per page so the three lists always have the same
    # length: the first <revision>/<text> found, or None when the page has no
    # revision text. Appending once per revision would misalign the lists
    # (and make the DataFrame constructor fail) for pages with zero or
    # several revisions.
    text_data = child.find('mediawiki:revision/mediawiki:text', ns)
    texts.append(text_data.text if text_data is not None else None)

dataframe = pd.DataFrame(data={'Title': titles, 'ID': ids, 'Text': texts})

In [140]:
# Regex alternatives; any title containing one of them is discarded.
# NOTE(review): this is a substring match anywhere in the title, not a
# namespace-prefix match - confirm that is intended.
drop_lines = 'Portal|File|Category|JPG|PNG|jpg|Wikipedia|Template'
dataframe = (
    dataframe[~dataframe.Title.str.contains(drop_lines)]
    .dropna()                  # drop rows with any missing value
    .reset_index(drop=True)    # renumber rows without keeping the old index
)
dataframe.head(10)

Unnamed: 0,Title,ID,Text
0,Chestnut Ridge Middle School,8554860,#REDIRECT[[Washington Township Public School D...
1,Colegio de Santa Cruz de Tlatelolco,8554864,{{Infobox university\n|name = Col...
2,Impractical joker (garfield),8554867,#REDIRECT [[List of Garfield and Friends episo...
3,National Council of Teachers,8554873,'''National Council of Teachers''' may refer t...
4,Shuo Wang,8554878,#REDIRECT [[Wang Shuo]]
5,The impractical joker garfield and friends,8554883,#REDIRECT [[List of Garfield and Friends episo...
6,Order of battle at Beiping–Tianjin,8554884,'''Peiking Tientsin Operation''' (July–August ...
7,Gulshani,8554885,{{about|the Sufi order|the demonym of Gulshan|...
8,The impractical joker garfield & friends,8554892,#REDIRECT [[List of Garfield and Friends episo...
9,The impractical joker garfield,8554898,#REDIRECT [[List of Garfield and Friends episo...


In [182]:
def ask_user():
    """Prompt on standard input for a Wikipedia page and return the answer as a string."""
    return str(input("Please enter a Wikipedia page name: "))


def propose_pages(request, titles):
    """Suggest up to 10 known titles that share a word with the request.

    The request is tokenized, lower-cased, stripped of stop words and
    lemmatized. Every title is lower-cased, tokenized and lemmatized, then
    re-joined with spaces; a title is suggested when at least one request
    lemma occurs as a substring of that normalized title.

    Args:
        request: free text typed by the user.
        titles: iterable of page titles (strings) to search.

    Returns:
        A list of at most 10 matching titles in their original order. Each
        entry of ``titles`` appears at most once, however many request words
        it matches. The list is empty when nothing searchable remains in the
        request (empty text or stop words only).
    """
    max_results = 10

    # Preprocessing for request
    tokens = [word.lower() for word in nltk.word_tokenize(request)]
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    if not lemmas:
        # Without this guard a stop-word-only request raised IndexError.
        return []

    list_results = []
    for el in titles:
        normalized = ' '.join(
            lemmatizer.lemmatize(w) for w in nltk.word_tokenize(el.lower())
        )
        # any() keeps a title from being listed once per matching word.
        if any(lemma in normalized for lemma in lemmas):
            list_results.append(el)
            if len(list_results) == max_results:
                # Enough suggestions collected; no need to scan the rest.
                break
    return list_results
    
    
def check_validity(dataframe):
    """Interactively validate a Wikipedia link and print related pages.

    Prompts (through ``ask_user``) until the user enters either ``exit`` or a
    link whose title exists in ``dataframe['Title']``. For a known title, each
    model in ``models`` infers a vector from the lower-cased title words and
    the links of its 10 most similar documents are printed. For anything else
    an error message is printed, followed by suggestions from
    ``propose_pages`` when the link carried a non-empty title, and the user is
    asked again.

    Args:
        dataframe: table with a ``Title`` column listing the known page titles.

    Returns:
        None. All results are printed.
    """
    wiki_prefix = "https://en.wikipedia.org/wiki/"

    request = ask_user().strip()
    while request != 'exit':
        # Stays empty when the input is not a link, so the retry path below
        # always runs and the loop can never spin without prompting again.
        title = ""
        if request.startswith(wiki_prefix):
            raw_title = request[len(wiki_prefix):]
            print("Original title : "+raw_title)
            # Underscores in the URL stand for spaces in the page title.
            title = raw_title.replace("_", " ")
            print("Title for searching : "+title)
            print(title)
            if title in dataframe['Title'].values:
                print("Correct Wikipedia page name, we will propose you 10 related pages!")
                ##Recommendations
                tokens = title.lower().split(" ")
                for model in models:
                    doc_vector = model.infer_vector(tokens)
                    print("======Model======")
                    # One similarity query per model; its rows are (title, score).
                    neighbours = model.docvecs.most_similar(positive=[doc_vector], topn=10)
                    for similar_title, _similarity in neighbours:
                        print('https://en.wikipedia.org/wiki/'+similar_title.replace(" ", "_"))
                break

        # Not a link, an empty title, or a title missing from the dataframe.
        results = propose_pages(title, dataframe['Title']) if title else []
        print('\nIncorrect Wikipedia page, please retry!\n')
        if len(results) > 0:
            print('Some suggestions :) \n')
            for i, j in enumerate(results):
                print(str(i)+'. '+j)
                print("https://en.wikipedia.org/wiki/"+j.replace(" ", "_"))
        request = ask_user().strip()

In [184]:
# Start the interactive prompt; it reads a Wikipedia link from standard input
# and prints its results below.
check_validity(dataframe)

Please enter a Wikipedia page name: https://en.wikipedia.org/wiki/Love_to_Love
Original title : Love_to_Love
Title for searching : Love to Love
Love to Love
Correct Wikipedia page name, we will propose you 10 related pages!
https://en.wikipedia.org/wiki/The_Romance_of_Kenny_G
https://en.wikipedia.org/wiki/Love_to_Love
https://en.wikipedia.org/wiki/The_Collection_(Kenny_G_album)
https://en.wikipedia.org/wiki/Will_You_Still_Love_Me?_(EP)
https://en.wikipedia.org/wiki/Lost_and_Gone
https://en.wikipedia.org/wiki/Soldier_(Neil_Young_song)
https://en.wikipedia.org/wiki/The_Very_Best_of_Kenny_G
https://en.wikipedia.org/wiki/Sour_(album)
https://en.wikipedia.org/wiki/Just_Like_Heaven
https://en.wikipedia.org/wiki/Stop_the_Machine
https://en.wikipedia.org/wiki/Speed_skating_at_the_1999_Asian_Winter_Games
https://en.wikipedia.org/wiki/Frankl
https://en.wikipedia.org/wiki/Bible_Christian_Mission
https://en.wikipedia.org/wiki/Hellstrom
https://en.wikipedia.org/wiki/List_of_foliage_plant_diseases_(