# Recommender System - User request

## Libraries

In [1]:
# Third-party and standard-library dependencies for the notebook.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
import pandas as pd
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
import nltk
# NOTE: this rebinds `stopwords` (imported from nltk.corpus above) to the list
# returned by `.words('english')`; from here on `stopwords` is that list.
stopwords = nltk.corpus.stopwords.words('english')
# Treat '&' like a stop word so it is filtered out of tokenised requests.
stopwords.append('&')
import operator
from nltk.stem import WordNetLemmatizer 
# Shared lemmatizer instance, used to normalise both user requests and page titles.
lemmatizer = WordNetLemmatizer()

## Retrieve data 

In [2]:
# Parse the Wikipedia XML dump and collect, for every <page> element, its title,
# its id and the text of its revision, then assemble them into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the dump before running.
DUMP_PATH = '/media/macaire/Ubuntu/Master_2/Recommender_systems/Case_Study/enwiki-20210101-pages-articles-multistream12.xml'
tree = ET.parse(DUMP_PATH)
root = tree.getroot()

titles = []
texts = []
ids = []

ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.10/'}
for child in root.findall('mediawiki:page', ns):
    title = child.find('mediawiki:title', ns)
    identifier = child.find('mediawiki:id', ns)
    # Take exactly ONE text per page (the first <revision>/<text> found, or None).
    # Appending once per revision would leave `texts` with a different length than
    # `titles`/`ids` whenever a page has zero or several revisions, which makes the
    # DataFrame construction below fail.
    text_data = child.find('mediawiki:revision/mediawiki:text', ns)
    titles.append(title.text)
    ids.append(identifier.text)
    texts.append(text_data.text if text_data is not None else None)

dataframe = pd.DataFrame(data={'Title': titles, 'ID': ids, 'Text': texts})

## Preprocessing the data 

In [3]:
# Regex alternation of title fragments; any page whose title contains one of
# them is removed from the working set.
drop_lines = 'Portal|File|Category|JPG|PNG|jpg|Wikipedia|Template'
is_excluded = dataframe['Title'].str.contains(drop_lines)
# Keep the remaining pages, discard rows with missing values, and renumber the
# rows from 0 without keeping the old index as a column.
dataframe = dataframe.loc[~is_excluded].dropna().reset_index(drop=True)
dataframe.head(10)

Unnamed: 0,Title,ID,Text
0,Chestnut Ridge Middle School,8554860,#REDIRECT[[Washington Township Public School D...
1,Colegio de Santa Cruz de Tlatelolco,8554864,{{Infobox university\n|name = Col...
2,Impractical joker (garfield),8554867,#REDIRECT [[List of Garfield and Friends episo...
3,National Council of Teachers,8554873,'''National Council of Teachers''' may refer t...
4,Shuo Wang,8554878,#REDIRECT [[Wang Shuo]]
5,The impractical joker garfield and friends,8554883,#REDIRECT [[List of Garfield and Friends episo...
6,Order of battle at Beiping–Tianjin,8554884,'''Peiking Tientsin Operation''' (July–August ...
7,Gulshani,8554885,{{about|the Sufi order|the demonym of Gulshan|...
8,The impractical joker garfield & friends,8554892,#REDIRECT [[List of Garfield and Friends episo...
9,The impractical joker garfield,8554898,#REDIRECT [[List of Garfield and Friends episo...


## User request (handling invalid Wikipedia page names)

In [4]:
def ask_user():
    """Prompt the user for a Wikipedia page name and return what was typed.

    Returns:
        str: the raw line entered by the user (no stripping or normalisation).
    """
    return str(input("Please enter a Wikipedia page name: "))


def propose_pages(request, titles, max_results=10):
    """Suggest page titles that resemble a (possibly misspelled) request.

    The request is tokenised, lower-cased, stripped of stop words and
    lemmatised. Each title is lower-cased, tokenised and lemmatised the same
    way, then re-joined into one string. A title is suggested when at least one
    request token occurs as a substring of that normalised title.

    Args:
        request (str): text typed by the user.
        titles (iterable of str): candidate page titles, scanned in order.
        max_results (int): maximum number of suggestions to return (default 10).

    Returns:
        list of str: up to `max_results` original (un-normalised) titles, in the
        order they appear in `titles`. Empty when the request contains no
        usable token (e.g. only stop words or whitespace).
    """
    # Normalise the request: tokenise -> lower-case -> drop stop words -> lemmatise.
    tokens = [tok.lower() for tok in nltk.word_tokenize(request)]
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stopwords]
    if not tokens:
        # Nothing left to match on; previously this case raised IndexError.
        return []

    list_results = []
    for el in titles:
        # Stop as soon as enough suggestions are collected, so the remaining
        # titles are not tokenised and lemmatised for nothing.
        if len(list_results) >= max_results:
            break
        el_2 = ' '.join(
            lemmatizer.lemmatize(w) for w in nltk.word_tokenize(el.lower())
        )
        # Substring match on the normalised title; `any` ensures a title that
        # matches several request tokens is suggested only once.
        if any(tok in el_2 for tok in tokens):
            list_results.append(el)
    return list_results
    
    
def check_validity(dataframe):
    """Ask for a page name until it matches a known title or the user quits.

    The user is prompted through `ask_user()`. If the answer is exactly one of
    the values in `dataframe['Title']`, a confirmation is printed and the loop
    ends. Otherwise an error message is printed, followed by suggestions from
    `propose_pages()` when there are any, and the user is prompted again.
    Typing 'exit' ends the loop without a match.

    Args:
        dataframe (pandas.DataFrame): must contain a 'Title' column.

    Returns:
        None. All feedback is printed.
    """
    # Build the lookup once instead of re-reading the column on every attempt.
    known_titles = set(dataframe['Title'].values)
    request = ask_user()
    while request != 'exit':
        if request in known_titles:
            print("Correct Wikipedia page name, we will propose you 10 related pages!")
            break
        # Suggestions only make sense for non-empty input; compute them before
        # printing so the output order matches the original implementation.
        results = propose_pages(request, dataframe['Title']) if len(request) != 0 else []
        print('\nIncorrect Wikipedia page, please retry!\n')
        if len(results) > 0:
            print('Some suggestions :) \n')
            for i, j in enumerate(results):
                print(str(i)+'. '+j)
        # Always re-prompt. Previously an empty answer skipped this line and the
        # loop spun forever on the same empty string.
        request = ask_user()

In [5]:
# Start the interactive prompt loop; entering 'exit' stops it without selecting a page.
check_validity(dataframe)

Please enter a Wikipedia page name: santa

Incorrect Wikipedia page, please retry!

Some suggestions :) 

0. Colegio de Santa Cruz de Tlatelolco
1. North Santander Department
2. Bernardo Santareno
3. Santa Clara Aqueduct
4. Ibarlucea, Santa Fe
5. Zavalla, Santa Fe
6. Abel Santa Cruz
7. Santa Claus on film
8. The Town Santa Forgot
9. Santa claus conquers the aliens
Please enter a Wikipedia page name: National Council of Teachers
Correct Wikipedia page name, we will propose you 10 related pages!
