# Recommender System - Preprocessing 

## Import required packages

In [2]:
import  re
import xml.etree.ElementTree as ET
import pandas as pd
import os
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

## Retrieve data 

In [3]:


tree = ET.parse('enwiki-20210101-pages-articles-multistream12.xml-p8554860p9172788')
root = tree.getroot()

titles = []
texts = []
ids = []

ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.10/'}
for child in root.findall('mediawiki:page', ns):
    title = child.find('mediawiki:title', ns)
    identifier = child.find('mediawiki:id', ns)
    titles.append(title.text)
    ids.append(identifier.text)
    for revision in child.findall('mediawiki:revision', ns):
        text_data = revision.find('mediawiki:text', ns)
        if text_data != None:
            texts.append(text_data.text)
        else:
            texts.append(None)

            # Create data frame with elements

dataframe = pd.DataFrame(data={'Title': titles, 'ID': ids, 'Text': texts})
dataframe.head(5)

Unnamed: 0,Title,ID,Text
0,Chestnut Ridge Middle School,8554860,#REDIRECT[[Washington Township Public School D...
1,Colegio de Santa Cruz de Tlatelolco,8554864,{{Infobox university\n|name = Col...
2,Template:US-gov-bio-stub,8554865,{{asbox\n| image = Great Seal of the Unite...
3,Impractical joker (garfield),8554867,#REDIRECT [[List of Garfield and Friends episo...
4,File:The Imperial Dowager Empress Yehenara.PNG,8554869,== Summary ==\nhttp://guangxu.netor.com/galler...


In [4]:
dataframe.describe

<bound method NDFrame.describe of                                                  Title       ID  \
0                         Chestnut Ridge Middle School  8554860   
1                  Colegio de Santa Cruz de Tlatelolco  8554864   
2                             Template:US-gov-bio-stub  8554865   
3                         Impractical joker (garfield)  8554867   
4       File:The Imperial Dowager Empress Yehenara.PNG  8554869   
...                                                ...      ...   
171737            South Delhi (Lok Sabha Constituency)  9172776   
171738                            File:Om grafitti.jpg  9172779   
171739                                    South Cliffe  9172782   
171740                                    Sándor Erdős  9172785   
171741                           Yoshibayama Junnosuke  9172787   

                                                     Text  
0       #REDIRECT[[Washington Township Public School D...  
1       {{Infobox university\n|name      

In [5]:
print(dataframe.shape[0] - dataframe.dropna().shape[0])

5


In [6]:
print(dataframe['Text'][1])

{{Infobox university
|name              = Colegio de Santa Cruz de Tlatelolco
|image = File:Iglesia_de_Santiago_Tlatelolco,_M%C3%A9xico_D.F.,_M%C3%A9xico,_2013-10-16,_DD_38.JPG
|native_name       = 
|motto             = 
|established       = {{start date and age|1536|1|6}}
|type              = [[Catholic education|Catholic]]
|city              = [[Tlatelolco (Mexico City)|Tlatelolco]], [[Mexico City]]
|country           = [[Mexico]]
|campus            = [[urban area|Urban]]
}}
[[File:Iglesia de Santiago Tlatelolco, México D.F., México, 2013-10-16, DD 31.JPG|thumbnail|Exterior of the church]]
[[File:Iglesia de Santiago Tlatelolco, México D.F., México, 2013-10-16, DD 46.JPG|thumb|View of dome from below]]
The '''Colegio de Santa Cruz''' in [[Tlatelolco (Mexico City)|Tlatelolco]], [[Mexico City]], is the first and oldest European school of [[higher learning]] in the [[Americas]]<ref>{{cite book|url=https://catalog.hathitrust.org/Record/101392426|title=The first college in America: Santa C

In [11]:
text = dataframe['Text'][1]
print(text)
#dataframe['Title'][8]

{{Infobox university
|name              = Colegio de Santa Cruz de Tlatelolco
|image = File:Iglesia_de_Santiago_Tlatelolco,_M%C3%A9xico_D.F.,_M%C3%A9xico,_2013-10-16,_DD_38.JPG
|native_name       = 
|motto             = 
|established       = {{start date and age|1536|1|6}}
|type              = [[Catholic education|Catholic]]
|city              = [[Tlatelolco (Mexico City)|Tlatelolco]], [[Mexico City]]
|country           = [[Mexico]]
|campus            = [[urban area|Urban]]
}}
[[File:Iglesia de Santiago Tlatelolco, México D.F., México, 2013-10-16, DD 31.JPG|thumbnail|Exterior of the church]]
[[File:Iglesia de Santiago Tlatelolco, México D.F., México, 2013-10-16, DD 46.JPG|thumb|View of dome from below]]
The '''Colegio de Santa Cruz''' in [[Tlatelolco (Mexico City)|Tlatelolco]], [[Mexico City]], is the first and oldest European school of [[higher learning]] in the [[Americas]]<ref>{{cite book|url=https://catalog.hathitrust.org/Record/101392426|title=The first college in America: Santa C

In [5]:
text = dataframe['Text'][8]


In [22]:
dataframe['Title'][8]

'Order of battle at Beiping–Tianjin'

### Preprocessing the Data
<ol>    
    <li>Rmove tags like\<\/ref\></li>

    <li>Remove Urls</li>
    <li>Remove Punctuations</li>
    <li>Remove stop words</li>
    <li>Remove numbers</li>
    </ol>

In [18]:
def pre_process(text):
    text = tags_Rem(text)
    text = url_Rem(text)
    text = Rem_Punc(text)
    text = newline_remove(text)
    text = stop_words(text)
    return text
    
    

In [19]:
#Remove html tags 
def tags_Rem(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

In [20]:
#Remove urls
def url_Rem(text):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    return text

In [21]:
#Remove punctuations 
#THis function takes a string as input, removes the punctuation and return it back 
def Rem_Punc(text):
        text = re.sub(r'[^\w\s]','',text)
        return text


In [22]:
##Remove \n
def newline_remove(text):
    text.replace('\n', '')
    return text


In [23]:
##remove stop words 

def stop_words(text):
    

    stop_words = set(stopwords.words('english')) 

    word_tokens = word_tokenize(text) 

    filtered_text = [w for w in word_tokens if not w in stop_words] 

    filtered_text = ""

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_text = filtered_text + " "+w
    return filtered_text
 


In [25]:
titles_ = []
ids_ = []
text_ = []
for index, row in dataframe.iterrows():
    #print(type(row["Text"]))
    text =pre_process( str(row["Text"]))
    text_.append(text)
    ids_.append(row["ID"])
    titles_.append(row["Title"])

In [27]:
dataframe_processed = pd.DataFrame(data={'Title': titles_, 'ID': ids_, 'Text': text_})


In [29]:
dataframe_processed.to_csv("processed_data", sep='\t')
