In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import  re

tree = ET.parse('enwiki-20210101-pages-articles-multistream12.xml-p8554860p9172788')
root = tree.getroot()

titles = []
texts = []
ids = []

ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.10/'}
for child in root.findall('mediawiki:page', ns):
    title = child.find('mediawiki:title', ns)
    identifier = child.find('mediawiki:id', ns)
    titles.append(title.text)
    ids.append(identifier.text)
    for revision in child.findall('mediawiki:revision', ns):
        text_data = revision.find('mediawiki:text', ns)
        if text_data != None:
            texts.append(text_data.text)
        else:
            texts.append(None)

            # Create data frame with elements

dataframe = pd.DataFrame(data={'Title': titles, 'ID': ids, 'Text': texts})
dataframe.head(5)

Unnamed: 0,Title,ID,Text
0,Chestnut Ridge Middle School,8554860,#REDIRECT[[Washington Township Public School D...
1,Colegio de Santa Cruz de Tlatelolco,8554864,{{Infobox university\n|name = Col...
2,Template:US-gov-bio-stub,8554865,{{asbox\n| image = Great Seal of the Unite...
3,Impractical joker (garfield),8554867,#REDIRECT [[List of Garfield and Friends episo...
4,File:The Imperial Dowager Empress Yehenara.PNG,8554869,== Summary ==\nhttp://guangxu.netor.com/galler...


In [3]:
print(titles)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Preprocessing the Data

<ol>    
    <li> Remove tags like</li>

    <li>Remove Urls</li>
    
    <li>Remove Punctuations</li>
    
    <li>Remove stop words</li>
    
    <li>Remove numbers</li>
</ol>

In [4]:
#Remove html tags 
def tags_Rem(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
#Remove urls
def url_Rem(text):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    return text
#Remove punctuations 
#THis function takes a string as input, removes the punctuation and return it back 
def Rem_Punc(text):
        text = re.sub(r'[^\w\s]','',text)
        return text
##Remove \n
def newline_remove(text):
    text.replace('\n', '')
    return text##remove stop words 

def stop_words(text):
    

    stop_words = set(stopwords.words('english')) 

    word_tokens = word_tokenize(text) 

    filtered_text = [w for w in word_tokens if not w in stop_words] 

    filtered_text = ""

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_text = filtered_text + " "+w
    return filtered_text
 



In [5]:
def pre_process(text):
    text = tags_Rem(text)
    text = url_Rem(text)
    text = Rem_Punc(text)
    text = newline_remove(text)
    text = stop_words(text)
    return text
    
    

In [9]:
### drop  empty  pages 
drop_lines = 'Portal|File|Category|JPG|PNG|jpg|Wikipedia|Template'
dataframe = dataframe[~dataframe.Title.str.contains(drop_lines)]
dataframe = dataframe.dropna().reset_index()
del dataframe['index']

In [10]:
##Save the preprocessed dataset
titles_ = []
ids_ = []
text_ = []
for index, row in dataframe.iterrows():
    #print(type(row["Text"]))
    text =pre_process( str(row["Text"]))
    text_.append(text)
    ids_.append(row["ID"])
    titles_.append(row["Title"])

KeyboardInterrupt: 

In [None]:
dataframe_processed = pd.DataFrame(data={'Title': titles_, 'ID': ids_, 'Text': text_})


In [4]:
import json

def enregistrerJson(nom, dic):
	dic = json.dumps(dic, indent=4, ensure_ascii=False)
	file = open(nom, "w", encoding="utf-8")
	file.write(dic)
	file.close()

def chargerJson(chemin):
	file = open(chemin, "r", encoding="utf-8")
	dic = json.load(file)
	file.close()
	return dic

vocabulary = {}

for i in titles:
    mots=i.split(" ")
    for m in mots:
        vocabulary.setdefault(m, 0)
        vocabulary[m] += 1

enregistrerJson("voca.json", vocabulary)

In [5]:
def calculTf(titre):
    wordCounts = {}
    totalMots = 0
    document = titre.split()
    for mot in document:
        if mot not in wordCounts:
            wordCounts[mot]=0
        wordCounts[mot] += 1
        totalMots += 1
    for m in wordCounts:
        wordCounts[m] = wordCounts[m]/totalMots
    return wordCounts

def calculTfIdf(titres, vocabulary, tf, idf):
    tfidf = {}
    for t in titres:
        for mot in vocabulary:
            tfidf.setdefault(t, {})
            if mot in tf[t]:
                val = tf[t][mot] * idf[mot]
                tfidf[t].setdefault(mot, val)
            else:
                tfidf[t].setdefault(mot, 0)
    return tfidf

In [6]:
import math

tf = {}

for t in titles:
    tf[t] = calculTf(t)
    
idf = {}

nbDocs = len(titles_) #titles of the preprocessed data 

vocabulary = chargerJson("voca.json")

for mot in vocabulary:
    idf[mot] = math.log10(nbDocs/(vocabulary[mot]))

tfIdf = calculTfIdf(titles_, vocabulary.keys(), tf, idf)

for k, v in tfIdf.items():
    print(k)
    print(v)
    print("-"*10)

KeyboardInterrupt: 