## 0) Creating the corpus of articles from Simple English Wikipedia

In [72]:
import xml.etree.ElementTree as ET
import re
import codecs
import os
import random
import pickle

In [18]:
def is_ascii(s):
    """Tell whether ``s`` consists solely of ASCII characters.

    Returns True when every character of ``s`` has a code point below 128
    (an empty string therefore counts as ASCII), False otherwise.
    """
    for char in s:
        if ord(char) >= 128:
            return False
    return True

In [19]:
# Parse the complete Simple Wikipedia XML dump into an in-memory ElementTree.
tree=ET.parse('simplewiki/simplewiki-20170201-pages-articles-multistream.xml')
root=tree.getroot()
# Despite its name, `url` is not a URL to fetch: it is the namespace-qualified
# tag name of a <page> element in the MediaWiki export schema.
url  = '{http://www.mediawiki.org/xml/export-0.10/}page'
# Directory prefix of the per-article text files. It is concatenated directly
# with a file name, hence the trailing slash.
dir_path_out = 'simplewiki_articles_corpus/'

In [20]:
# Ordered (pattern, replacement) pairs applied by clean_filters(). The order
# matters: every pattern runs on the output of the previous ones.
# NOTE(review): the ".*" patterns are greedy and "." does not cross newlines,
# so each of them works within a single line and may swallow text that sits
# between two markup fragments on that line.
_CLEAN_SUBSTITUTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"{{.*}}", ""),                  # single-line {{templates}}
        (r"\[\[File:.*\]\]", ""),         # embedded files
        (r"\[\[Image:.*\]\]", ""),        # embedded images
        (r"\n: \'\'.*", ""),              # indented italic notes
        (r"\n!.*", ""),                   # lines starting with "!"
        (r"^:\'\'.*", ""),                # italic note at the very start
        (r"&nbsp", ""),
        (r"http\S+", ""),                 # bare URLs
        (r"\d+", ""),                     # every run of digits
        (r"\(.*\)", ""),                  # parenthesised text
        (r"Category:.*", ""),
        (r"\| .*", ""),                   # "| key = value" rows
        (r"\n\|.*", ""),
        (r"\n \|.*", ""),
        (r".* \|\n", ""),
        (r".*\|\n", ""),
        # Opening line of an infobox/taxobox, with or without a space after
        # "{{" and with either capitalisation of the first letter.
        (r"{{ ?(?:[Ii]nfobox|[Tt]axobox).*", ""),
        (r"\* .*", ""),                   # bullet-list items
        (r"<.*>", ""),                    # HTML-like tags
        # Line breaks become spaces: deleting them outright would fuse the
        # last word of a line with the first word of the next one.
        (r"\n", " "),
        # Every ASCII punctuation character becomes a space.
        (r"[!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]", " "),
        # Non-breaking spaces are normalised before the final collapse so
        # that they cannot leave double spaces behind.
        (r"\xa0", " "),
        (r" +", " "),
    )
)


def clean_filters(article_txt):
    """Strip wiki markup, digits and punctuation from raw article text.

    The substitutions in ``_CLEAN_SUBSTITUTIONS`` are applied in order. The
    function does not truncate the article; it cleans whatever text it is
    given.

    Args:
        article_txt: Raw wikitext of an article (or of a part of it).

    Returns:
        The cleaned text on a single line, with words separated by single
        spaces and without leading or trailing whitespace. The result is the
        empty string when nothing but markup, digits, punctuation or
        whitespace was present.
    """
    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        article_txt = pattern.sub(replacement, article_txt)
    return article_txt.strip()

In [1]:
# Run this cell only once: it creates the corpus

#revision_tag="{http://www.mediawiki.org/xml/export-0.10/}revision"
#text_tag="{http://www.mediawiki.org/xml/export-0.10/}text"

#for i, page in enumerate(root.findall(url)):
#    for child_page in page : 
#        if child_page.tag == revision_tag : 
#            for x in child_page : 
#                if x.tag == text_tag : 
#                    text=x.text
#                    if not text == None :
#                        text = text[:text.find("==")]
#                        article_text=clean_filters(text)
#                        if not article_text == None and not article_text == "" :
#                            if len(article_text) and is_ascii(article_text) :
#                                outfile=dir_path_out +str(i+1)+'article.txt'
#                                file=codecs.open(outfile,"w","utf-8")
#                                file.write(article_text)
#                                file.close()
#                                print(text)
#                                print('\n====================================\n')

## 1) Preprocessing and Training data preparation

#### Stop-word removal and lemmatization

In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [77]:
# Download the NLTK data packages named 'stopwords' and 'wordnet'
# (presumably the data behind the stopwords corpus and the WordNet
# lemmatizer imported above).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andrealequin/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [48]:
# English stop words from NLTK, held in a set for membership tests.
stop = set(stopwords.words('english'))
# Every character of string.punctuation, also as a set.
exclude = set(string.punctuation)
# Single lemmatizer instance shared by all calls to clean().
lemma = WordNetLemmatizer()

def clean(doc):
    """Turn one document into a list of lowercase, lemmatized words.

    Steps, in order: lowercase and split on whitespace, drop tokens found in
    ``stop``, delete every character found in ``exclude``, lemmatize each
    remaining word with ``lemma``, and keep only words longer than two
    characters. Stop words are matched before punctuation is stripped, so a
    token with punctuation attached (e.g. "the.") is not recognised as a
    stop word.

    Returns:
        The list of cleaned words, in document order.
    """
    kept_tokens = []
    for token in doc.lower().split():
        if token not in stop:
            kept_tokens.append(token)
    without_stops = " ".join(kept_tokens)

    # Character-level filter: punctuation is deleted, not replaced by a space.
    without_punct = "".join(filter(lambda ch: ch not in exclude, without_stops))

    lemmatized = " ".join(map(lemma.lemmatize, without_punct.split()))

    return [word for word in lemmatized.split() if len(word) > 2]

In [31]:
# Scratch check on a single article: read it, drop the stop words, then strip
# punctuation. The final expression is left bare so that the notebook
# displays its value.
# The with-block closes the file (the original handle was never closed), and
# the encoding is explicit to match how the corpus is read elsewhere.
with open(dir_path_out + str(1) + 'article.txt', "r", encoding="utf-8") as fich:
    art1 = fich.read()
s1 = " ".join(i for i in art1.lower().split() if i not in stop)
"".join(w for w in s1 if w not in exclude)

#### Read the contents of all the articles into a list, doc_complete

In [64]:
corpus_path = "simplewiki_articles_corpus"

# Build the path of every file in the corpus directory. macOS's .DS_Store
# metadata file is skipped while listing rather than removed afterwards:
# list.remove() raises ValueError when the entry is absent (e.g. on any
# non-macOS machine) or when os.path.join produces a different separator.
article_paths = [
    os.path.join(corpus_path, p)
    for p in os.listdir(corpus_path)
    if p != '.DS_Store'
]

In [66]:
# Load every article as one string, in the same order as article_paths.
doc_complete = []
for path in article_paths:
    # The with-block closes each file as soon as it has been read instead of
    # leaving the handle to the garbage collector.
    with codecs.open(path, 'r', 'utf-8') as fp:
        doc_content = fp.read()
    doc_complete.append(doc_content)

#### Randomly select 70000 articles from the corpus and create a cleaned training list

In [74]:
# Draw 70000 articles without replacement. random.sample raises ValueError
# when doc_complete holds fewer than 70000 items.
docs_all = random.sample(doc_complete, 70000)

# Persist the sample. The with-block flushes and closes the file; the
# original left the handle open, so the pickle was not guaranteed to be
# completely written to disk.
with open('docs_simplewiki.pkl', 'wb') as docs:
    pickle.dump(docs_all, docs)

In [75]:
# The first 60000 sampled articles form the training set (the remaining ones
# are presumably held out for later use — confirm).
docs_train = docs_all[:60000]

In [78]:
# Run clean() on every training article: one list of words per article, in
# the same order as docs_train.
doc_train_clean = [clean(doc) for doc in docs_train]

## 2) Building the word dictionary

In [79]:
from gensim import corpora

unable to import 'smart_open.gcs', disabling that module


In [83]:
# Build the gensim dictionary from the cleaned training articles.
dictionary = corpora.Dictionary(doc_train_clean)

# Drop terms that occur in fewer than 4 articles or in more than 40% of them.
dictionary.filter_extremes(no_below=4, no_above=0.4)

# Remove additional, corpus-specific stop words.
# "take" and "come" used to be fused into the non-word "takecome", so neither
# of them was ever filtered; they are now listed separately.
stoplist = set(
    'also use make people know many call include part find become like mean often different '
    'usually take come give well get since type list say change see refer actually kinds ask would way '
    'something need things want every'.split()
)
# Only words that survived filter_extremes still have an id in the dictionary.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

## 3) Feature Extraction (Bag of Words)



In [85]:
# Bag-of-words encoding: one dictionary.doc2bow(...) result per cleaned
# training article, in the same order as doc_train_clean.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_train_clean]
# Conceptually a document-term matrix (rows = articles, columns = words).

## 4) LDA model training

In [86]:
from gensim.models.ldamodel import LdaModel as Lda
# Build and train a 50-topic LDA model on the bag-of-words corpus.
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word=dictionary, passes=50, iterations=500)

# Dump the trained model with pickle. The with-block closes the file even
# when pickle.dump raises, which the explicit open()/close() pair did not.
with open('lda_model_sym_simplewiki', 'wb') as ldafile:
    pickle.dump(ldamodel, ldafile)

In [89]:
# Display the 10 highest-weighted words of each of the 50 topics, one topic
# per printed list.
all_topics = ldamodel.print_topics(num_topics=50, num_words=10)
for i, topics in enumerate(all_topics):
    # topics[1] is the formatted 'weight*"word" + weight*"word" + ...' string.
    words = topics[1].split("+")
    print(words, "\n")

['0.088*"british" ', ' 0.079*"class" ', ' 0.053*"unit" ', ' 0.044*"rail" ', ' 0.039*"service" ', ' 0.025*"australian" ', ' 0.023*"electric" ', ' 0.021*"website" ', ' 0.016*"multiple" ', ' 0.015*"locomotive"'] 

['0.053*"chemical" ', ' 0.046*"company" ', ' 0.029*"contains" ', ' 0.027*"number" ', ' 0.026*"studio" ', ' 0.022*"compound" ', ' 0.021*"sold" ', ' 0.019*"made" ', ' 0.019*"disney" ', ' 0.019*"acid"'] 

['0.061*"church" ', ' 0.038*"saint" ', ' 0.026*"guitar" ', ' 0.024*"doctor" ', ' 0.024*"pay" ', ' 0.022*"southwest" ', ' 0.020*"catholic" ', ' 0.020*"wwe" ', ' 0.020*"wrestling" ', ' 0.019*"gironde"'] 

['0.041*"food" ', ' 0.041*"hot" ', ' 0.034*"culture" ', ' 0.025*"flag" ', ' 0.025*"england" ', ' 0.021*"oil" ', ' 0.021*"fox" ', ' 0.018*"heat" ', ' 0.018*"ruler" ', ' 0.017*"greater"'] 

['0.038*"web" ', ' 0.037*"earth" ', ' 0.034*"cite" ', ' 0.030*"period" ', ' 0.030*"white" ', ' 0.027*"black" ', ' 0.019*"middle" ', ' 0.018*"mass" ', ' 0.016*"planet" ', ' 0.015*"birth"'] 

['0.06