In [2]:
import gensim
import numpy as np
from gensim.utils import smart_open, simple_preprocess
import gensim.models
from gensim.utils import *
import io
import re
import string


# Preprocessing functions (operating on UTF text) for gensim corpus creation,
# using regular expressions and stop-word removal


RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation))
def strip_punctuation(s):
    """Return `s` with each run of `string.punctuation` characters replaced by one space."""
    without_punctuation = re.sub(RE_PUNCT, " ", s)
    return without_punctuation

RE_TAGS = re.compile(r"<([^>]+)>")
def strip_tags(s):
    """Return `s` with every non-empty ``<...>`` tag deleted (nothing is put in its place)."""
    without_tags = re.sub(RE_TAGS, "", s)
    return without_tags


def strip_short(s, minsize=3):
    """Drop every whitespace-separated word of `s` shorter than `minsize` characters.

    The surviving words are re-joined with single spaces.
    """
    kept_words = [word for word in s.split() if not len(word) < minsize]
    return " ".join(kept_words)


RE_NUMERIC = re.compile(r"[0-9]+")
def strip_numeric(s):
    """Return `s` with every run of the digits 0-9 deleted (no replacement character)."""
    without_digits = re.sub(RE_NUMERIC, "", s)
    return without_digits


# re.UNICODE makes \W follow the Unicode definition of a word character. Without it,
# Python 2 treats every non-ASCII letter (é, è, ç, ...) as non-alphanumeric, so accented
# words would be broken apart. On Python 3 str patterns the flag is already the default.
RE_NONALPHA = re.compile(r"\W", re.UNICODE)
def strip_non_alphanum(s):
    """Return `s` with every non-word character replaced by a space.

    Letters (including accented ones), digits and the underscore are kept; each other
    character is replaced individually, so runs are not collapsed.
    """
    return RE_NONALPHA.sub(" ", s)


# re.UNICODE lets \s also match non-ASCII whitespace such as the no-break space (U+00A0).
# Without the flag, Python 2 only matches ASCII whitespace. On Python 3 str patterns the
# flag is already the default.
RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)
def strip_multiple_whitespaces(s):
    """Return `s` with every run of whitespace collapsed to a single space."""
    return RE_WHITESPACE.sub(" ", s)


RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)")
RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)")
def split_alphanum(s):
    """Insert a space wherever lowercase a-z letters touch digits, in either order.

    The letters-then-digits boundaries are split first, then the digits-then-letters ones.
    """
    for boundary in (RE_AL_NUM, RE_NUM_AL):
        s = boundary.sub(r"\1 \2", s)
    return s


# Location of the whitespace-separated stop-word list.
STOPWORD_PATH = '/home/hqxor/Desktop/AIC/REI/Projet/text/frSTOPWORD.txt'

# Read the file inside a `with` block so the handle is closed as soon as it has been read.
# NOTE(review): no encoding is passed, so the file is decoded with the platform default —
# confirm this matches the encoding of the stop-word file.
with io.open(STOPWORD_PATH) as stopword_file:
    mySTOPWORDFile = stopword_file.read()

# One entry per whitespace-separated token of the file, in file order.
mySTOPWORDS = mySTOPWORDFile.split()


def remove_stopwords(s):
    """Return `s` with every whitespace-separated word found in `mySTOPWORDS` removed.

    Words are compared by exact match and the surviving words are re-joined with
    single spaces.
    """
    # Build the lookup set once per call. Set membership avoids rescanning the whole
    # stop-word list for every word, and still reflects the current content of
    # `mySTOPWORDS` at call time.
    stopwords = frozenset(mySTOPWORDS)
    return " ".join(w for w in s.split() if w not in stopwords)


def tokenize(text):
    """Return the tokens that `preprocess_string` produces for `text`, as a new list."""
    return list(preprocess_string(text))

# Filters applied in this order by preprocess_string() when no other list is given.
DEFAULT_FILTERS = [
    lambda x: x.lower(),
    strip_tags,
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
]
def preprocess_string(s, filters=DEFAULT_FILTERS):
    """Run `s` through each callable of `filters` in order and split the result.

    Each filter receives the output of the previous one; the final string is split on
    whitespace and the resulting list of tokens is returned.
    """
    processed = s
    for apply_filter in filters:
        processed = apply_filter(processed)
    tokens = processed.split()
    return tokens


In [53]:
# Collect the path of every file found under the corpus directory, sub-directories included.
# `os` is imported explicitly: `from os import walk` only binds the name `walk`, while the
# code below calls os.walk and os.path.join.
import os
from os import walk

# Root directory of the corpus to scan.
CORPUS_DIR = '/home/hqxor/Desktop/AIC/REI/Projet/text/2015/01/01020304/'

fileList = [
    os.path.join(root, filename)
    for root, directories, filenames in os.walk(CORPUS_DIR)
    for filename in filenames
]

In [54]:
def get_texts(fileList):
    """Yield the token list of each file named in `fileList`, one file at a time.

    Each file is opened with io.open, read in full, and passed through `tokenize`.
    """
    for filename in fileList:
        # The `with` block closes each file as soon as it has been read, so handles do
        # not pile up while iterating over the corpus.
        with io.open(filename) as corpus_file:
            file_contents = corpus_file.read()
        yield tokenize(file_contents)


# Consume the generator once and keep every tokenized document in memory.
texts = get_texts(fileList)
listText = list(texts)
    

In [55]:
# Show the total number of text files as a one-element shape tuple.
# len() is used instead of np.shape() so the corpus is not converted to a NumPy array
# just to be counted, and the result does not depend on the documents' lengths.
print((len(listText),))

(6451,)


In [56]:
# Build the gensim dictionary from the tokenized documents.
id2word = gensim.corpora.Dictionary(listText)
# Print the total number of unique words as a one-element shape tuple; len() asks the
# dictionary for its size directly instead of letting np.shape() convert it to an array.
print((len(id2word),))
# Create the bag-of-words corpus: one doc2bow() result per tokenized document.
mm = [id2word.doc2bow(text) for text in listText]

(64957,)


In [57]:
# Train the LDA topic model with gensim.
# The number of topics and the number of passes are set explicitly so that the algorithm
# converges even with a small number of documents.
# This step is time consuming with a big corpus or a large number of topics.
# NOTE(review): no random seed is passed here, so the topics may differ between runs.
lda = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=id2word,
    num_topics=50,
    passes=10,
)

In [7]:
# you can use these gensim functions to persist your model instead of recomputing it each time
#lda.save('12345.model')
#lda = gensim.models.ldamodel.LdaModel.load('12345.model')

In [58]:
# This is the part where all the variables needed by the visualization are computed
# automatically by gg.prepare.
# vis_data is the object to pass to a display function to plot the visualization here in the
# notebook, or to pyLDAvis.save_html() to have it converted directly to HTML.
# `import pyLDAvis.gensim as gg` binds only the name `gg`, so the top-level package is
# imported explicitly for the later pyLDAvis.display / pyLDAvis.save_html calls.
import pyLDAvis
import pyLDAvis.gensim as gg
import gensim
from gensim.matutils import *
vis_data = gg.prepare(lda, mm, id2word)


matrix size (64957, 6451)
this is your topicdists shape (6451, 50)


In [59]:
# Plot the prepared visualization here in the notebook.
pyLDAvis.display(vis_data)

In [60]:
# Write the prepared visualization to an HTML file.
pyLDAvis.save_html(vis_data,"5premiersJoursJanvier2015_50topics.html")

In [44]:
# Example of the raw output of an LDA topic model: one line per topic, listing its words.
for topic_number, topic in enumerate(lda.show_topics(num_topics=50, num_words=16, formatted=False), 1):
    # Each entry of `topic` unpacks as a pair whose second item is the word to print
    # (the first is presumably its weight). The loop names avoid shadowing the builtin `id`.
    words = [word for _weight, word in topic]
    # One print(...) call per topic is valid on Python 2 and 3. The trailing empty item
    # keeps the space that ended each line of the original output.
    print(" ".join(["Topic #" + str(topic_number) + ":"] + words + [""]))
        

Topic #1: tunisie pegida musée tunis viande mouvement rue quartier bardo touristes grotte cheval payet dresde grass ville 
Topic #2: travail syndicats salariés entreprises cgt routiers négociations salaires mise patronat social grève transport cfdt salaire janvier 
Topic #3: millions milliards groupe dollars d’euros monde prix annoncé capital total chiffre euros milliard production avril nucléaire 
Topic #4: pen marine jean marie national front parti région maréchal marion régionales présidente liste avril président fille 
Topic #5: club match france finale coupe football ligue joueurs monde équipe face stade psg saison championnat champions 
Topic #6: femmes club féminin ans hommes foot football allemagne années ville gazprom joueuses sport allemand robert clubs 
Topic #7: departementales canton resultats elections départementales carte monde élections departementale resultat départements saint sûre décodeurs source contexte 
Topic #8: monde australie mondial ans tennis open tournoi a

In [14]:
# Print the formatted topics returned by show_topics() with its default arguments.
# The parenthesized form prints the same text on Python 2 and is also valid on Python 3.
print(lda.show_topics())

 [u'0.013*pays + 0.012*pr\xe9sident + 0.012*ministre + 0.009*parti + 0.008*politique + 0.008*janvier + 0.006*gouvernement + 0.005*lors + 0.005*europ\xe9enne + 0.004*d\xe9clar\xe9', u'0.027*dakar + 0.016*\xe9tape + 0.013*aires + 0.013*buenos + 0.011*sec + 0.011*mini + 0.009*min + 0.009*d\xe9part + 0.008*minutes + 0.008*carlos', u'0.027*http + 0.017*lepoint + 0.013*favoris + 0.012*mons + 0.012*retweets + 0.009*beffroi + 0.008*auray + 0.007*lpnt + 0.006*favori + 0.006*retweet', u'0.016*ann\xe9e + 0.009*d\u2019un + 0.008*janvier + 0.008*place + 0.007*mgr + 0.007*monde + 0.006*l\u2019ann\xe9e + 0.006*pape + 0.006*mois + 0.006*c\u2019est', u'0.023*m\xe9decins + 0.021*gr\xe8ve + 0.011*sant\xe9 + 0.011*carte + 0.011*syndicats + 0.010*soins + 0.010*vitale + 0.009*patients + 0.007*feuilles + 0.007*projet', u'0.022*janvier + 0.019*article + 0.013*abonn\xe9s + 0.012*suite + 0.011*r\xe9actions + 0.011*lundi + 0.011*ligne + 0.011*r\xe9serv\xe9e + 0.009*bloqu\xe9la + 0.009*mis', u'0.030*coupe + 0.025