Please click on the round icone to allow nbviewer and see the LDA graphic. Here -----------------------------------------------------------------> 
<br>If the notebook doesn't load on the first try, please reload.

## Import dependencies
_____________



In [113]:
import re
import pandas as pd
import numpy as np
from pprint import pprint

# Spacy + NLTK
import nltk; nltk.download('stopwords')
import spacy
from spacy.lang.fr import French
from nltk.corpus import stopwords
stop_words = stopwords.words('french')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [114]:
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

### Divide by phrases
_____________

In [115]:
df = pd.read_table("/content/drive/MyDrive/Epitech_DATA-IA/S3_NLP/alice.txt", header=None, sep="delimiter")
print(df)

                                                      0
0                                     CHAPITRE PREMIER.
1                                   AU FOND DU TERRIER.
2     ALICE, assise auprès de sa sœur sur le gazon, ...
3     de rester là à ne rien faire; une ou deux fois...
4     yeux sur le livre que lisait sa sœur; mais quo...
...                                                 ...
2670  étranges, et peut-être même en leur contant le...
2671  Merveilles du temps jadis: elle la voyait part...
2672  chagrins et trouver plaisir à leurs innocentes...
2673         propre enfance et les heureux jours d'été.
2674                                               FIN.

[2675 rows x 1 columns]


  return read_csv(**locals())


### Convert to list 
_________

In [116]:
data = df.values.tolist()

#Check result
pprint(data[:1])

[['CHAPITRE PREMIER.']]


In [117]:
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations 
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  

data_words = list(sent_to_words(data))

print(data_words[:50])

[['chapitre', 'premier'], ['au', 'fond', 'du', 'terrier'], ['alice', 'assise', 'aupres', 'de', 'sa', 'sœur', 'sur', 'le', 'gazon', 'commencait', 'ennuyer'], ['de', 'rester', 'la', 'ne', 'rien', 'faire', 'une', 'ou', 'deux', 'fois', 'elle', 'avait', 'jete', 'les'], ['yeux', 'sur', 'le', 'livre', 'que', 'lisait', 'sa', 'sœur', 'mais', 'quoi', 'pas', 'images', 'pas', 'de'], ['dialogues', 'la', 'belle', 'avance', 'pensait', 'alice', 'qu', 'un', 'livre', 'sans', 'images'], ['sans', 'causeries'], ['elle', 'etait', 'mise', 'reflechir', 'tant', 'bien', 'que', 'mal', 'car', 'la', 'chaleur', 'du'], ['jour', 'endormait', 'et', 'la', 'rendait', 'lourde', 'se', 'demandant', 'si', 'le', 'plaisir', 'de'], ['faire', 'une', 'couronne', 'de', 'marguerites', 'valait', 'bien', 'la', 'peine', 'de', 'se', 'lever', 'et'], ['de', 'cueillir', 'les', 'fleurs', 'quand', 'tout', 'coup', 'un', 'lapin', 'blanc', 'aux', 'yeux', 'roses'], ['passa', 'pres', 'elle'], ['il', 'avait', 'rien', 'la', 'de', 'bien', 'etonnan

### Remove stop words
_________

In [118]:
# Define function for stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]



### Prepare bigrams & trigrams
_________

In [119]:
# Define functions for bigrams and trigrams
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


In [120]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['chapitre', 'premier']


In [121]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

### Lemmatization
_________



In [122]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [123]:
from spacy.lang.fr import French
spacy.cli.download("fr_core_news_md")

nlp = spacy.load('fr_core_news_md', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:100])

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_md')
[['chapitre', 'premier'], ['fond', 'terrier'], ['alice', 'assise', 'aupre', 'commencer', 'ennuyer'], ['faire', 'fois', 'jete'], ['oeil', 'livre', 'lire', 'sœur', 'image'], ['dialogue', 'bel', 'avance', 'alice', 'livre', 'image'], ['causerie'], ['eter', 'mise', 'reflechir', 'tant', 'bien', 'mal', 'chaleur'], ['jour', 'endormir', 'lourd', 'demander', 'plaisir'], ['faire', 'couronne', 'marguerite', 'valoir', 'bien', 'peine', 'lever'], ['cueillir', 'fleur', 'quand', 'coup', 'oeil', 'rose'], ['passer', 'pre'], ['bien', 'etonner', 'alice', 'trouver', 'meme'], ['tre', 'extraordinaire', 'entendre', 'parler', 'lapin', 'dire', 'ah'], ['arriverai', 'trop', 'tard', 'apre', 'bien'], ['etonner', 'moment', 'paraître'], ['naturel', 'cependant', 'venir', 'tirer', 'montre'], ['gousset', 'regarder', 'prendre', 'courir', 'plus', 'bel', 'alice'], ['sauter', 'pieds', 'frappee', 'idee', 'jamais', 

In [124]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
#print(corpus[:20])

### LDA
_________

In [125]:
# Build LDA model
%%capture output
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=300,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)


In [126]:
# Print the Keywords in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.032*"tout" + 0.031*"roi" + 0.027*"trouver" + 0.024*"autour" + '
  '0.022*"vouloir" + 0.020*"tandis" + 0.019*"salle" + 0.016*"suivre" + '
  '0.015*"porte" + 0.015*"chercher"'),
 (1,
  '0.191*"dire" + 0.099*"alice" + 0.082*"faire" + 0.073*"bien" + '
  '0.028*"chapelier" + 0.027*"griffon" + 0.027*"voix" + 0.024*"regarder" + '
  '0.024*"air" + 0.015*"loir"'),
 (2,
  '0.047*"continuer" + 0.044*"peu" + 0.041*"alice" + 0.036*"demander" + '
  '0.035*"jamais" + 0.018*"pouvoir" + 0.014*"nouveau" + 0.013*"question" + '
  '0.012*"avoir" + 0.012*"personne"'),
 (3,
  '0.074*"plus" + 0.045*"aller" + 0.044*"donc" + 0.039*"alors" + 0.034*"oeil" '
  '+ 0.032*"illustration" + 0.020*"encore" + 0.017*"voila" + 0.017*"pauvre" + '
  '0.015*"long"'),
 (4,
  '0.059*"mettre" + 0.041*"non" + 0.034*"falloir" + 0.031*"chose" + '
  '0.025*"commencer" + 0.021*"comment" + 0.019*"enfant" + 0.018*"savoir" + '
  '0.017*"cœur" + 0.017*"maintenant"'),
 (5,
  '0.092*"tout" + 0.054*"tete" + 0.034*"crier" + 0.033*"

In [127]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis