In [123]:

# Some code is inspired by:
# https://github.com/scikit-learn/scikit-learn/blob/master/examples/applications/plot_topics_extraction_with_nmf_lda.py
# which is available under the following license: BSD 3-Clause

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import Stemmer

from string import punctuation


# Language name handed to `Stemmer.Stemmer` (through `lemmatize`) to select the stemming rules.
language = 'french'


In [124]:

# Toy French corpus: one short sentence per document.
# NOTE(review): "troixième" looks like a typo for "troisième"; it is left as-is
# because the recorded outputs in this notebook were produced from this exact text.
sample_dataset = [
    "Ceci est un certain texte,",
    "Ceci est un autre texte.",
    "Et en voici un troixième.",
    "Et un quatrième.",
]


def lemmatize(document, language):
    """Lowercase a document and reduce each of its tokens to its stem.

    Every character of ``string.punctuation`` is isolated as a token of its
    own, the text is split on whitespace, each token is passed through
    ``Stemmer.Stemmer(language).stemWords`` and the results are joined back
    with single spaces.

    Note that, despite the function name, the reduction is performed by a
    stemmer (``stemWords``), so the returned tokens are stems.

    Parameters
    ----------
    document : str
        Raw text of one document.
    language : str
        Language name forwarded to ``Stemmer.Stemmer``.

    Returns
    -------
    str
        Space-separated stemmed tokens; an empty string when ``document``
        contains no tokens.
    """
    # Surround each punctuation mark with spaces in a single pass so that it
    # becomes a standalone token once the text is split.
    padded_punctuation = str.maketrans(
        {mark: " {} ".format(mark) for mark in punctuation}
    )
    spaced_document = document.lower().translate(padded_punctuation)

    # `split()` without an argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty tokens. Collapsing double spaces
    # a fixed number of times and splitting on " " left empty tokens behind
    # for long runs of spaces and did not separate words joined by tabs or
    # newlines.
    words_or_punct = spaced_document.split()

    stemmer = Stemmer.Stemmer(language)
    stemmed_words = stemmer.stemWords(words_or_punct)

    return " ".join(stemmed_words)


# Run every sample document through `lemmatize`, preserving the corpus order.
lemmatized_dataset = [lemmatize(text, language) for text in sample_dataset]

# Show each original sentence next to its processed form.
for original, processed in zip(sample_dataset, lemmatized_dataset):
    print(original, " | ", processed)


Ceci est un certain texte,  |  cec est un certain text ,
Ceci est un autre texte.  |  cec est un autr text .
Et en voici un troixième.  |  et en voic un troixiem .
Et un quatrième.  |  et un quatriem .


In [125]:

# Settings for the vectorizer and the topic model.
max_vocab_size = 10000
n_topics = 2
ngram_range = (1, 3)  # Unigrams, bigrams and trigrams are all counted.


print("Features for LDA are Term Frequencies (TF) of the lemmatized dataset...")
# `min_df` / `max_df` bound the document frequency of the terms that are kept
# (see the CountVectorizer documentation for their exact meaning).
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=max_vocab_size,
    ngram_range=ngram_range,
)
tf = tf_vectorizer.fit_transform(lemmatized_dataset)

print("Fiting the LDA on the data...")
# `fit` returns the estimator itself, so construction and fitting are chained.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,  # Fixed seed so the fitted topics are reproducible.
).fit(tf)


print("Topics in the LDA model:")
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic of ``model``.

    For each row of ``model.components_``, the ``n_top_words`` largest entries
    are selected (largest first) and the matching ``feature_names`` are
    printed on one line, each wrapped in single quotes.
    """
    for topic_index, weights in enumerate(model.components_):
        # `argsort` is ascending: reverse it, then keep the leading entries.
        best_feature_ids = weights.argsort()[::-1][:n_top_words]
        quoted_names = ["'" + feature_names[j] + "'" for j in best_feature_ids]
        print("topic #{}:".format(topic_index), " ".join(quoted_names))
# `CountVectorizer.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `get_feature_names_out`. Use the new accessor
# when it exists and fall back to the old one on earlier releases. The result
# is wrapped in `list` so `tf_feature_names` remains a plain list of strings.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words=3)

Features for LDA are Term Frequencies (TF) of the lemmatized dataset...
Fiting the LDA on the data...
Topics in the LDA model:
topic #0: 'et' 'cec est un' 'cec'
topic #1: 'est' 'text' 'cec est'


In [129]:
# Raw weights of the fitted model: one row per topic, one column per entry of
# `tf_feature_names` (this is the array `print_top_words` ranks row by row).
lda.components_

array([[1.05844558, 0.94007403, 1.06608669, 0.82795306, 0.9037063 ,
        1.38864166, 0.9684819 ],
       [1.35535886, 1.39141826, 1.37174657, 1.48673973, 1.34355707,
        0.74135258, 1.40593732]])

In [121]:
# Vocabulary kept by the vectorizer; `print_top_words` uses these names to
# label the columns of `lda.components_`.
print(tf_feature_names)

['cec', 'cec est', 'cec est un', 'est', 'est un', 'et', 'text']
