# Topic Modeling Morocco World News

## Importation des données

In [154]:
import pandas as pd

In [None]:
morocco_world_news = pd.read_csv('morocco_world_news_articles.csv', engine='python', error_bad_lines=False)

In [156]:
morocco_world_news.head()

Unnamed: 0.1,Unnamed: 0,category,content
0,0,politics,Rabat - A confidential report from NATO has ex...
1,1,politics,Rabat - Top security officials from Morocco an...
2,2,politics,Rabat - Indie-rock band Big Thief has announce...
3,3,politics,Rabat - The European Union has called on Alger...
4,4,politics,Rabat - Spain regrets Algeria’s decision to su...


## Installation des packages

In [None]:
!pip install nltk
!pip install gensim

## Importation des librairies

In [158]:
import numpy as np

In [159]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
nltk.download('wordnet')
nltk.download('omw-1.4')

## Fonction de preprocessing

* on supprime tous les stopwords (this, that, where...)
* on supprime les mots de moins de 3 lettres
* on applique la lemmatisation

In [161]:
stemmer = SnowballStemmer('english')

In [162]:
def lemmatize_stemming(text) :
  return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='n'))

In [163]:
def preprocess(text) :
  result = []
  for token in gensim.utils.simple_preprocess(text) :
    if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3 :
      result.append(lemmatize_stemming(token))
  return result

## Prepocessing des données

* on supprime les valeurs NAN de nos données
* on applique la fonction de preprocessing

In [164]:
morocco_world_news.dropna(subset = ["content"], inplace=True)

In [165]:
processed_docs = [preprocess(doc) for doc in morocco_world_news['content']]

In [166]:
processed_docs[10][:10]

['rabat',
 'spanish',
 'interior',
 'minist',
 'fernando',
 'grand',
 'marlaska',
 'reiter',
 'countri',
 'commit']

## Stockage des données après preprocessing

* on utilise un dictionnaire qui contient le mot comme clé et son nombre d'occurences comme valeur

In [167]:
dictionary = gensim.corpora.Dictionary(processed_docs)

## Nettoyage du dictionnaire

* on supprime les mots trop rares qui appraîssent moins de 15 fois
* on supprime les mots trop fréquents qui apparaîssent dans plus de 10% des documents
* à la fin on ne garde que les 100 000 mots les plus fréquents 

In [168]:
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

## Conversion en Bag-Of-Words

* on convertit notre dictionnaire en couple mot et nombres d'occurrences : format bag-of-words

In [169]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [170]:
for i in range(10) :
    print("Word {} (\"{}\") appears {} time.".format(bow_corpus[10][i][0], dictionary[bow_corpus[10][i][0]], bow_corpus[10][i][1]))

Word 6 ("amid") appears 1 time.
Word 8 ("autonomi") appears 1 time.
Word 10 ("basi") appears 1 time.
Word 22 ("credibl") appears 1 time.
Word 25 ("disput") appears 1 time.
Word 27 ("endors") appears 1 time.
Word 38 ("immedi") appears 1 time.
Word 43 ("madrid") appears 3 time.
Word 47 ("outlet") appears 1 time.
Word 60 ("sanchez") appears 3 time.


## Exécution du LDA

* LdaMulticore pour utiliser tout les coeurs du CPU afin de gagner en temps d'exécution

* num_topics : nombre de topic à extraire du corpus

* id2word : mapping des identifiants de mots (entiers) aux mots (chaînes de caractères)

* passes : nombre d'itération d'entraînement sur le corpus

In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = 10, id2word = dictionary, passes = 1000)

In [172]:
topics = []
for idx, topic in lda_model.print_topics(-1) :
    print("Topic: {} -> Words: {}".format(idx, topic))
    topics.append(topic)

Topic: 0 -> Words: 0.026*"onlin" + 0.025*"digit" + 0.025*"exam" + 0.022*"candid" + 0.020*"internet" + 0.019*"network" + 0.017*"port" + 0.015*"baccalaur" + 0.015*"facebook" + 0.014*"survey"
Topic: 1 -> Words: 0.017*"polisario" + 0.013*"spanish" + 0.009*"trump" + 0.008*"resolut" + 0.006*"migrant" + 0.006*"migrat" + 0.005*"vote" + 0.005*"secretari" + 0.005*"autonomi" + 0.005*"sovereignti"
Topic: 2 -> Words: 0.029*"learner" + 0.016*"classroom" + 0.014*"method" + 0.011*"grammar" + 0.011*"text" + 0.010*"target" + 0.009*"reader" + 0.009*"facebook" + 0.008*"comprehens" + 0.007*"border"
Topic: 3 -> Words: 0.008*"discours" + 0.007*"attitud" + 0.007*"book" + 0.006*"concept" + 0.006*"influenc" + 0.006*"theori" + 0.006*"principl" + 0.006*"ident" + 0.006*"linguist" + 0.005*"religion"
Topic: 4 -> Words: 0.014*"vaccin" + 0.011*"bank" + 0.010*"agricultur" + 0.009*"travel" + 0.008*"price" + 0.008*"tourism" + 0.008*"export" + 0.007*"food" + 0.007*"test" + 0.007*"flight"
Topic: 5 -> Words: 0.015*"parent" 

## Cohérence du topic model

* Les mesures de cohérence évaluent le degré de similitude sémantique entre les mots les mieux notés dans le topics
* Ces mesures aident à faire la distinction entre les topics sémantiquement interprétables et les topics dû à des inférences statistiques
* Pour un bon modèle LDA la cohérence doit être comprise entre 0.4 et 0.7 au delà le modèle est probablement erroné

In [173]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5129831809687769


## Stockage des résultats

In [174]:
all_topic_model = []
for i in range(len(topics)):
  str = topics[i].split(' + ')
  topic_model = []
  for j in range(10):
    weight = str[j][0:5]
    word = str[j][7:len(str[j])-1]
    topic_model.append((weight, word))
  all_topic_model.append(topic_model)

In [175]:
df_topic_model = pd.DataFrame(all_topic_model)
df_topic_model.rename(index = {0: "Topic 1", 1: "Topic 2", 2: "Topic 3", 3: "Topic 4", 4: "Topic 5", 5: "Topic 6", 6: "Topic 7", 7: "Topic 8", 8: "Topic 9", 9: "Topic 10"}, inplace = True)

In [176]:
df_topic_model

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic 1,"(0.026, onlin)","(0.025, digit)","(0.025, exam)","(0.022, candid)","(0.020, internet)","(0.019, network)","(0.017, port)","(0.015, baccalaur)","(0.015, facebook)","(0.014, survey)"
Topic 2,"(0.017, polisario)","(0.013, spanish)","(0.009, trump)","(0.008, resolut)","(0.006, migrant)","(0.006, migrat)","(0.005, vote)","(0.005, secretari)","(0.005, autonomi)","(0.005, sovereignti)"
Topic 3,"(0.029, learner)","(0.016, classroom)","(0.014, method)","(0.011, grammar)","(0.011, text)","(0.010, target)","(0.009, reader)","(0.009, facebook)","(0.008, comprehens)","(0.007, border)"
Topic 4,"(0.008, discours)","(0.007, attitud)","(0.007, book)","(0.006, concept)","(0.006, influenc)","(0.006, theori)","(0.006, principl)","(0.006, ident)","(0.006, linguist)","(0.005, religion)"
Topic 5,"(0.014, vaccin)","(0.011, bank)","(0.010, agricultur)","(0.009, travel)","(0.008, price)","(0.008, tourism)","(0.008, export)","(0.007, food)","(0.007, test)","(0.007, flight)"
Topic 6,"(0.015, parent)","(0.012, classroom)","(0.009, exam)","(0.007, applic)","(0.006, write)","(0.005, answer)","(0.005, graduat)","(0.005, degre)","(0.005, motiv)","(0.005, grade)"
Topic 7,"(0.007, violenc)","(0.005, sexual)","(0.004, polic)","(0.004, love)","(0.004, girl)","(0.004, religion)","(0.004, victim)","(0.003, protest)","(0.003, stori)","(0.003, street)"
Topic 8,"(0.015, rank)","(0.011, innov)","(0.010, youth)","(0.009, confer)","(0.008, competit)","(0.006, british)","(0.006, team)","(0.006, vocat)","(0.006, professor)","(0.005, engin)"
Topic 9,"(0.013, algerian)","(0.012, israel)","(0.008, terror)","(0.007, iran)","(0.007, regim)","(0.007, terrorist)","(0.007, isra)","(0.006, democraci)","(0.006, saudi)","(0.006, west)"
Topic 10,"(0.021, mother)","(0.016, tongu)","(0.013, literaci)","(0.011, tamazight)","(0.010, curriculum)","(0.008, linguist)","(0.007, amazigh)","(0.007, unesco)","(0.007, programm)","(0.007, coloni)"


In [177]:
df_topic_model.to_csv('topic_model_morocco_world_news.csv')

## Visualisation des résultats

In [None]:
!pip install pyLDAvis

In [179]:
import pyLDAvis.gensim_models

In [180]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
