In [1]:
#comment out if using a local environment
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd "gdrive/My Drive/TopicModelling2/TopicModelling"

/content/gdrive/My Drive/TopicModelling2/TopicModelling


In [3]:
#download spacy's french language model and pyLDAvis for Topic Modelling visualization
!python -m spacy download fr_core_news_sm
!pip install pyLDAvis

In [4]:
import pandas as pd
import numpy as np
import warnings
from collections import Counter

#from utilsTM import corpusVisualizer, Output, deTokenize, compareCorpuses #Output is from ipywidgets
from utilsTM import *

from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.corpora.dictionary import Dictionary

import pyLDAvis
import pyLDAvis.gensim #this is needed

import spacy

#import nltk

### Aim Of The Project
The aim of this notebook is to perform topic modelling on a corpus of company descriptions extracted from the [Commercial Register of Geneva](https://www.ge.ch/recherche-entreprises-registre-du-commerce-geneve).  
This could be used to analyze the economic landscape of the city and to build similarities from the topics embeddings.  
*(Please refer to the commercial register of Geneva for information on restrictions of use of the data)*

### Preprocessing the corpus
To get meaningful topics we need to remove words which are not relevant to categorize company descriptions.  
The steps are the following (the order can be changed):


1.   Remove punctations, lower case, accents ...
2.   Remove links, frequent unuseful information
3.   Tokenize
4.   Remove stop words, named entities
5.   Remove words other than noun, adjectif, verb or adverb
6.   Lemmatize
7.   Build bi-grams and tri-grams
8.   Remove frequent words, list of manually chosen words, verbs 

Not that the last step is specific to this task.



In [9]:
warnings.filterwarnings(action="ignore")

In [5]:
#read the data from a csv file. 
corpus0 = pd.read_csv("CommercialRegisterData.csv").Objet.to_list()

In [6]:
print("data size: ",len(corpus0))

data size:  9996


In [11]:
# we will use a small UI to vizualize the corpus more easily, as we go along the different preprocessing steps
Out0 = Output()
corpusVisualizer(corpus0,Out0)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

In [12]:
#1. a lot of doc contain (cf. statuts pour but complet), (cf. but de l'établissement principal), ... at the end of the description. We remove them. 
corpus1 = [doc[:doc.find("(cf")] for doc in corpus0]

In [13]:
Out1 = Output()
corpusVisualizer(corpus1,Out1)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

In [21]:
#2. Perform a number of preliminary preprocessing steps using gensim's simple_preprocess() (remove punctations, lowercase etc...).
corpus2 = [simple_preprocess(doc,deacc=False,max_len=30) for doc in corpus1]

In [22]:
#deTokenize is a utility function which converts the tokens back to a string.
Out2 = Output()
corpusVisualizer(deTokenize(corpus2),Out2)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

In [49]:
#3. a. Remove stopwords, lemmatize (this might take a bit more time)
#load a spaCy french language model
nlp = spacy.load("fr_core_news_sm")
keepPos = ['NOUN', 'ADJ', 'VERB', 'ADV']
corpus3 = []
for doc2 in deTokenize(corpus2):
  doc2 = nlp(doc2) #perform text analysis on "doc" with the model from spacy.
  doc3 = [token.lemma_ for token in doc2 if not token.is_punct and not token.is_stop and (token.pos_ in keepPos) and not token.ent_type]
  corpus3.append(doc3)


In [25]:
Out3 = Output()
corpusVisualizer(deTokenize(corpus3),Out3)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

In [74]:
with open(file="frenchVerbs.txt",mode="r") as f:
  FRENCHVERBS = list(f.readlines())
  FRENCHVERBS = [verb.replace("\n","").strip() for verb in FRENCHVERBS]
corpus4 = [[word for word in doc if word not in FRENCHVERBS] for doc in corpus3]

In [59]:
#4. construct bigrams tokens, using the Phrases model from gensim
#corpus4 = list(Phrases(sentences=corpus3,min_count=2)[corpus4])

In [None]:
# detect trigrams
#corpus4 = list(Phrases(sentences=corpus4,min_count=2)[corpus4])

In [78]:
#b. Remove accents
corpus4 = [simple_preprocess(doc,deacc=True,max_len=30) for doc in deTokenize(corpus4)]

In [62]:
Out4 = Output()
corpusVisualizer(deTokenize(corpus4),Out4)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

In [None]:
cnt = Counter([word for doc in corpus4 for word in doc])
cnt.most_common(63)

In [81]:
#5. Remove pervasive words, which are present in most of the documents and are noise from a domain specific vocabulary (business) rather than discriminative features
#a. Take the most frequent word, manually set the threshold for the most common word number and keep some words which might be due to a frequent topic
cnt = Counter([word for doc in corpus4 for word in doc])
keep = ["financier","immobilier","courtage","batiment","materiel","importation_exportation","informatique","bien_immobilier","investissement","industriel","public"]
remove = [k for k,_ in cnt.most_common(63) if k not in keep]
manual = ["lfaie","but","pays","genre","aupres","groupe","lien","principalement","commercer","rapport","edition","geneve","premier","tiers"] #remove adverbes...
remove = remove + manual
#b. Automatically detect topics that are domain specific and not disciminative, by looking at the entropy of topic distributions

#remove words
corpus5 = [[word for word in doc if word not in remove] for doc in corpus4]

In [None]:
#corpus4 = list(Phrases(sentences=corpus3,min_count=2)[corpus4])

In [79]:
#To compare corpuses
Out = Output()
compareCorpuses(corpus0,corpus4,Out)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

In [66]:
#Final corpus remove empty docs
corpusFinal = [doc for doc in corpus5 if doc]

In [67]:
id2word = Dictionary(documents=corpusFinal)

In [68]:
corpusBow = [id2word.doc2bow(doc) for doc in corpusFinal]

In [43]:
#without removing frequent words, coherence 0.58, 15 topics

In [71]:
ntopics = 17
rs = 100
model = LdaModel(corpus=corpusBow,num_topics=ntopics,id2word=id2word,chunksize=2000,
                 passes=20,minimum_probability=0.0001,random_state=rs,per_word_topics=False)
coherence = CoherenceModel(model=model,texts=corpusFinal,dictionary=id2word).get_coherence()
print("Average correlation between multi-words in topics",round(coherence,4))

Average correlation between multi-words in topics 0.4654


In [72]:
#80 not so bad,10 not bad, 15 0.5 !!! #sometimes bigrams are confusing the thign
#model.save(fname="model80topics")

In [73]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpusBow, id2word)
vis

In [None]:
from gensim import similarities
index = similarities.MatrixSimilarity(model[corpusBow])

In [None]:
sims = index[vec_lsi]

In [None]:
vec_lsi

[(0, 0.02222222),
 (1, 0.02222222),
 (2, 0.02222222),
 (3, 0.02222222),
 (4, 0.02222222),
 (5, 0.02222225),
 (6, 0.02222222),
 (7, 0.02222222),
 (8, 0.02222222),
 (9, 0.02222222),
 (10, 0.02222222),
 (11, 0.02222222),
 (12, 0.02222222),
 (13, 0.3555555),
 (14, 0.3555556)]

In [None]:
sorted(enumerate(sims), key=lambda item: -item[1])

In [None]:
doc = "horlogerie metal precieux"
vec_bow = id2word.doc2bow(doc.lower().split())
vec_lsi = model[vec_bow]  # convert the query to LSI space
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for i, s in sims[:200]:
    print(i,s, corpus0[i])