In [1]:
#comment out if using a local environment
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
cd "gdrive/My Drive/TopicModelling2/TopicModelling"

/content/gdrive/My Drive/TopicModelling2/TopicModelling


In [None]:
#download spacy's french language model and pyLDAvis for Topic Modelling visualization
!python -m spacy download fr_core_news_sm
!pip install pyLDAvis

In [4]:
import pandas as pd
import numpy as np
import warnings
from collections import Counter

from utilsTM import *

from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases
from gensim.models.ldamodel import LdaModel, CoherenceModel
from gensim.corpora.dictionary import Dictionary

import pyLDAvis
import pyLDAvis.gensim #this is needed

import spacy

#import nltk

### Aim Of The Project
The aim of this notebook is to perform topic modelling on a corpus of company descriptions extracted from the [Commercial Register of Geneva](https://www.ge.ch/recherche-entreprises-registre-du-commerce-geneve).  
This could be used to analyze the economic landscape of the city and to build similarities from the topics embeddings.  
*(Please refer to the commercial register of Geneva for information on restrictions of use of the data)*

### Preprocessing the corpus
To get meaningful topics we need to remove words which are not relevant to categorize company descriptions.  
The steps are the following (the order can be changed):


1.   Remove punctations, lower case, accents ...
2.   Remove links, frequent unuseful information
3.   Tokenize
4.   Remove stop words, named entities
5.   Remove words other than noun, adjectif, verb or adverb
6.   Lemmatize
7.   Build bi-grams and tri-grams
8.   Remove frequent words, list of manually chosen words, verbs 

Not that the last step is specific to this task.



In [5]:
#remove deprecation warning messages from ipywidgets
warnings.filterwarnings(action="ignore")

#TRY LDA FROM PYSPARK MAYBE YOU CAN GET BETTER RESULTS...

Preprocessing step functions

In [6]:
# a list of processing steps which can be used as in different order in a processing pipeline, these functions are partly reusable.

def removeCfs(corpus):
  '''a lot of doc contain (cf. statuts pour but complet), (cf. but de l'établissement principal), ... at the end of the description. We remove them.'''
  corpus = [doc[:doc.find("(cf")] for doc in corpus]
  return corpus

def gensimPreprocess(corpus):
  '''perform a number of preliminary preprocessing steps using gensim's simple_preprocess() (remove punctations, lowercase etc...).'''
  corpus = [simple_preprocess(doc,deacc=False,max_len=30) for doc in corpus]
  return corpus

def spacyPreprocess(corpus):
  ''''Remove stopwords, punctation, named entities, words that are not a noun, adjectif, verb or adverb, lemmatize.'''
  nlp = spacy.load("fr_core_news_sm") #load a spaCy french language model
  keepPos = ['NOUN', 'ADJ', 'VERB', 'ADV']
  corpusNew = []
  for doc in deTokenize(corpus):
    doc = nlp(doc) #perform text analysis on "doc" with the model from spacy.
    doc = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop and (token.pos_ in keepPos) and not token.ent_type]
    corpusNew.append(doc)
  
  return corpusNew

def removeVerbs(corpus):
  '''Remove french verbs, spacy POS for verbs was not good, so we had to scrape a full list of french verbs (refer to the notebook ScrapingAllFrenchVerbs.ipynb).'''
  with open(file="frenchVerbs.txt",mode="r") as f:
    FRENCHVERBS = list(f.readlines())
    FRENCHVERBS = [verb.replace("\n","").strip() for verb in FRENCHVERBS]
  corpus = [[word for word in doc if word not in FRENCHVERBS] for doc in corpus]
  return corpus

def constructBigrams(corpus):
  '''Extract frequent set of two words.'''
  corpus = list(Phrases(sentences=corpus,min_count=2)[corpus])
  return corpus

def removeAccents(corpus):
  corpus = [simple_preprocess(doc,deacc=True,max_len=30) for doc in deTokenize(corpus)]
  return corpus
  
def removeNonDiscriminative(corpus):
  '''Remove word, which are present in most of the documents and are noise from a domain specific vocabulary (business) rather than discriminative features.'''
  #Take the most frequent word, manually set the threshold for the most common word number and keep some words which might be due to a frequent topic
  cnt = Counter([word for doc in corpus for word in doc])
  keep = ["financier","immobilier","courtage","batiment","materiel","construction","importation_exportation",
        "informatique","bien_immobilier","investissement","industriel","public","prohiber_lfaie","financement"]
  #remove 63 most common
  remove = [k for k,_ in cnt.most_common(63) if k not in keep]
  manual = ["but","pays","genre","aupres","groupe","lien","principalement","commercer","rapport","edition","geneve","premier","tiers"] #a list of unwanted words
  remove = remove + manual

  #remove words
  corpus = [[word for word in doc if word not in remove] for doc in corpus]
  
  return corpus

def removeEmpty(corpus):
  return [doc for doc in corpus if doc]

Preprocessing

In [7]:
#read the data from a csv file. The data was extracted from the commercial register of Geneva, and consists on company descriptions in French.
corpus0 = pd.read_csv("CommercialRegisterData.csv").Objet.to_list()

#For this first experiment we consider only 10'000 descriptions, but the register contains up to 50'000 companies.
print("Corpus size :",len(corpus0))

Corpus size : 9996


In [8]:
# we will use a small UI to vizualize the corpus more easily, as we go along the different preprocessing steps
Out0 = Output()
corpusVisualizer(corpus0,Out0)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

In [9]:
#the pipeline of preprocessing steps
preprocessingSteps = [removeCfs, gensimPreprocess, removeVerbs,spacyPreprocess, removeAccents, constructBigrams, removeNonDiscriminative, removeEmpty]

#Probably inefficient for large datasets
corpuses = [corpus0]
for step in preprocessingSteps:
  corpuses.append(step(corpuses[-1]))

In [10]:
corpusFinal = corpuses[-1]

In [13]:
Out = Output()
compareCorpuses(corpus0,corpusFinal,Out)

VBox(children=(HBox(children=(FloatText(value=0.0, step=1.0), Button(description='Show', style=ButtonStyle()),…

### Fitting the model

In [14]:
id2word = Dictionary(documents=corpusFinal)

In [15]:
corpusBow = [id2word.doc2bow(doc) for doc in corpusFinal]

In [16]:
ntopics = 15
rs = 100
model = LdaModel(corpus=corpusBow,num_topics=ntopics,id2word=id2word,chunksize=2000,
                 passes=20,minimum_probability=0.0001,random_state=rs,per_word_topics=False)
coherence = CoherenceModel(model=model,texts=corpusFinal,dictionary=id2word).get_coherence()
print("Average correlation between multi-words in topics",round(coherence,4))

Average correlation between multi-words in topics 0.4542


In [17]:
#visualize the topics with pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpusBow, id2word)
vis

### Building a similarity matrix to perform queries
(Next step)

In [None]:
from gensim import similarities
index = similarities.MatrixSimilarity(model[corpusBow])