In [1]:
# import libraries 
import numpy as np
import pandas as pd 

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# loading data 
data = pd.read_csv('pure_df.csv')

# extract year from datetime
data['year'] = data['date'].apply(lambda x: x.split('-')[0])

# remove unnamed column
data = data.drop(columns=['Unnamed: 0'])

# view datafraem
data.head(3)

Unnamed: 0,meeting_text,file_name,date,year
0,"Good morning, ladies and gentlemen. I welcome ...",20211104_Hannover_Rueck_SE-_Earnings_Call_2021...,2021-11-04,2021
1,Well. Good morning to all of you. Welcome to H...,20191023_Hannover_Rueck_SE-_Shareholder_Mtg_Ca...,2019-10-23,2019
2,"Good morning, ladies and gentlemen, and welcom...",20150506_Hannover_Rueck_SE-_Earnings_Call_2015...,2015-05-06,2015


In [3]:
# define a function to create a dataframe that detects each year and creates the corresponding df
def create_year_df(data, year):
    df = data[data['year'] == year]
    return df

In [4]:
# create different dataframe containing only the year and the text
data_year = data[['year', 'meeting_text']]

# create one dataframe for each year
data_year_1 = data_year[data_year['year'] == '2018']
data_year_2 = data_year[data_year['year'] == '2019']
data_year_3 = data_year[data_year['year'] == '2020']
data_year_4 = data_year[data_year['year'] == '2021']
data_year_5 = data_year[data_year['year'] == '2022']

In [14]:
# year 1 test 
text = data_year_1['meeting_text'].tolist()

In [6]:
def lemmatization(texts, allowed_postags=["NOUN"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []

    for row in range(len(texts)):
        for text in texts:
            doc = nlp(text)
        
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)

lemmatized_texts = lemmatization(text)
print(lemmatized_texts[0][0:20])
#lemmatized_texts

morning lady gentlem


In [7]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)

print (data_words[0][0:20])

['morning', 'lady', 'gentleman', 'conference', 'call', 'result', 'month', 'event', 'influence', 'result', 'recapture', 'charge', 'legacy', 'mortality', 'business', 'result', 'inforce', 'management', 'action', 'loss']


In [8]:
# Bigrams and Trigrams
# words that commonly appear together (and have a different meaning when together, eg. American & army vs American army)
bigram_phrases = gensim.models.Phrases(data_words, min_count=2, threshold=3)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print (data_bigrams_trigrams[0][0:20])

['morning_lady', 'gentleman_conference', 'call_result', 'month_event', 'influence_result', 'recapture_charge', 'legacy_mortality', 'business', 'result_inforce', 'management_action', 'loss', 'line', 'reporting_occasion', 'result', 'tax', 'effect', 'group_income', 'performance', 'business', 'business']


In [9]:
id2word = corpora.Dictionary(data_bigrams_trigrams)

corpus = []
for text in data_bigrams_trigrams:
    new = id2word.doc2bow(text)
    corpus.append(new)

print (corpus[0][0:20])

word = id2word[[0][:1][0]]
print (word)

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 7), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 8)]
account


In [10]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=100,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [11]:
#LDA topic modeling
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 100);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);


In [12]:
# results table 
topic_df = get_lda_topics(lda_model, 10)

# view topic_df
topic_df.head(3)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,portfolio_impairment,year,year,year,portfolio_impairment,year,business,portfolio_impairment,portfolio_impairment,year
1,month_potential,business,business,business,month_potential,business,year,month_potential,month_potential,loss
2,month_period,loss,line,loss,month_period,loss,loss,month_period,month_period,business


In [15]:
# count most common words in each topic
#topic_df.apply(lambda x: x.value_counts().head(1), axis=1)

Unnamed: 0,matrix_target,model_profitability,month,month_event,month_period,month_potential,month_stand,morbidity_business,morning_lady,mortal_portfolio,...,ratio_sic,re_insurance,reason,reason_number,reason_profitability,recapture,recapture_charge,recapture_process,recapture_segment,year
0,,,,,,,,,,,...,,,,,,,,,,5.0
1,,,,,,4.0,,,,,...,,,,,,,,,,
2,,,,,4.0,,,,,,...,,,,,,,,,,
3,,,,4.0,,,,,,,...,,,,,,,,,,
4,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,,...,,,,,,,,,,
96,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# visualisation 
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# vis