In [1]:
import pdftotext
import pandas as pd
import numpy as np
import re
import os

### Define function for "Cleaning" and "participants list"

In [2]:
def cleaning_text(contents):
    ### Cleaning all the unwanted rows in the transcript
    df = pd.DataFrame(contents)

    # remove the unnessary string
    df[0] = df[0].str.replace('\n','')
    df[0] = df[0].str.replace('Bloomberg Transcript','')
    df[0] = df[0].str.replace('\x0c\n','')
    df[0] = df[0].str.replace('FINAL','')
    df[0] = df[0].str.replace('A - ','')
    df[0] = df[0].str.replace('Q - ','')

    # using re to remove the unnessary string
    def drop_unnessary(x):
        page = re.findall(r'Page \d+ of \d+', x) # 'page ... of ... '
        BIO = re.findall(r'{BIO', x) # '{BIO 18731996 <GO>}'
        Company_Name = re.findall(r'Company N ame:', x) # 'Company N ame: H annover Rueck SE'
        Company_Ticker = re.findall(r'Company Ticker:', x) # 'Company Ticker: H N R1 GR Equity'
        Date = re.findall(r'Date:', x) # Date: 2015-03-10
        if page == [] and BIO == [] and Company_Name == [] and Company_Ticker == [] and Date == []:
            return True
        else:
            return False

    true_false = df[0].apply(lambda x: drop_unnessary(x))
    df = df[true_false]

    # drop the final page declaration
    df = df[df[0] != 'This transcript may not be 100 percent accurate and may contain misspellings and other']
    df = df[df[0] != 'inaccuracies. This transcript is provided "as is", without express or implied warranties of']
    df = df[df[0] != 'any kind. Bloomberg retains all rights to this transcript and provides it solely for your']
    df = df[df[0] != 'personal, non-commercial use. Bloomberg, its suppliers and third-party agents shall']
    df = df[df[0] != 'have no liability for errors in this transcript or for lost profits, losses, or direct, indirect,']
    df = df[df[0] != 'incidental, consequential, special or punitive damages in connection with the']
    df = df[df[0] != 'furnishing, performance or use of such transcript. Neither the information nor any']
    df = df[df[0] != 'opinion expressed in this transcript constitutes a solicitation of the purchase or sale of']
    df = df[df[0] != 'securities or commodities. Any opinion expressed in the transcript does not necessarily']
    # df = df[df[0] != 'reflect the views of Bloomberg LP. ¬© COPYRIGHT 2022, BLOOMBERG LP. All rights']  
    df = df[df[0] != 'reserved. Any reproduction, redistribution or retransmission is expressly prohibited.']
    # ¬© could not be identified, would apply re
    def drop_Bloomberg_mark(x):
        Bloomberg_mark = re.findall(r'reflect the views of Bloomberg LP', x) # 'reflect the views of Bloomberg LP. ¬© COPYRIGHT 2022, BLOOMBERG LP. All rights'
        if Bloomberg_mark == []:
            return True
        else:
            return False

    true_false = df[0].apply(lambda x: drop_Bloomberg_mark(x))
    df = df[true_false]

    # drop the empthy row
    df = df[df[0] != '']
    df = df[df[0] != '']

    return df

def participants_list(df):
    # reset the index to make sure the index is continuous for better processing
    df = df.reset_index(drop=True)

    #  'Company Participants' index
    # df.loc[df[0] == 'Company Participants']
    Participant_start_index = df.index[df.iloc[:,0] == 'Company Participants'].tolist()
    #  'Other Participants' index
    # df.loc[df[0] == 'Other Participants']
    Participant_middle_index = df.index[df.iloc[:,0] == 'Other Participants'].tolist()
    #  'MANAGEMENT DISCUSSION SECTION' index, is the beginning of the management discussion, would stop before this row
    # df.loc[df[0] == 'MANAGEMENT DISCUSSION SECTION']
    Participant_end_index = df.index[df.iloc[:,0] == 'MANAGEMENT DISCUSSION SECTION' ].tolist()
    # try to find the 'MANAGEMENT DISCUSSION SECTION' or 'Presentation' index
    if Participant_end_index == []:
        Participant_end_index = df.index[df.iloc[:,0] == 'Presentation'].tolist()

    #print(Participant_start_index, Participant_middle_index, Participant_end_index)

    # make the list of company_paticipants and other_participants
    company_paticipants = df.loc[Participant_start_index[0]+1:Participant_middle_index[0]-1]
    company_paticipants.drop(company_paticipants.index[company_paticipants.iloc[:,0] == ''].tolist(), inplace=True)
    company_paticipants = company_paticipants.values.tolist()

    other_paticipants = df.loc[Participant_middle_index[0]+1:Participant_end_index[0]-1]
    other_paticipants.drop(other_paticipants.index[other_paticipants.iloc[:,0] == ''].tolist(), inplace=True)
    other_paticipants = other_paticipants.values.tolist()

    # print("==========================")
    # print("the company paticipants is: ", company_paticipants)
    # print("==========================")
    # print("the other paticipants is: ", other_paticipants)

    #%%
    # after extract the paticipants, we can drop those information to make the transcript more clear
    df = df.reset_index(drop=True)
    df = df.drop(range(df.index[df.iloc[:,0] == 'Company Participants'].tolist()[0],df.index[df.iloc[:,0].isin(['MANAGEMENT DISCUSSION SECTION','Presentation'])].tolist()[0]+1))

    # drop the first row of the df
    df = df.reset_index(drop=True)
    df = df.iloc[1: , :]


    # reset the index again to make sure the index is continuous for better processing
    df = df.reset_index(drop=True)
    # # save to csv
    # df.to_csv('/Users/timliu/Desktop/output/df.csv')
    return df, company_paticipants, other_paticipants

### Testing on the single company

In [3]:
path = "/Users/hienanh/Documents/GitHub/final_01/Transcript/European (Re)Insurers/HNR1 GY" 
save_path = "/Users/hienanh/Documents/GitHub/final_01/Output"
df = pd.DataFrame()
# create a dataframe with 2500 rows
df_clean_na = pd.DataFrame(np.zeros((2500,1)), columns=['index'])

all_participants = []

files= os.listdir(path) 
for file in files:
    if file.endswith(".pdf"):
        # print(file)
        # Load PDF
        with open(path+"/"+file, "rb") as f:
            pdf = pdftotext.PDF(f)
        # Save all text to a txt file.
        with open(save_path+"/"+file.replace(".pdf", ".txt"), "w") as f:
            f.write("\n\n".join(pdf))
        # open the text file
        with open(save_path+"/"+file.replace(".pdf", ".txt")) as f:
            contents = f.readlines()
            df_clean = cleaning_text(contents)
            # extract all the participants
            df_pure_text,company_paticipants,other_paticipants = participants_list(df_clean)
            all_participants.append(company_paticipants)
            all_participants.append(other_paticipants)
            # using the file name to set as the dataframe's column name
            # df[f"{files.index(file)}"] = df_clean
            df[f"{files[files.index(file)]}"] = df_pure_text
            df_clean_na[f"{files[files.index(file)]}"] = df[f"{files[files.index(file)]}"].dropna(inplace=False).reset_index(drop=True)

# drop the first column of the df
df_clean_na = df_clean_na.iloc[:,1:]
df_clean_na

# save the dataframe
# df_clean_na.to_csv('/Users/timliu/Documents/GitHub/data_collecting/output/test/df_test.csv')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,20211104_Hannover_Rueck_SE-_Earnings_Call_2021-11-4_RT000000002967437630.pdf,20191023_Hannover_Rueck_SE-_Shareholder_Mtg_Call_2019-10-23_SD000000002903050937.pdf,20150506_Hannover_Rueck_SE-_Earnings_Call_2015-5-6_FS000000002212304783.pdf,20160804_Hannover_Rueck_SE-_Earnings_Call_2016-8-4_SD000000002853744569.pdf,20171108_Hannover_Rueck_SE-_Earnings_Call_2017-11-8_SD000000002868833083.pdf,20160310_Hannover_Rueck_SE-_Earnings_Call_2016-3-10_FS000000002259507768.pdf,20201104_Hannover_Rueck_SE-_Earnings_Call_2020-11-4_RT000000002931797209.pdf,20210204_Hannover_Rueck_SE-_M-A_Call_2021-2-4_RT000000002949284264.pdf,20180809_Hannover_Rueck_SE-_Earnings_Call_2018-8-9_SD000000002876144587.pdf,20190507_Hannover_Rueck_SE-_Earnings_Call_2019-5-7_DN000000002633135788.pdf,...,20200205_Hannover_Rueck_SE-_M-A_Call_2020-2-5_DN000000002787035776.pdf,20160510_Hannover_Rueck_SE-_Earnings_Call_2016-5-10_FS000000002275763746.pdf,20161020_Hannover_Rueck_SE-_Guidance_Call_2016-10-20_SD000000002902464788.pdf,20210505_Hannover_Rueck_SE-_Earnings_Call_2021-5-5_DN000000002956339792.pdf,20190205_Hannover_Rueck_SE-_Guidance_Call_2019-2-5_SD000000002901846468.pdf,20181108_Hannover_Rueck_SE-_Earnings_Call_2018-11-8_SD000000002879406671.pdf,20150805_Hannover_Rueck_SE-_Earnings_Call_2015-8-5_FS000000002223534191.pdf,20190307_Hannover_Rueck_SE-_Earnings_Call_2019-3-7_DN000000002597819789.pdf,20200506_Hannover_Rueck_SE-_Earnings_Call_2020-5-6_DN000000002833326951.pdf,20211014_Hannover_Rueck_SE-_Shareholder_Mtg_Call_2021-10-14_SD000000002965861183.pdf
0,Operator,Karl Steinle,Operator,Operator,Ulrich Wallin,Karl Steinle,Operator,Operator,Ulrich Wallin,Operator,...,Operator,Operator,Karl Steinle,Operator,Call,Ulrich Wallin,Operator,Operator,Operator,Karl Steinle
1,"Good morning, ladies and gentlemen. I welcome ...",Well. Good morning to all of you. Welcome to H...,"Good morning, ladies and gentlemen, and welcom...","Good morning, ladies and gentlemen. Welcome to...","Good morning, ladies and gentlemen. I'd like t...","Good afternoon everybody here in Frankfurt, an...","Good morning, ladies and gentlemen. I welcome ...","Good morning, ladies and gentlemen. I welcome ...","Yes. Good morning, ladies and gentlemen. I'd l...","Good morning, ladies and gentlemen. I welcome ...",...,Good morning ladies and gentlemen. I welcome y...,"Good morning, ladies and gentlemen, and welcom...","Good morning, to all of you. Welcome to Hannov...","Good morning, ladies and gentlemen. I welcome ...",Operator,"Good morning, ladies and gentlemen. I'd like t...","Good morning, ladies and gentlemen. I welcome ...","Well, good afternoon to everybody here in Lond...","Good morning, ladies and gentlemen. I welcome ...","Hello. Good morning, to the Hannover Re's Inve..."
2,Conference Call on the Q3 2021 Financial Resul...,really delighted that so many of you were able...,Conference Call on Interim Results 1/2015. For...,conference call on interim results (technical ...,presenting the results for the first nine mont...,via the Internet. Welcome to Hannover Re's Ana...,conference call on the Q3 2020 results. For yo...,"Conference Call on 1st January, 2021, Property...",presenting the results for the first half year...,Conference Call on Q1 2019 Results. For your i...,...,"Conference Call on January 1, 2020 Property an...",Conference Call on Interim Results Q1 2016. Fo...,Karl Steinle. And I'm really delighted that so...,Call on the Q1 2021 Results. For your informat...,"Good morning, ladies and gentlemen. I welcome ...",presenting our results for the first nine mont...,Conference Call on Interim Results Q2 2015. Fo...,Internet. Welcome to Hannover Re's Analyst Con...,Conference Call on Q1 2020 Financial Results. ...,on behalf of the entire management team. Again...
3,"being recorded. At this time, I would like to ...","Steinle. And I'm, among other things, responsi...","recorded. At this time, I would like to hand t...","is being recorded. At this time, I would like ...",Roland Vogel.,see that so many have taken up our invitation....,recorded.,"information, this conference is being recorded...",Roland Vogel.,"recorded. At this time, I would like to hand t...",...,"information, this conference is being recorded.","recorded. At this time, I would like to hand t...",invitation for our 19th edition of this event.,"this time, I would like to hand the call over ...",Conference Call on 1st of January 2019 P&C Tre...,Roland Vogel.,being recorded.,truly a pleasure to see so many of you taking ...,"being recorded. At this time, I would like to ...",pandemic. So we are not broadcasting from Cope...
4,Jean-Jacques Henchoz,comms.,"Wallin, Chief Executive Officer. Please go ahe...","Ulrich Wallin, Chief Executive Officer. Please...","After years of moderate losses, we saw an accu...","the figures for 2015 in greater detail, which ...","At this time, I would like to hand you over to...","over to your host today, Mr.Jean-Jacques Hench...",Our business developed rather favorable in the...,"Wallin, Chief Executive Officer. Please go ahe...",...,"At this time, I would like to hand the call ov...","Wallin, Chief Executive Officer. Please go ahe...",Since we are keeping the annual rotating sched...,"Chief Executive Officer. Please go ahead, sir.",conference is being recorded.,The most significant event that had an influen...,"At this time, I would like to hand the call ov...","interest in Hannover Re. As you know, the key ...",Jean-Jacques Henchoz,Hanover. I'm happy that so many of you are alr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,,,,,,,,,,,...,,,,,,,,,,
2496,,,,,,,,,,,...,,,,,,,,,,
2497,,,,,,,,,,,...,,,,,,,,,,
2498,,,,,,,,,,,...,,,,,,,,,,


### All participants in the single company

In [4]:
# get the value inside the all_participants 
all_participants = [item for sublist in all_participants for item in sublist]
all_participants = [i[0] for i in all_participants]
# print(all_participants)
# %%
# exclude the title of the participants, i.e.'Roland Vogel, CFO' to 'Roland Vogel" by using re
all_participants = [re.sub(r'\,.*', '', participant) for participant in all_participants]
# exclude the 'Property & Casualty Reinsurance'
all_participants = [re.sub(r'Property & Casualty Reinsurance', '', participant) for participant in all_participants]
# exclude the '[0682QB-E Ulrich Wallin]'
all_participants = [re.sub(r'\[0682QB-E Ulrich Wallin\]', '', participant) for participant in all_participants]
# drop duplicated participants
# all_participants = [i[0] for i in all_participants]
# drop the empty string
all_participants = [participant for participant in all_participants if participant != '']
# remove the sapce in the string
all_participants = [participant.strip() for participant in all_participants]
# add the 'Operator' to the list
all_participants.append('Operator')

# drop the duplicated participants
all_participants_copy = all_participants.copy()
all_participants = []
# drop the duplicated participants
for i in all_participants_copy: 
    if i not in all_participants: 
        all_participants.append(i) 

all_participants = sorted(all_participants)

In [5]:
all_participants

['Anasuya Iyer',
 'Andreas Maerkert',
 'Andreas Markert',
 'Andreas MÃ¤rkert',
 'Andreas Schaefer',
 'Andreas Schafer',
 'Andreas SchÃ¤fer',
 'Andreas Schäfer',
 'Andrew Broadfield',
 'Andrew J. Ritchie',
 'Andrew James Ritchie',
 'Andrew Richie',
 'Andrew Ritchie',
 'Andy D. Broadfield',
 'Ashik Musaddi',
 'Ben Cohen',
 'Bill Hawkins',
 'Claude Chevre',
 'Claude Jacques Chevre',
 'Claude Jacques ChÃ¨vre',
 'Clemens Jungsthofel',
 'Daniel Bischof',
 'Darius Satkauskas',
 'Dieter Hein',
 'Eberhard Mueller',
 'Edward Morris',
 'Emanuele Musio',
 'Farooq Hanif',
 'Frank Kopfinger',
 'Guilhem Horvath',
 'Henry Heathfield',
 'Iain Pearce',
 'In-Yong Hwang',
 'Ivan Bokhmat',
 'James Austin Shuck',
 'James R Oram',
 'James Shuck',
 'Janet Van den Berg',
 'Jean-Jacques Hencho',
 'Jean-Jacques Henchoz',
 'Jochen Schmitt',
 'Jonathan Denham',
 'Jonathan Peter Phillip Urwin',
 'Jonathan Urwin',
 'Jonny Urwin',
 'Juergen Graeber',
 'JÃ¼rgen GrÃ¤ber',
 'Kamran Hossain',
 'Karl Steinle',
 'Klaus Mil

In [6]:
new_df = pd.DataFrame()
# identify the len before NaN of each column
for column in df_clean_na.columns:
    # end_index = len(df_clean_na[column])-df_clean_na.isnull().sum(axis = 0)[column]-1
    # identify all the rows in df with all_participants in it
    both_participants_row_index = df_clean_na[df_clean_na[column].isin(all_participants)].index.tolist()
    # # append the end_index to the end of both_participants_row_index
    # both_participants_row_index.append(end_index)
    # apply the both_participants_row_index to the df_clean_na['participants']
    new_df[column] = df_clean_na[column]
    new_df[f"participants_{column}"] = df_clean_na[column].apply(lambda x: x if x in all_participants else np.nan)
    # fill the NaN with the value of the previous row
    new_df[f"participants_{column}"] = new_df[f"participants_{column}"].fillna(method='ffill')
    # # exclude the row if pure_df[column]==pure_df[f"participants_{column}"]
    # pure_df = pure_df[pure_df[column] != pure_df[f"participants_{column}"]]

## Split with the file and add the date

In [7]:
pure_df = pd.DataFrame()
# identify the len before NaN of each column
for column in df_clean_na.columns:
    # exclude the row if pure_df[column]==pure_df[f"participants_{column}"]
    pure_df = new_df[new_df[column] != new_df[f"participants_{column}"]]
# drop the column if the column start with participants
pure_df = pure_df.drop(pure_df.columns[pure_df.columns.str.startswith('participants_')], axis=1).T

# append the text of each roll into one string by using s.str.cat(sep='. ')
pure_df = pure_df.apply(lambda x: x.str.cat(sep='. '), axis=1)
# change the pure_df to dataframe
pure_df = pd.DataFrame(pure_df)
# rename the column
pure_df.columns = ['meeting_text']
# extract the index as column from the text
pure_df['file_name'] = pure_df.index
# extract the date from the index column
pure_df['date'] = pure_df['file_name'].apply(lambda x: x.split('_')[0])
# change the date column to datetime
pure_df['date'] = pd.to_datetime(pure_df['date'])
# reset the index
pure_df = pure_df.reset_index(drop=True)
pure_df

#save the dataframe
# pure_df.to_csv('/Users/timliu/Documents/GitHub/data_collecting/output/test/pure_df.csv')

Unnamed: 0,meeting_text,file_name,date
0,"Good morning, ladies and gentlemen. I welcome ...",20211104_Hannover_Rueck_SE-_Earnings_Call_2021...,2021-11-04
1,Well. Good morning to all of you. Welcome to H...,20191023_Hannover_Rueck_SE-_Shareholder_Mtg_Ca...,2019-10-23
2,"Good morning, ladies and gentlemen, and welcom...",20150506_Hannover_Rueck_SE-_Earnings_Call_2015...,2015-05-06
3,"Good morning, ladies and gentlemen. Welcome to...",20160804_Hannover_Rueck_SE-_Earnings_Call_2016...,2016-08-04
4,"Good morning, ladies and gentlemen. I'd like t...",20171108_Hannover_Rueck_SE-_Earnings_Call_2017...,2017-11-08
5,"Good afternoon everybody here in Frankfurt, an...",20160310_Hannover_Rueck_SE-_Earnings_Call_2016...,2016-03-10
6,"Good morning, ladies and gentlemen. I welcome ...",20201104_Hannover_Rueck_SE-_Earnings_Call_2020...,2020-11-04
7,"Good morning, ladies and gentlemen. I welcome ...",20210204_Hannover_Rueck_SE-_M-A_Call_2021-2-4_...,2021-02-04
8,"Yes. Good morning, ladies and gentlemen. I'd l...",20180809_Hannover_Rueck_SE-_Earnings_Call_2018...,2018-08-09
9,"Good morning, ladies and gentlemen. I welcome ...",20190507_Hannover_Rueck_SE-_Earnings_Call_2019...,2019-05-07


## Testing on the single company

### splitting with paragraph

In [9]:
new_df

Unnamed: 0,20211104_Hannover_Rueck_SE-_Earnings_Call_2021-11-4_RT000000002967437630.pdf,participants_20211104_Hannover_Rueck_SE-_Earnings_Call_2021-11-4_RT000000002967437630.pdf,20191023_Hannover_Rueck_SE-_Shareholder_Mtg_Call_2019-10-23_SD000000002903050937.pdf,participants_20191023_Hannover_Rueck_SE-_Shareholder_Mtg_Call_2019-10-23_SD000000002903050937.pdf,20150506_Hannover_Rueck_SE-_Earnings_Call_2015-5-6_FS000000002212304783.pdf,participants_20150506_Hannover_Rueck_SE-_Earnings_Call_2015-5-6_FS000000002212304783.pdf,20160804_Hannover_Rueck_SE-_Earnings_Call_2016-8-4_SD000000002853744569.pdf,participants_20160804_Hannover_Rueck_SE-_Earnings_Call_2016-8-4_SD000000002853744569.pdf,20171108_Hannover_Rueck_SE-_Earnings_Call_2017-11-8_SD000000002868833083.pdf,participants_20171108_Hannover_Rueck_SE-_Earnings_Call_2017-11-8_SD000000002868833083.pdf,...,20181108_Hannover_Rueck_SE-_Earnings_Call_2018-11-8_SD000000002879406671.pdf,participants_20181108_Hannover_Rueck_SE-_Earnings_Call_2018-11-8_SD000000002879406671.pdf,20150805_Hannover_Rueck_SE-_Earnings_Call_2015-8-5_FS000000002223534191.pdf,participants_20150805_Hannover_Rueck_SE-_Earnings_Call_2015-8-5_FS000000002223534191.pdf,20190307_Hannover_Rueck_SE-_Earnings_Call_2019-3-7_DN000000002597819789.pdf,participants_20190307_Hannover_Rueck_SE-_Earnings_Call_2019-3-7_DN000000002597819789.pdf,20200506_Hannover_Rueck_SE-_Earnings_Call_2020-5-6_DN000000002833326951.pdf,participants_20200506_Hannover_Rueck_SE-_Earnings_Call_2020-5-6_DN000000002833326951.pdf,20211014_Hannover_Rueck_SE-_Shareholder_Mtg_Call_2021-10-14_SD000000002965861183.pdf,participants_20211014_Hannover_Rueck_SE-_Shareholder_Mtg_Call_2021-10-14_SD000000002965861183.pdf
0,Operator,Operator,Karl Steinle,Karl Steinle,Operator,Operator,Operator,Operator,Ulrich Wallin,Ulrich Wallin,...,Ulrich Wallin,Ulrich Wallin,Operator,Operator,Operator,Operator,Operator,Operator,Karl Steinle,Karl Steinle
1,"Good morning, ladies and gentlemen. I welcome ...",Operator,Well. Good morning to all of you. Welcome to H...,Karl Steinle,"Good morning, ladies and gentlemen, and welcom...",Operator,"Good morning, ladies and gentlemen. Welcome to...",Operator,"Good morning, ladies and gentlemen. I'd like t...",Ulrich Wallin,...,"Good morning, ladies and gentlemen. I'd like t...",Ulrich Wallin,"Good morning, ladies and gentlemen. I welcome ...",Operator,"Well, good afternoon to everybody here in Lond...",Operator,"Good morning, ladies and gentlemen. I welcome ...",Operator,"Hello. Good morning, to the Hannover Re's Inve...",Karl Steinle
2,Conference Call on the Q3 2021 Financial Resul...,Operator,really delighted that so many of you were able...,Karl Steinle,Conference Call on Interim Results 1/2015. For...,Operator,conference call on interim results (technical ...,Operator,presenting the results for the first nine mont...,Ulrich Wallin,...,presenting our results for the first nine mont...,Ulrich Wallin,Conference Call on Interim Results Q2 2015. Fo...,Operator,Internet. Welcome to Hannover Re's Analyst Con...,Operator,Conference Call on Q1 2020 Financial Results. ...,Operator,on behalf of the entire management team. Again...,Karl Steinle
3,"being recorded. At this time, I would like to ...",Operator,"Steinle. And I'm, among other things, responsi...",Karl Steinle,"recorded. At this time, I would like to hand t...",Operator,"is being recorded. At this time, I would like ...",Operator,Roland Vogel.,Ulrich Wallin,...,Roland Vogel.,Ulrich Wallin,being recorded.,Operator,truly a pleasure to see so many of you taking ...,Operator,"being recorded. At this time, I would like to ...",Operator,pandemic. So we are not broadcasting from Cope...,Karl Steinle
4,Jean-Jacques Henchoz,Jean-Jacques Henchoz,comms.,Karl Steinle,"Wallin, Chief Executive Officer. Please go ahe...",Operator,"Ulrich Wallin, Chief Executive Officer. Please...",Operator,"After years of moderate losses, we saw an accu...",Ulrich Wallin,...,The most significant event that had an influen...,Ulrich Wallin,"At this time, I would like to hand the call ov...",Operator,"interest in Hannover Re. As you know, the key ...",Operator,Jean-Jacques Henchoz,Jean-Jacques Henchoz,Hanover. I'm happy that so many of you are alr...,Karl Steinle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,,Operator,,Roland Helmut Vogel,,Operator,,Roland Vogel,,Ulrich Wallin,...,,Ulrich Wallin,,Ulrich Wallin,,Ulrich Wallin,,Operator,,Karl Steinle
2496,,Operator,,Roland Helmut Vogel,,Operator,,Roland Vogel,,Ulrich Wallin,...,,Ulrich Wallin,,Ulrich Wallin,,Ulrich Wallin,,Operator,,Karl Steinle
2497,,Operator,,Roland Helmut Vogel,,Operator,,Roland Vogel,,Ulrich Wallin,...,,Ulrich Wallin,,Ulrich Wallin,,Ulrich Wallin,,Operator,,Karl Steinle
2498,,Operator,,Roland Helmut Vogel,,Operator,,Roland Vogel,,Ulrich Wallin,...,,Ulrich Wallin,,Ulrich Wallin,,Ulrich Wallin,,Operator,,Karl Steinle


In [61]:
paragrapg_df = new_df.iloc[:,:2]
# drop the NaN
paragrapg_df = paragrapg_df.dropna(inplace=False)
#rename the second column to 'participants', first column to 'text'
paragrapg_df.columns = ['text', 'participants']
# if the text is empty, drop it
paragrapg_df = paragrapg_df[paragrapg_df['text'] != '']
# reset the index
paragrapg_df = paragrapg_df.reset_index(drop=True)
# if 'text' == 'participants', get the index of the row
paragrapg_index = paragrapg_df[paragrapg_df['text'] == paragrapg_df['participants']].index.tolist()

# +1 for every value in paragrapg_index
start_paragrapg_index = []
for i in range(len(paragrapg_index)):
    start_paragrapg_index.append(paragrapg_index[i]+1)
# disregard the last value in the list
start_paragrapg_index = start_paragrapg_index[:-1]
print(len(start_paragrapg_index))
# -1 for every value in paragrapg_index
end_paragrapg_index = []
for i in range(len(paragrapg_index)):
    end_paragrapg_index.append(paragrapg_index[i])
# disregard the first value in the list
end_paragrapg_index = end_paragrapg_index[1:]
print(len(end_paragrapg_index))

# extracct the text of the paragrapg_df between end_paragrapg_index and start_paragrapg_index
paragraph_split_df = pd.DataFrame()
for i in range(len(start_paragrapg_index)):
    paragraph = paragrapg_df.iloc[start_paragrapg_index[i]:end_paragrapg_index[i]]
    # merge the paragraph to one cell 
    paragraph_text = paragraph.apply(''.join).to_frame().T
    # appemd the paragraph_text['text'].iloc[:,0] to the paragraph_split_df
    paragraph_split_df = paragraph_split_df.append(paragraph_text, ignore_index=True)

# look up for paragrapg_df['participants'] from the start_paragrapg_index
participants = paragrapg_df.iloc[start_paragrapg_index,1].to_frame()
# reset the index
participants = participants.reset_index(drop=True)

paragraph_split_df['participants'] = participants
paragraph_split_df

65
65


Unnamed: 0,text,participants
0,"Good morning, ladies and gentlemen. I welcome ...",Operator
1,"Well, good morning, everyone and welcome to ou...",Jean-Jacques Henchoz
2,"Thank you very much, Clemens. On the next slid...",Jean-Jacques Henchoz
3,"(Question And Answer)Ladies and gentlemen, we ...",Operator
4,"Hi. Good morning, everyone. Could I just dig i...",Andrew Ritchie
...,...,...
60,Yes. Two quick questions. The first one would ...,Thomas Fossard
61,"And on the NatCat, so as Jean-Jacques explaine...",Klaus Miller
62,Thank you. Thank you.,Thomas Fossard
63,And there are no further questions at this poi...,Operator


### splitting with participants (a bit useless)

### splitting with sentences 
and will have the participants_split_df split the text by '.'

In [62]:
# write a function to split the text by '.' in paragraph_split_df
p_to_s_split_df = paragraph_split_df.copy()

def split_text(text):
    text = text.split(". ")
    return text
# apply the function to the paragraph_split_df
p_to_s_split_df['text'] = p_to_s_split_df['text'].apply(lambda x: split_text(x))
p_to_s_split_df

Unnamed: 0,text,participants
0,"[Good morning, ladies and gentlemen, I welcome...",Operator
1,"[Well, good morning, everyone and welcome to o...",Jean-Jacques Henchoz
2,"[Thank you very much, Clemens, On the next sli...",Jean-Jacques Henchoz
3,"[(Question And Answer)Ladies and gentlemen, we...",Operator
4,"[Hi, Good morning, everyone, Could I just dig ...",Andrew Ritchie
...,...,...
60,"[Yes, Two quick questions, The first one would...",Thomas Fossard
61,"[And on the NatCat, so as Jean-Jacques explain...",Klaus Miller
62,"[Thank you, Thank you.]",Thomas Fossard
63,[And there are no further questions at this po...,Operator


In [13]:
sentence_split_df = pd.DataFrame()
for i in range(len(p_to_s_split_df)):
    sentence_list = p_to_s_split_df['text'].iloc[i]
    sentence_split_single_df = pd.DataFrame (sentence_list, columns = ['sentence'])
    sentence_split_single_df['participants'] = p_to_s_split_df['participants'].iloc[i]
    sentence_split_single_df['paragraph'] = i
    sentence_split_df = sentence_split_df.append(sentence_split_single_df, ignore_index=True)
# drop if the 'sentence' is empty
sentence_split_df = sentence_split_df.dropna(inplace=False)

sentence_split_df

# safe the dataframe with the path
#path = '/Users/timliu/Documents/GitHub/data_collecting/df_for_NLP/sentence_split_df.csv'
#sentence_split_df.to_csv(path)
#sentence_split_df

Unnamed: 0,sentence,participants,paragraph
0,"Good morning, ladies and gentlemen",Operator,0
1,I welcome you to today's Hannover Re Internati...,Operator,0
2,"For your information, this conference isbeing ...",Operator,0
3,"At this time, I would like to hand the call ov...",Operator,0
4,"Please go ahead, sir.",Operator,0
...,...,...,...
437,You've seen a year impacted by large losses on...,Jean-Jacques Henchoz,64
438,And I think the key message is that the profit...,Jean-Jacques Henchoz,64
439,The guidance for '22 shows boththe growth traj...,Jean-Jacques Henchoz,64
440,"So, I think thekey messages were addressed today",Jean-Jacques Henchoz,64


## sentiment analysis

In [14]:
#%% # snetiment analysis
import numpy as np
import pandas as pd

import re
import string 

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

import nltk 
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords          # module for stop words that come with NLTK
nltk.download('stopwords')
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# twitter_samples.fileids()
# documents
docs_negative = [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")]
docs_positive = [(t, "pos") for t in twitter_samples.strings("positive_tweets.json")]
print("==========================================================")
print(f'There are {len(docs_negative)} negative sentences.')
print(f'There are {len(docs_positive)} positive sentences.')

# spliting dataset 
train_set = docs_negative[:3500] + docs_positive[:3500]
test_set = docs_negative[3500:4250] + docs_positive[3500:4250]
valid_set = docs_negative[4250:] + docs_positive[4250:]

# clean text
def process_text(text):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    #text = text.str
    text = str(text)
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'#', '', text)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    text_tokens = tokenizer.tokenize(text)

    text_clean = []
    for word in text_tokens:
        if (word not in stopwords_english and  
                word not in string.punctuation): 
            stem_word = stemmer.stem(word)  # stemming word
            text_clean.append(stem_word)
            
    sentence = ' '.join(text_clean)
    
    return sentence

# categorical label
def cat_label(label):
    if label == 'neg':
        value = -1
    elif label == 'pos':
        value = 1
    return value 

# split for x and y 
def xy(dataset):
    df = pd.DataFrame(dataset, columns = ['text', 'label'])
    df['text_clean'] = df['text'].apply(lambda r: process_text(r))
    #df['categorical_label'] = df.label.factorize()[0]
    df['categorical_label'] = df['label'].apply(lambda r: cat_label(r))

    x = df.text_clean
    y = df.categorical_label

    return x, y

# dataframe
x_train, y_train = xy(train_set)
x_test, y_test = xy(test_set)
x_valid, y_valid = xy(valid_set)

## using the naive bayes classifier
model = Pipeline([
    ('bow',CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
print("==========================================================")
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))
print(accuracy_score(y_pred,y_test))

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/hienanh/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hienanh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


There are 5000 negative sentences.
There are 5000 positive sentences.
[[593 235]
 [157 515]]
              precision    recall  f1-score   support

          -1       0.79      0.72      0.75       828
           1       0.69      0.77      0.72       672

    accuracy                           0.74      1500
   macro avg       0.74      0.74      0.74      1500
weighted avg       0.74      0.74      0.74      1500

0.7386666666666667


In [15]:
# Apply into earnings call sentence
# import dataset
#path = '/Users/timliu/Documents/GitHub/data_collecting/df_for_NLP/sentence_split_df.csv'
#df_sentence = pd.read_csv(path)
# df_sentence.head()
df_sentence = sentence_split_df.copy()

# drop participant columns as we dont need it
# df_sentence = df_sentence.drop(['participants'], axis=1)

# check NaN values
print("==========================================================")
print(df_sentence.isnull().sum())

# delete NaN rows
df_sentence = df_sentence.dropna()  

# clean text for sentiment analysis
df_sentence['text_clean'] = df_sentence['sentence'].apply(lambda r: process_text(r))
# df_sentence.head(5)

# making prediction
prediction = model.predict(df_sentence.text_clean)
prediction_label = np.array(['positive' if p==1 else 'negative' for p in prediction])
df_sentence['prediction_label'] = prediction_label
df_sentence['sentiment'] = prediction
# df_sentence.head()

print("==========================================================")
print(Counter(df_sentence['prediction_label']))

# df_sentence left columns with only 'sentence','participants','sentiment_score'
df_sentence = df_sentence[['sentence','participants','sentiment']]
df_sentence

sentence        0
participants    0
paragraph       0
dtype: int64
Counter({'positive': 296, 'negative': 146})


Unnamed: 0,sentence,participants,sentiment
0,"Good morning, ladies and gentlemen",Operator,1
1,I welcome you to today's Hannover Re Internati...,Operator,1
2,"For your information, this conference isbeing ...",Operator,-1
3,"At this time, I would like to hand the call ov...",Operator,-1
4,"Please go ahead, sir.",Operator,-1
...,...,...,...
437,You've seen a year impacted by large losses on...,Jean-Jacques Henchoz,1
438,And I think the key message is that the profit...,Jean-Jacques Henchoz,-1
439,The guidance for '22 shows boththe growth traj...,Jean-Jacques Henchoz,1
440,"So, I think thekey messages were addressed today",Jean-Jacques Henchoz,1


### Calculate by weighted

In [16]:
#take a copy
length_sentence = sentence_split_df.copy()
# calculate length of each sentence
length_sentence['length_of_sentence'] = length_sentence['sentence'].apply(lambda r: len(r))
# calculate length by paragraph
length_para = length_sentence.groupby(['paragraph']).sum().reset_index(drop=False)
length_para.rename(columns={'length_of_sentence': 'length_of_para'}, inplace=True)
# merged into 1 datafram
cal_sentiment = pd.merge(length_sentence, length_para, on=["paragraph"])
cal_sentiment['sentiment']=df_sentence['sentiment']
# calculate weighted 
cal_sentiment['sentiment_by_weighted']=cal_sentiment['length_of_sentence']/cal_sentiment['length_of_para']*cal_sentiment['sentiment']
# merged sentiment columnes
#cal_sentiment['sentiment']=df_sentence['sentiment']
#cal_sentiment['sentiment_by_weighted']=cal_sentiment['sentiment']*cal_sentiment['weighted']
cal_sentiment

Unnamed: 0,sentence,participants,paragraph,length_of_sentence,length_of_para,sentiment,sentiment_by_weighted
0,"Good morning, ladies and gentlemen",Operator,0,34,323,1,0.105263
1,I welcome you to today's Hannover Re Internati...,Operator,0,98,323,1,0.303406
2,"For your information, this conference isbeing ...",Operator,0,54,323,-1,-0.167183
3,"At this time, I would like to hand the call ov...",Operator,0,116,323,-1,-0.359133
4,"Please go ahead, sir.",Operator,0,21,323,-1,-0.065015
...,...,...,...,...,...,...,...
437,You've seen a year impacted by large losses on...,Jean-Jacques Henchoz,64,145,594,1,0.244108
438,And I think the key message is that the profit...,Jean-Jacques Henchoz,64,120,594,-1,-0.202020
439,The guidance for '22 shows boththe growth traj...,Jean-Jacques Henchoz,64,104,594,1,0.175084
440,"So, I think thekey messages were addressed today",Jean-Jacques Henchoz,64,48,594,1,0.080808


In [17]:
score_df = cal_sentiment[['paragraph','sentiment_by_weighted']]
score_list = np.array(score_df.groupby(['paragraph']).sum().reset_index(drop=True).values)
score_list

array([[-0.18266254],
       [ 0.48681948],
       [ 0.63472222],
       [ 0.84615385],
       [ 0.46130653],
       [ 1.        ],
       [-0.42884615],
       [ 0.23848746],
       [ 1.        ],
       [ 1.        ],
       [ 1.        ],
       [-0.09345794],
       [ 1.        ],
       [ 1.        ],
       [ 0.65909091],
       [-0.11441308],
       [ 0.82532239],
       [ 0.824     ],
       [ 0.42857143],
       [-0.55122951],
       [ 0.52941176],
       [ 0.64705882],
       [-0.6445993 ],
       [-0.03591682],
       [ 0.05607477],
       [-0.16953317],
       [-0.35892514],
       [-0.37084399],
       [ 0.04347826],
       [-0.77777778],
       [ 0.30024814],
       [ 1.        ],
       [ 1.        ],
       [ 0.63414634],
       [ 0.48538682],
       [-0.68553459],
       [ 1.        ],
       [-1.        ],
       [ 0.65909091],
       [ 0.6642984 ],
       [ 1.        ],
       [ 0.37900552],
       [-1.        ],
       [ 0.61190612],
       [ 1.        ],
       [ 0

In [18]:
# add the score_list into paragraph_split_df['sentiment_score']
paragraph_split_df = paragraph_split_df.assign(sentiment_score=score_list)
paragraph_split_df


Unnamed: 0,text,participants,sentiment_score
0,"Good morning, ladies and gentlemen. I welcome ...",Operator,-0.182663
1,"Well, good morning, everyone and welcome to ou...",Jean-Jacques Henchoz,0.486819
2,"Thank you very much, Clemens. On the next slid...",Jean-Jacques Henchoz,0.634722
3,"(Question And Answer)Ladies and gentlemen, we ...",Operator,0.846154
4,"Hi. Good morning, everyone. Could I just dig i...",Andrew Ritchie,0.461307
...,...,...,...
60,Yes. Two quick questions. The first one would ...,Thomas Fossard,1.000000
61,"And on the NatCat, so as Jean-Jacques explaine...",Klaus Miller,0.656716
62,Thank you. Thank you.,Thomas Fossard,1.000000
63,And there are no further questions at this poi...,Operator,-0.030303


### add the date

In [19]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/hienanh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hienanh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
extract_date = new_df.iloc[:,:2]
# get the column name from extract_date
date_extract = extract_date.columns.to_list()
date_extract[0]

# date_extract[0] split by '_'
date = date_extract[0].split('_')[0]
paragraph_split_df['meeting_date'] = pd.to_datetime(date)
paragraph_split_df

Unnamed: 0,text,participants,sentiment_score,meeting_date
0,"Good morning, ladies and gentlemen. I welcome ...",Operator,-0.182663,2021-11-04
1,"Well, good morning, everyone and welcome to ou...",Jean-Jacques Henchoz,0.486819,2021-11-04
2,"Thank you very much, Clemens. On the next slid...",Jean-Jacques Henchoz,0.634722,2021-11-04
3,"(Question And Answer)Ladies and gentlemen, we ...",Operator,0.846154,2021-11-04
4,"Hi. Good morning, everyone. Could I just dig i...",Andrew Ritchie,0.461307,2021-11-04
...,...,...,...,...
60,Yes. Two quick questions. The first one would ...,Thomas Fossard,1.000000,2021-11-04
61,"And on the NatCat, so as Jean-Jacques explaine...",Klaus Miller,0.656716,2021-11-04
62,Thank you. Thank you.,Thomas Fossard,1.000000,2021-11-04
63,And there are no further questions at this poi...,Operator,-0.030303,2021-11-04


In [21]:
### add the past, future and present to the dataframe
from nltk import word_tokenize, pos_tag
# import nltk
# nltk.download()
def determine_tense_input(sentence):
    text = word_tokenize(sentence)
    tagged = pos_tag(text)

    tense = {}
    tense["future"] = len([word for word in tagged if word[1] == "MD"])
    tense["present"] = len([word for word in tagged if word[1] in ["VBP", "VBZ","VBG"]])
    tense["past"] = len([word for word in tagged if word[1] in ["VBD", "VBN"]]) 
    return(tense)

# %%
# apply the function to the paragraph_split_df
paragraph_split_df['tense'] = paragraph_split_df['text'].apply(lambda x: determine_tense_input(x))
# decode the paragraph_split_df['tense'] to different column
paragraph_split_df['tense_future'] = paragraph_split_df['tense'].apply(lambda x: x['future'])
paragraph_split_df['tense_present'] = paragraph_split_df['tense'].apply(lambda x: x['present'])
paragraph_split_df['tense_past'] = paragraph_split_df['tense'].apply(lambda x: x['past'])

# if the tense_future > tense_present > tense_past, then the tense is future
# else if the tense_present > tense_future > tense_past, then the tense is present
# else if the tense_past > tense_future > tense_present, then the tense is past
paragraph_split_df['tense'] = paragraph_split_df.apply(lambda x: 'future' if x['tense_future'] > x['tense_present'] > x['tense_past'] else 'present' if x['tense_present'] > x['tense_future'] > x['tense_past'] else 'past', axis=1)
# else if the tense_past = tense_future = tense_present, then the tense is unknown
paragraph_split_df['tense'] = paragraph_split_df.apply(lambda x: 'unknown' if x['tense_past'] == x['tense_future'] == x['tense_present'] else x['tense'], axis=1)
# disregard the tense_future, tense_present, and tense_past
paragraph_split_df = paragraph_split_df.drop(columns=['tense_future', 'tense_present', 'tense_past'])

# %%
# count the amount of rowa of paragraph_split_df['tense']=='unknown'
unknown_sentence_count = paragraph_split_df[paragraph_split_df['tense'] == 'unknown'].shape[0]
# count the amount of rowa of paragraph_split_df['tense']=='future'
future_sentence_count = paragraph_split_df[paragraph_split_df['tense'] == 'future'].shape[0]
# count the amount of rowa of paragraph_split_df['tense']=='present'
present_sentence_count = paragraph_split_df[paragraph_split_df['tense'] == 'present'].shape[0]
# count the amount of rowa of paragraph_split_df['tense']=='past'
past_sentence_count = paragraph_split_df[paragraph_split_df['tense'] == 'past'].shape[0]

# print the result
print(f'unknown_para_count: {unknown_sentence_count}')
print(f'future_sentence_count: {future_sentence_count}')
print(f'present_sentence_count: {present_sentence_count}')
print(f'past_sentence_count: {past_sentence_count}')

paragraph_split_df

unknown_para_count: 14
future_sentence_count: 1
present_sentence_count: 3
past_sentence_count: 47


Unnamed: 0,text,participants,sentiment_score,meeting_date,tense
0,"Good morning, ladies and gentlemen. I welcome ...",Operator,-0.182663,2021-11-04,past
1,"Well, good morning, everyone and welcome to ou...",Jean-Jacques Henchoz,0.486819,2021-11-04,past
2,"Thank you very much, Clemens. On the next slid...",Jean-Jacques Henchoz,0.634722,2021-11-04,past
3,"(Question And Answer)Ladies and gentlemen, we ...",Operator,0.846154,2021-11-04,present
4,"Hi. Good morning, everyone. Could I just dig i...",Andrew Ritchie,0.461307,2021-11-04,past
...,...,...,...,...,...
60,Yes. Two quick questions. The first one would ...,Thomas Fossard,1.000000,2021-11-04,past
61,"And on the NatCat, so as Jean-Jacques explaine...",Klaus Miller,0.656716,2021-11-04,past
62,Thank you. Thank you.,Thomas Fossard,1.000000,2021-11-04,unknown
63,And there are no further questions at this poi...,Operator,-0.030303,2021-11-04,past


## Topic moedelling

In [22]:
import numpy as np
import pandas as pd 

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [23]:
data_topic = paragraph_split_df[['text','participants']]
data_topic # each line is a paragraph 

Unnamed: 0,text,participants
0,"Good morning, ladies and gentlemen. I welcome ...",Operator
1,"Well, good morning, everyone and welcome to ou...",Jean-Jacques Henchoz
2,"Thank you very much, Clemens. On the next slid...",Jean-Jacques Henchoz
3,"(Question And Answer)Ladies and gentlemen, we ...",Operator
4,"Hi. Good morning, everyone. Could I just dig i...",Andrew Ritchie
...,...,...
60,Yes. Two quick questions. The first one would ...,Thomas Fossard
61,"And on the NatCat, so as Jean-Jacques explaine...",Klaus Miller
62,Thank you. Thank you.,Thomas Fossard
63,And there are no further questions at this poi...,Operator


In [24]:
def lemmatization(texts, allowed_postags=["NOUN"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []

    for row in range(len(data_topic)):
        for text in texts:
            doc = nlp(data_topic.loc[row,"text"])
        
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)

lemmatized_texts = lemmatization(data_topic)
lemmatized_texts[0]
#lemmatized_texts

'morning lady gentleman today information conference isbeing time call host today sir'

In [25]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)

print (data_words[0][0:20])

['morning', 'lady', 'gentleman', 'today', 'information', 'conference', 'isbeing', 'time', 'call', 'host', 'today', 'sir']


In [26]:
id2word = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

print (corpus[0][0:20])

word = id2word[[0][:1][0]]
print (word)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2)]
call


### LDA 

In [27]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

### WiP - Coherence Values 

In [28]:
#LDA topic modeling
def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 100);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

In [29]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=lemmatized_texts, start=2, limit=20, step=6)

  numerator = (co_occur_count / num_docs) + EPSILON
  denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
  co_doc_prob = co_occur_count / num_docs


### Results - table

In [30]:
topic_df = get_lda_topics(lda_model, 10)

In [31]:
topic_df

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,year,growth,year,result,year,year,year,growth,question,premium
1,loss,month,fund,number,number,life,loss,year,covid,impact
2,underwriting,business,business,profitability,effect,calculation,business,loss,line,question
3,ratio,number,investment,eur,question,cover,growth,budget,claim,number
4,difference,year,quarter,side,bit,growth,portfolio,question,bit,business
...,...,...,...,...,...,...,...,...,...,...
95,future,riot,interest,guidance,market,eur,target,nature,presentation,group
96,income,profit,comment,line,profit,lot,morbidity,theprecise,isdeposit,line
97,regard,thesetreatie,look,bond,asset,way,movement,tochange,rate,asset
98,model,highlight,thatthere,reinsurance,longevity,accounting,population,everyelement,andnobody,gain


### Visualize topic

In [32]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


## Match topic and paragraph

In [33]:
data_topic_match = paragraph_split_df[['text','sentiment_score']].copy()
data_topic_match['para_lemma'] = data_words
data_topic_match

Unnamed: 0,text,sentiment_score,para_lemma
0,"Good morning, ladies and gentlemen. I welcome ...",-0.182663,"[morning, lady, gentleman, today, information,..."
1,"Well, good morning, everyone and welcome to ou...",0.486819,"[morning, welcome, conference, call, resultsof..."
2,"Thank you very much, Clemens. On the next slid...",0.634722,"[slide, target, metric, profitability, target,..."
3,"(Question And Answer)Ladies and gentlemen, we ...",0.846154,"[question, answer, ladie, gentleman, question,..."
4,"Hi. Good morning, everyone. Could I just dig i...",0.461307,"[morning, bit, catbudget, man, andnatcat, spli..."
...,...,...,...
60,Yes. Two quick questions. The first one would ...,1.000000,"[question, one, life, longevity, iguess, ebit,..."
61,"And on the NatCat, so as Jean-Jacques explaine...",0.656716,"[portfolio, profitability, hurdle, rate, prici..."
62,Thank you. Thank you.,1.000000,[]
63,And there are no further questions at this poi...,-0.030303,"[question, point, speaker, remark]"


In [34]:
# list of columns name as key 
col_name = topic_df.columns

In [35]:
def create_topic_dictionary(topic_df):
    # create dictionary
    topic_dict = {}
    for col in col_name:
        topic_dict[col] = list(topic_df[col])
    
    return topic_dict

topic_dictionary = create_topic_dictionary(topic_df)
#topic_dictssssss

In [36]:
def count_topic(para_lemma,topic_dict):
    count = 0
    for words in para_lemma:
        if words in list(topic_dict):
            count += 1
        else:
            continue 
    return count

In [37]:
for col in col_name:
    data_topic_match[col] = data_topic_match['para_lemma'].apply(lambda r: count_topic(r,topic_dictionary[col]))

In [38]:
# find the columns contains maximum values of each row
final_topic = data_topic_match[col_name].idxmax(axis = 1)
data_topic_match['final_topic'] = final_topic
data_topic_match

Unnamed: 0,text,sentiment_score,para_lemma,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10,final_topic
0,"Good morning, ladies and gentlemen. I welcome ...",-0.182663,"[morning, lady, gentleman, today, information,...",1,1,1,3,2,12,3,5,2,1,Topic # 06
1,"Well, good morning, everyone and welcome to ou...",0.486819,"[morning, welcome, conference, call, resultsof...",295,383,328,290,304,255,385,307,291,311,Topic # 07
2,"Thank you very much, Clemens. On the next slid...",0.634722,"[slide, target, metric, profitability, target,...",73,103,137,90,84,93,103,129,112,100,Topic # 03
3,"(Question And Answer)Ladies and gentlemen, we ...",0.846154,"[question, answer, ladie, gentleman, question,...",1,4,4,4,4,5,4,6,4,5,Topic # 08
4,"Hi. Good morning, everyone. Could I just dig i...",0.461307,"[morning, bit, catbudget, man, andnatcat, spli...",11,12,9,12,11,16,13,32,12,12,Topic # 08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,Yes. Two quick questions. The first one would ...,1.000000,"[question, one, life, longevity, iguess, ebit,...",9,16,13,17,16,15,18,18,18,15,Topic # 07
61,"And on the NatCat, so as Jean-Jacques explaine...",0.656716,"[portfolio, profitability, hurdle, rate, prici...",16,20,16,32,14,23,19,18,19,14,Topic # 04
62,Thank you. Thank you.,1.000000,[],0,0,0,0,0,0,0,0,0,0,Topic # 01
63,And there are no further questions at this poi...,-0.030303,"[question, point, speaker, remark]",0,2,2,2,1,2,2,1,4,1,Topic # 09


In [39]:
df_sentiment_of_topic = data_topic_match[['sentiment_score','final_topic']]
df_sentiment_of_topic = df_sentiment_of_topic.groupby(['final_topic']).mean()
df_sentiment_of_topic.T
#score_list = list(df_sentiment_of_topic.sentiment_score)
#score_list

final_topic,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
sentiment_score,0.477182,0.667655,0.153348,0.145042,0.277224,0.263043,0.330346,0.476481,-0.028592,0.134245


## Test 1 company - several transcript

In [55]:
# count total number of columns
number_of_files = int(len(new_df.columns)/2) ##37

# taking start and end index of columns, divided by files
start_index = []
end_index = []
for n in range(number_of_files+1):
    if n % 2 == 0:
        start_index.append(n)
    else:
        end_index.append(n+1)
print(start_index)
print(end_index)

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36]
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38]


In [58]:
def splitting_para(new_df,start, stop):
    paragrapg_df = new_df.iloc[:,start:stop]
    # drop the NaN
    paragrapg_df = paragrapg_df.dropna(inplace=False)
    #rename the second column to 'participants', first column to 'text'
    paragrapg_df.columns = ['text', 'participants']
    # if the text is empty, drop it
    paragrapg_df = paragrapg_df[paragrapg_df['text'] != '']
    # reset the index
    paragrapg_df = paragrapg_df.reset_index(drop=True)
    # if 'text' == 'participants', get the index of the row
    paragrapg_index = paragrapg_df[paragrapg_df['text'] == paragrapg_df['participants']].index.tolist()

    # +1 for every value in paragrapg_index
    start_paragrapg_index = []
    for i in range(len(paragrapg_index)):
        start_paragrapg_index.append(paragrapg_index[i]+1)
    # disregard the last value in the list
    start_paragrapg_index = start_paragrapg_index[:-1]
    #print(len(start_paragrapg_index))
    # -1 for every value in paragrapg_index
    end_paragrapg_index = []
    for i in range(len(paragrapg_index)):
        end_paragrapg_index.append(paragrapg_index[i])
    # disregard the first value in the list
    end_paragrapg_index = end_paragrapg_index[1:]
    #print(len(end_paragrapg_index))

    # extracct the text of the paragrapg_df between end_paragrapg_index and start_paragrapg_index
    paragraph_split_df = pd.DataFrame()
    for i in range(len(start_paragrapg_index)):
        paragraph = paragrapg_df.iloc[start_paragrapg_index[i]:end_paragrapg_index[i]]
        # merge the paragraph to one cell 
        paragraph_text = paragraph.apply(''.join).to_frame().T
        # appemd the paragraph_text['text'].iloc[:,0] to the paragraph_split_df
        paragraph_split_df = paragraph_split_df.append(paragraph_text, ignore_index=True)

    # look up for paragrapg_df['participants'] from the start_paragrapg_index
    participants = paragrapg_df.iloc[start_paragrapg_index,1].to_frame()
    # reset the index
    participants = participants.reset_index(drop=True)

    paragraph_split_df['participants'] = participants
    
    return paragraph_split_df

In [66]:
def splitting_sentence(paragraph_split_df):
    # write a function to split the text by '.' in paragraph_split_df
    p_to_s_split_df = paragraph_split_df.copy()

    def split_text(text):
        text = text.split(". ")
        return text
    
    # apply the function to the paragraph_split_df
    p_to_s_split_df['text'] = p_to_s_split_df['text'].apply(lambda x: split_text(x))
    #p_to_s_split_df
    
    # split by sentence
    sentence_split_df = pd.DataFrame()
    for i in range(len(p_to_s_split_df)):
        sentence_list = p_to_s_split_df['text'].iloc[i]
        sentence_split_single_df = pd.DataFrame (sentence_list, columns = ['sentence'])
        sentence_split_single_df['participants'] = p_to_s_split_df['participants'].iloc[i]
        sentence_split_single_df['paragraph'] = i
        sentence_split_df = sentence_split_df.append(sentence_split_single_df, ignore_index=True)
    # drop if the 'sentence' is empty
    sentence_split_df = sentence_split_df.dropna(inplace=False)

    sentence_split_df
    
    return sentence_split_df

In [69]:
def apply_sentiment(sentence_split_df):
    df_sentence = sentence_split_df.copy()
    # delete NaN rows
    df_sentence = df_sentence.dropna()  

    # clean text for sentiment analysis
    df_sentence['text_clean'] = df_sentence['sentence'].apply(lambda r: process_text(r))
    # df_sentence.head(5)

    # making prediction
    prediction = model.predict(df_sentence.text_clean)
    prediction_label = np.array(['positive' if p==1 else 'negative' for p in prediction])
    df_sentence['prediction_label'] = prediction_label
    df_sentence['sentiment'] = prediction
    # df_sentence.head()

    #print("==========================================================")
    #print(Counter(df_sentence['prediction_label']))

    # df_sentence left columns with only 'sentence','participants','sentiment_score'
    df_sentence = df_sentence[['sentence','participants','sentiment']]
    
    return df_sentence

In [71]:
def calculate_weighted_average(sentence_split_df):
    #take a copy
    length_sentence = sentence_split_df.copy()
    # calculate length of each sentence
    length_sentence['length_of_sentence'] = length_sentence['sentence'].apply(lambda r: len(r))
    # calculate length by paragraph
    length_para = length_sentence.groupby(['paragraph']).sum().reset_index(drop=False)
    length_para.rename(columns={'length_of_sentence': 'length_of_para'}, inplace=True)
    # merged into 1 datafram
    cal_sentiment = pd.merge(length_sentence, length_para, on=["paragraph"])
    cal_sentiment['sentiment']=df_sentence['sentiment']
    # calculate weighted 
    cal_sentiment['sentiment_by_weighted']=cal_sentiment['length_of_sentence']/cal_sentiment['length_of_para']*cal_sentiment['sentiment']
    # merged sentiment columnes
    #cal_sentiment['sentiment']=df_sentence['sentiment']
    #cal_sentiment['sentiment_by_weighted']=cal_sentiment['sentiment']*cal_sentiment['weighted']
    return cal_sentiment

In [73]:
def sentiment_by_paragraph(cal_sentiment,paragraph_split_df):
    score_df = cal_sentiment[['paragraph','sentiment_by_weighted']]
    score_list = np.array(score_df.groupby(['paragraph']).sum().reset_index(drop=True).values)

    # add the score_list into paragraph_split_df['sentiment_score']
    paragraph_split_df = paragraph_split_df.assign(sentiment_score=score_list)
    return paragraph_split_df

In [77]:
def match_topic(paragraph_split_df,data_words ):
    # cleaning text into lematized token
    data_topic_match = paragraph_split_df[['text','sentiment_score']].copy()
    data_topic_match['para_lemma'] = data_words

    # list of columns name as key 
    col_name = topic_df.columns
    
    # create topic dictionary that contains words in each topic
    def create_topic_dictionary(topic_df):
        # create dictionary
        topic_dict = {}
        for col in col_name:
            topic_dict[col] = list(topic_df[col])
        return topic_dict
    topic_dictionary = create_topic_dictionary(topic_df)
    
    # count number words inside each topic
    def count_topic(para_lemma,topic_dict):
        count = 0
        for words in para_lemma:
            if words in list(topic_dict):
                count += 1
            else:
                continue 
        return count
    for col in col_name:
        data_topic_match[col] = data_topic_match['para_lemma'].apply(lambda r: count_topic(r,topic_dictionary[col]))
        
    # find the columns contains maximum values of each row
    final_topic = data_topic_match[col_name].idxmax(axis = 1)
    data_topic_match['final_topic'] = final_topic
    
    # sentiment score - per topic - per transcript
    df_sentiment_of_topic = data_topic_match[['sentiment_score','final_topic']]
    df_sentiment_of_topic = df_sentiment_of_topic.groupby(['final_topic']).mean().T
    #df_sentiment_of_topic = df_sentiment_of_topic.T
    return df_sentiment_of_topic

In [79]:
# list of columns name as key 
col_name = topic_df.columns

for start, stop in zip(start_index, end_index):
    ## Spliting paragraph, sentence
    # splitting with para
    paragraph_split_df = splitting_para(new_df,start, stop)
    # splitting by sentence
    sentence_split_df = splitting_sentence(paragraph_split_df)
    
    ## Calculate sentiment score
    # apply sentiment model
    df_sentence = apply_sentiment(sentence_split_df)
    # weighted average by paragraph
    cal_sentiment = calculate_weighted_average(sentence_split_df)
    # sentiment by paragraph 
    paragraph_split_df = sentiment_by_paragraph(cal_sentiment,paragraph_split_df)
    
    ## Match topic
    #df_sentiment_of_topic = match_topic(paragraph_split_df,data_words )
    
paragraph_split_df

Unnamed: 0,text,participants,sentiment_score
0,"Good morning, ladies and gentlemen and welcome...",Operator,-0.208861
1,"Good morning, ladies and gentlemen. Let me als...",Jean-Jacques Henchoz,0.365525
2,"Thank you Jean-Jacques, and good morning to ev...",Roland Helmut Vogel,0.262577
3,"Thank you very much Roland. On slide 17, the t...",Jean-Jacques Henchoz,0.396970
4,Ladies and gentlemen we will now begin our que...,Operator,0.560694
...,...,...,...
79,"Okay, thank you. And you have increased loss p...",Jonny Urwin,-0.508197
80,"Not that I'm aware of. We had, again I mention...",Sven Althoff,1.000000
81,Thank you.,Jonny Urwin,1.000000
82,And we have a follow-up question from James Sh...,Operator,0.670330
