In [1]:
import pdftotext
import pandas as pd
import numpy as np
import re
import os

### Define function for "Cleaning" and "participants list"

In [2]:
def cleaning_text(contents):
    ### Cleaning all the unwanted rows in the transcript
    df = pd.DataFrame(contents)

    # remove the unnessary string
    df[0] = df[0].str.replace('\n','')
    df[0] = df[0].str.replace('Bloomberg Transcript','')
    df[0] = df[0].str.replace('\x0c\n','')
    df[0] = df[0].str.replace('FINAL','')
    df[0] = df[0].str.replace('A - ','')
    df[0] = df[0].str.replace('Q - ','')

    # using re to remove the unnessary string
    def drop_unnessary(x):
        page = re.findall(r'Page \d+ of \d+', x) # 'page ... of ... '
        BIO = re.findall(r'{BIO', x) # '{BIO 18731996 <GO>}'
        Company_Name = re.findall(r'Company N ame:', x) # 'Company N ame: H annover Rueck SE'
        Company_Ticker = re.findall(r'Company Ticker:', x) # 'Company Ticker: H N R1 GR Equity'
        Date = re.findall(r'Date:', x) # Date: 2015-03-10
        if page == [] and BIO == [] and Company_Name == [] and Company_Ticker == [] and Date == []:
            return True
        else:
            return False

    true_false = df[0].apply(lambda x: drop_unnessary(x))
    df = df[true_false]

    # drop the final page declaration
    df = df[df[0] != 'This transcript may not be 100 percent accurate and may contain misspellings and other']
    df = df[df[0] != 'inaccuracies. This transcript is provided "as is", without express or implied warranties of']
    df = df[df[0] != 'any kind. Bloomberg retains all rights to this transcript and provides it solely for your']
    df = df[df[0] != 'personal, non-commercial use. Bloomberg, its suppliers and third-party agents shall']
    df = df[df[0] != 'have no liability for errors in this transcript or for lost profits, losses, or direct, indirect,']
    df = df[df[0] != 'incidental, consequential, special or punitive damages in connection with the']
    df = df[df[0] != 'furnishing, performance or use of such transcript. Neither the information nor any']
    df = df[df[0] != 'opinion expressed in this transcript constitutes a solicitation of the purchase or sale of']
    df = df[df[0] != 'securities or commodities. Any opinion expressed in the transcript does not necessarily']
    # df = df[df[0] != 'reflect the views of Bloomberg LP. ¬© COPYRIGHT 2022, BLOOMBERG LP. All rights']  
    df = df[df[0] != 'reserved. Any reproduction, redistribution or retransmission is expressly prohibited.']
    # ¬© could not be identified, would apply re
    
    def drop_Bloomberg_mark(x):
        Bloomberg_mark = re.findall(r'reflect the views of Bloomberg LP', x) # 'reflect the views of Bloomberg LP. ¬© COPYRIGHT 2022, BLOOMBERG LP. All rights'
        if Bloomberg_mark == []:
            return True
        else:
            return False

    true_false = df[0].apply(lambda x: drop_Bloomberg_mark(x))
    df = df[true_false]

    # drop the empthy row
    df = df[df[0] != '']
    df = df[df[0] != '']

    return df

def participants_list(df):
    # reset the index to make sure the index is continuous for better processing
    df = df.reset_index(drop=True)

    #  'Company Participants' index
    # df.loc[df[0] == 'Company Participants']
    Participant_start_index = df.index[df.iloc[:,0] == 'Company Participants'].tolist()
    #  'Other Participants' index
    # df.loc[df[0] == 'Other Participants']
    Participant_middle_index = df.index[df.iloc[:,0] == 'Other Participants'].tolist()
    #  'MANAGEMENT DISCUSSION SECTION' index, is the beginning of the management discussion, would stop before this row
    # df.loc[df[0] == 'MANAGEMENT DISCUSSION SECTION']
    Participant_end_index = df.index[df.iloc[:,0] == 'MANAGEMENT DISCUSSION SECTION' ].tolist()
    # try to find the 'MANAGEMENT DISCUSSION SECTION' or 'Presentation' index
    if Participant_end_index == []:
        Participant_end_index = df.index[df.iloc[:,0] == 'Presentation'].tolist()
        Participant_end_index = [Participant_end_index[-1]]
    # some transcript dont have 'Other Participants'
    if Participant_middle_index == []:
        Participant_middle_index = Participant_end_index

    print(Participant_start_index, Participant_middle_index, Participant_end_index)

    # make the list of company_paticipants and other_participants
    company_paticipants = df.loc[Participant_start_index[0]+1:Participant_middle_index[0]-1]
    company_paticipants.drop(company_paticipants.index[company_paticipants.iloc[:,0] == ''].tolist(), inplace=True)
    company_paticipants = company_paticipants.values.tolist()

    other_paticipants = df.loc[Participant_middle_index[0]+1:Participant_end_index[0]-1]
    other_paticipants.drop(other_paticipants.index[other_paticipants.iloc[:,0] == ''].tolist(), inplace=True)
    other_paticipants = other_paticipants.values.tolist()

    #print("==========================")
    #print("the company paticipants is: ", company_paticipants)
    #print("==========================")
    #print("the other paticipants is: ", other_paticipants)

    #%%
    # after extract the paticipants, we can drop those information to make the transcript more clear
    df = df.reset_index(drop=True)
    df = df.drop(range(df.index[df.iloc[:,0] == 'Company Participants'].tolist()[0],df.index[df.iloc[:,0].isin(['MANAGEMENT DISCUSSION SECTION','Presentation'])].tolist()[0]+1))

    # drop the first row of the df
    df = df.reset_index(drop=True)
    df = df.iloc[1: , :]


    # reset the index again to make sure the index is continuous for better processing
    df = df.reset_index(drop=True)
    # # save to csv
    # df.to_csv('/Users/timliu/Desktop/output/df.csv')
    return df, company_paticipants, other_paticipants

### df_clean_na

In [3]:
path = "/Users/hienanh/Documents/GitHub/final_01/Transcript_clean"
save_path = "/Users/hienanh/Documents/GitHub/final_01/Output"

# all files path
company_paths = []
sectors = os.listdir(path) 
for sector in sectors:
    # path to each sector files
    if sector != '.DS_Store':
        sector_path = path+"/"+sector
    
    # path to each company files
    companies = os.listdir(sector_path) 
    for company in companies:
        if company != '.DS_Store':
            company_path = sector_path+"/"+company
            company_paths.append(company_path)

# create dataframe
df = pd.DataFrame()
df_clean_na = pd.DataFrame(np.zeros((2500,1)), columns=['index']) # create a dataframe with 2500 rows
all_participants = []            

for single_path in company_paths:
    files = os.listdir(single_path)
    for file in files:
        print(file)
        if file.endswith(".pdf"):
            # print(file)
            # Load PDF
            with open(single_path+"/"+file, "rb") as f:
                pdf = pdftotext.PDF(f)
            # Save all text to a txt file.
            with open(save_path+"/"+file.replace(".pdf", ".txt"), "w") as f:
                f.write("\n\n".join(pdf))
            # open the text file
            with open(save_path+"/"+file.replace(".pdf", ".txt")) as f:
                contents = f.readlines()
                df_clean = cleaning_text(contents)
                # extract all the participants
                df_pure_text,company_paticipants,other_paticipants = participants_list(df_clean)
                all_participants.append(company_paticipants)
                all_participants.append(other_paticipants)
                # using the file name to set as the dataframe's column name
                # df[f"{files.index(file)}"] = df_clean
                df[f"{files[files.index(file)]}"] = df_pure_text
                df_clean_na[f"{files[files.index(file)]}"] = df[f"{files[files.index(file)]}"].dropna(inplace=False).reset_index(drop=True)

# drop the first column of the df
df_clean_na = df_clean_na.iloc[:,1:]
df_clean_na.head(5)

20190312_Quilter_PLC-_Earnings_Call_2019-3-12_RT000000002903022737.pdf
[1] [5] [10]
.DS_Store
20210811_Quilter_PLC-_Earnings_Call_2021-8-11_RT000000002962310912.pdf


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


[1] [8] [15]
20200311_Quilter_PLC-_Earnings_Call_2020-3-11_DN000000002822346246.pdf
[1] [5] [11]
20200811_Quilter_PLC-_Earnings_Call_2020-8-11_DN000000002883967732.pdf
[1] [7] [12]
20190805_Quilter_PLC-_Earnings_Call_2019-8-5_RT000000002897819838.pdf
[1] [8] [14]
20180808_Quilter_PLC-_Earnings_Call_2018-8-8_FS000000002459369907.pdf
[1] [5] [12]
20210401_Quilter_PLC-_M-A_Call_2021-4-1_RT000000002954187686.pdf
[1] [4] [10]
20171219_Quilter_PLC-_M-A_Call_2017-12-19_SD000000002919419964.pdf
[3] [7] [11]
20210310_Quilter_PLC-_Earnings_Call_2021-3-10_RT000000002952420637.pdf
[1] [7] [13]
20140904_Sanlam_Ltd-_Earnings_Call_2014-9-4_DN000000002170588806.pdf
[2] [5] [8]
20140604_Sanlam_Ltd-_Sales_Results_Call_2014-6-4_DN000000002147084411.pdf
[1] [4] [9]
.DS_Store
20211208_Sanlam_Ltd-_Sales_Results_Call_2021-12-8_RT000000002970564782.pdf
[1] [6] [16]
20181031_Sanlam_Ltd-_Guidance_Call_2018-10-31_SD000000002901840988.pdf
[2] [5] [8]
20170907_Sanlam_Ltd-_Earnings_Call_2017-9-7_DN00000000237135355

  df_clean_na[f"{files[files.index(file)]}"] = df[f"{files[files.index(file)]}"].dropna(inplace=False).reset_index(drop=True)
  df[f"{files[files.index(file)]}"] = df_pure_text


[1] [4] [15]
20191120_Aviva_PLC-_Guidance_Call_2019-11-20_SD000000002903085152.pdf
[1] [11] [25]
20151029_Aviva_PLC-_Sales_Results_Call_2015-10-29_FS000000002237412488.pdf
[2] [10] [24]
20120308_Aviva_PLC-_Earnings_Call_2012-3-8_SD000000002751473834.pdf
[1] [6] [20]
20160804_Aviva_PLC-_Earnings_Call_2016-8-4_FS000000002423330467.pdf
[1] [8] [17]
20200305_Aviva_PLC-_Earnings_Call_2020-3-5_DN000000002803389869.pdf
[1] [9] [21]
20150806_Aviva_PLC-_Earnings_Call_2015-8-6_FS000000002423330383.pdf
[1] [12] [25]
20130307_Aviva_PLC-_Earnings_Call_2013-3-7_SD000000002751476780.pdf
[1] [8] [18]
20140807_Aviva_PLC-_Earnings_Call_2014-8-7_FS000000002423330353.pdf
[1] [6] [17]
20140709_Aviva_PLC-_Shareholder_Mtg_Call_2014-7-9_FS000000002484152926.pdf
[1] [18] [26]
20171130_Aviva_PLC-_Guidance_Call_2017-11-30_SD000000002902599292.pdf
[1] [9] [20]
20190606_Aviva_PLC-_M-A_Call_2019-6-14_DN000000002656970308.pdf
[1] [5] [17]
20170803_Aviva_PLC-_Earnings_Call_2017-8-3_SD000000002864584312.pdf
[1] [8] [1

[1] [10] [19]
20140508_Prudential_PLC-_Sales_Results_Call_2014-5-8_FS000000002141609559.pdf
[2] [8] [18]
20130812_Prudential_PLC-_Earnings_Call_2013-8-12_SD000000002773157340.pdf
[1] [9] [18]
20160119_Prudential_PLC-_Shareholder_Mtg_Call_2016-1-19_FS000000002376328298.pdf
[1] [5] [15]
20140508_Prudential_PLC-_Guidance_Call_2014-5-8_SD000000002832306952.pdf
[2] [8] [18]
20200910_Prudential_PLC-_Conf-Presentation_Call_2020-9-10_SD000000002900812382.pdf
[1] [4] [4]
20130507_Prudential_PLC-_Guidance_Call_2013-5-7_SD000000002848868457.pdf
[2] [6] [11]
20171116_Prudential_PLC-_Shareholder_Mtg_Call_2017-11-16_SD000000002905251912.pdf
[1] [19] [31]
20210128_Prudential_PLC-_M-A_Call_2021-1-28_DN000000002948732590.pdf
[1] [7] [14]
20140312_Prudential_PLC-_Earnings_Call_2014-3-12_SD000000002781659155.pdf
[1] [8] [16]
20190706_Prudential_PLC-_Shareholder_Mtg_Call_2019-7-6_DN000000002665618566.pdf
[1] [11] [25]
20181114_Prudential_PLC-_Shareholder_Mtg_Call_2018-11-14_SD000000002902760427.pdf
[1] [1

[1] [6] [15]
20130228_St_James-s_Place_PLC-_Earnings_Call_2013-2-28_SD000000002761572094.pdf
[1] [6] [15]
20121119_Sompo_Holdings_Inc-_Earnings_Call_2012-11-20_RT000000002067692926.pdf
[1] [4] [12]
20191128_Sompo_Holdings_Inc-_Conf-Presentation_Call_2019-11-28_SD000000002903087817.pdf
[2] [8] [15]
20180524_Sompo_Holdings_Inc-_M-A_Call_2018-5-24_FS000000002476688929.pdf
[1] [3] [10]
20190520_Sompo_Holdings_Inc-_Earnings_Call_2019-5-20_SD000000002902929962.pdf
[1] [4] [11]
20120518_Sompo_Holdings_Inc-_Earnings_Call_2012-5-21_RT000000002050742732.pdf
[1] [4] [14]
20141119_Sompo_Holdings_Inc-_Earnings_Call_2014-11-24_FS000000002417374854.pdf
[1] [4] [11]
20151118_Sompo_Holdings_Inc-_Earnings_Call_2015-11-20_DN000000002241138647.pdf
[1] [3] [12]
20131119_Sompo_Holdings_Inc-_Earnings_Call_2013-11-25_DN000000002116348219.pdf
[1] [3] [10]
.DS_Store
20181119_Sompo_Holdings_Inc-_Earnings_Call_2018-11-19_SD000000002905610039.pdf
[1] [4] [12]
20151126_Sompo_Holdings_Inc-_Conf-Presentation_Call_201

[1] [4] [9]
20140213_TOKIO_MARINE_HD-_Earnings_Call_2014-2-18_DN000000002126578586.pdf
[1] [4] [12]
20111130_TOKIO_MARINE_HD-_Earnings_Call_2011-12-29_RT000000002039512340.pdf
[1] [7] [12]
20170519_TOKIO_MARINE_HD-_Earnings_Call_2017-5-19_SD000000002893098131.pdf
[1] [4] [11]
20160212_TOKIO_MARINE_HD-_Earnings_Call_2016-2-12_FS000000002253111492.pdf
[1] [4] [9]
.DS_Store
20160809_TOKIO_MARINE_HD-_Earnings_Call_2016-8-9_FS000000002298049534.pdf
[1] [3] [11]
20170808_TOKIO_MARINE_HD-_Earnings_Call_2017-8-8_FS000000002365225099.pdf
[1] [4] [13]
20151118_TOKIO_MARINE_HD-_Earnings_Call_2015-11-18_FS000000002240862350.pdf
[1] [3] [11]
20210527_TOKIO_MARINE_HD-_Shareholder_Mtg_Call_2021-5-27_SD000000002958087539.pdf
[1] [9] [18]
20120531_TOKIO_MARINE_HD-_Conf-Presentation_Call_2012-5-31_SD000000002867310091.pdf
[2] [9] [17]
20130213_TOKIO_MARINE_HD-_Earnings_Call_2013-2-13_DN000000002077439881.pdf
[1] [3] [8]
20150520_TOKIO_MARINE_HD-_Earnings_Call_2015-5-20_DN000000002212037068.pdf
[1] [3] [

[1] [4] [14]
20190224_QBE_INSURANCE-_Earnings_Call_2019-2-25_DN000000002590705808.pdf
[1] [4] [16]
.DS_Store
20170817_QBE_INSURANCE-_Earnings_Call_2017-8-16_FS000000002369265064.pdf
[1] [5] [16]
20120227_QBE_INSURANCE-_Earnings_Call_2012-2-27_SD000000002785543283.pdf
[1] [10] [19]
20191217_QBE_INSURANCE-_Guidance_Call_2019-12-17_SD000000002901873748.pdf
[1] [4] [10]
20180122_QBE_INSURANCE-_Guidance_Call_2018-1-22_SD000000002905262152.pdf
[1] [3] [15]
20200813_QBE_INSURANCE-_Earnings_Call_2020-8-12_DN000000002885183171.pdf
[1] [4] [12]
20150224_QBE_INSURANCE-_Earnings_Call_2015-2-23_FS000000002195934790.pdf
[1] [5] [13]
20190815_QBE_INSURANCE-_Earnings_Call_2019-8-15_SD000000002887415514.pdf
[1] [4] [11]
20110227_QBE_INSURANCE-_Earnings_Call_2011-2-27_SD000000002767279336.pdf
[1] [4] [15]
20181210_QBE_INSURANCE-_Guidance_Call_2018-12-10_SD000000002901842698.pdf
[3] [6] [17]
20210219_QBE_INSURANCE-_Earnings_Call_2021-2-18_DN000000002950935440.pdf
[1] [5] [15]
20180225_QBE_INSURANCE-_Earn

[1] [7] [20]
20150923_Swiss_Re_AG-_M-A_Call_2015-9-24_FS000000002232027288.pdf
[2] [5] [13]
20190314_Swiss_Re_AG-_M-A_Call_2019-3-14_FS000000002602069859.pdf
[1] [6] [15]
20210730_Swiss_Re_AG-_Earnings_Call_2021-7-30_DN000000002963086835.pdf
[1] [6] [17]
20151029_Swiss_Re_AG-_Earnings_Call_2015-10-29_SD000000002846857713.pdf
[1] [3] [3]
20190314_Swiss_Re_AG-_Guidance_Call_2019-3-14_SD000000002905443089.pdf
[3] [8] [16]
20171026_SCOR_SE-_Earnings_Call_2017-10-26_FS000000002378310995.pdf
[1] [8] [20]
20200227_SCOR_SE-_Earnings_Call_2020-2-27_RT000000002799748126.pdf
[1] [10] [17]
20170427_SCOR_SE-_Earnings_Call_2017-4-27_FS000000002465467570.pdf
[1] [8] [18]
20150506_SCOR_SE-_Earnings_Call_2015-5-6_FS000000002465467276.pdf
[1] [9] [18]
20211027_SCOR_SE-_Earnings_Call_2021-10-27_DN000000002966600307.pdf
[1] [10] [17]
20160427_SCOR_SE-_Earnings_Call_2016-4-27_FS000000002465467432.pdf
[1] [9] [20]
20141106_SCOR_SE-_Earnings_Call_2014-11-6_FS000000002464032501.pdf
[1] [8] [19]
20210728_SCOR_

[1] [9] [21]
20160510_Munich_Re-_Earnings_Call_2016-5-10_FS000000002275761730.pdf
[1] [4] [17]
20200806_Munich_Re-_Earnings_Call_2020-8-6_RT000000002889985374.pdf
[1] [5] [16]
20190508_Munich_Re-_Earnings_Call_2019-5-8_RT000000002897946997.pdf
[1] [4] [12]
20190206_Munich_Re-_Sales_Results_Call_2019-2-6_FS000000002582683118.pdf
[1] [5] [17]
20150507_Munich_Re-_Earnings_Call_2015-5-7_FS000000002450434991.pdf
[1] [6] [17]
20181107_Munich_Re-_Earnings_Call_2018-11-7_FS000000002510108193.pdf
[1] [5] [19]
20180315_Munich_Re-_Earnings_Call_2018-3-15_DN000000002409987736.pdf
[1] [7] [10]
20190807_Munich_Re-_Earnings_Call_2019-8-7_DN000000002686215020.pdf
[1] [6] [19]
20201105_Munich_Re-_Earnings_Call_2020-11-5_DN000000002928821379.pdf
[1] [4] [13]
20200228_Munich_Re-_Earnings_Call_2020-2-28_RT000000002825328883.pdf
[1] [7] [20]
20180315_Munich_Re-_Earnings_Call_2018-3-15_FS000000002411211650.pdf
[1] [8] [23]
20180808_Munich_Re-_Earnings_Call_2018-8-8_FS000000002455566648.pdf
[1] [5] [19]
2016

[1] [7] [16]
20131106_LANCASHIRE_HOLDI-_Earnings_Call_2013-11-6_SD000000002765815509.pdf
[1] [6] [15]
20211104_LANCASHIRE_HOLDI-_Sales_Results_Call_2021-11-4_DN000000002967305231.pdf
[1] [6] [16]
20180503_LANCASHIRE_HOLDI-_Earnings_Call_2018-5-3_FS000000002565462344.pdf
[1] [9] [16]
20201105_LANCASHIRE_HOLDI-_Sales_Results_Call_2020-11-5_RT000000002928846889.pdf
[1] [6] [16]
20161103_LANCASHIRE_HOLDI-_Earnings_Call_2016-11-3_FS000000002320502446.pdf
[1] [7] [14]
20130725_LANCASHIRE_HOLDI-_Earnings_Call_2013-7-25_SD000000002764811108.pdf
[1] [5] [15]
20110727_LANCASHIRE_HOLDI-_Earnings_Call_2011-7-27_SD000000002770182400.pdf
[1] [7] [15]
20200729_LANCASHIRE_HOLDI-_Earnings_Call_2020-7-29_SD000000002876482017.pdf
[1] [6] [15]
20110316_LANCASHIRE_HOLDI-_Shareholder_Mtg_Call_2011-3-16_SD000000002829511024.pdf
[1] [9] [17]
20120725_LANCASHIRE_HOLDI-_Earnings_Call_2012-7-25_SD000000002762627675.pdf
[1] [5] [13]
20150212_LANCASHIRE_HOLDI-_Earnings_Call_2015-2-12_FS000000002320503712.pdf
[1] [

[1] [5] [20]
20170802_GENERALI_ASSIC-_Earnings_Call_2017-8-2_FS000000002363579728.pdf
[1] [5] [21]
20150730_GENERALI_ASSIC-_Earnings_Call_2015-7-30_FS000000002507680321.pdf
[1] [5] [18]
20200313_GENERALI_ASSIC-_Earnings_Call_2020-3-13_DN000000002808659181.pdf
[1] [10] [21]
20180315_GENERALI_ASSIC-_Earnings_Call_2018-3-15_FS000000002577074190.pdf
[1] [4] [4]
20170511_GENERALI_ASSIC-_Earnings_Call_2017-5-11_FS000000002415216805.pdf
[1] [6] [15]
20160803_AXA_SA-_Earnings_Call_2016-8-3_FS000000002295899826.pdf
[2] [7] [16]
20170426_AXA_SA-_Shareholder_Mtg_Call_2017-4-26_SD000000002905864394.pdf
[1] [9] [14]
20220224_AXA_SA-_Earnings_Call_2022-2-24_RT000000002974608258.pdf
[1] [8] [17]
20140506_AXA_SA-_Guidance_Call_2014-5-6_SD000000002901752433.pdf
[2] [5] [13]
20190221_AXA_SA-_Earnings_Call_2019-2-21_FS000000002589654793.pdf
[2] [9] [20]
20140506_AXA_SA-_Sales_Results_Call_2014-5-6_FS000000002152991889.pdf
[1] [4] [12]
20190424_AXA_SA-_Shareholder_Mtg_Call_2019-4-24_SD000000002905882094.p

[1] [8] [12]
20160615_Enstar_Group_Ltd-_Shareholder_Mtg_Call_2016-6-15_SD000000002886636178.pdf
[1] [9] [13]
20150217_FAIRFAX_FINL_HLD-_M-A_Call_2015-2-17_FS000000002432346588.pdf
[1] [5] [11]
20150501_FAIRFAX_FINL_HLD-_Earnings_Call_2015-5-1_FS000000002208030544.pdf
[1] [5] [9]
20150501_FAIRFAX_FINL_HLD-_Earnings_Call_2015-5-1_FS000000002208030550.pdf
[13] [16] [20]
20180504_FAIRFAX_FINL_HLD-_Earnings_Call_2018-5-4_FS000000002432347506.pdf
[1] [5] [11]
20160219_FAIRFAX_FINL_HLD-_Earnings_Call_2016-2-19_FS000000002254512428.pdf
[1] [5] [15]
20170428_FAIRFAX_FINL_HLD-_Earnings_Call_2017-4-28_FS000000002346862392.pdf
[1] [5] [11]
20130503_FAIRFAX_FINL_HLD-_Earnings_Call_2013-5-3_SD000000002764201444.pdf
[1] [5] [11]
20140801_FAIRFAX_FINL_HLD-_Earnings_Call_2014-8-1_FS000000002170601454.pdf
[13] [16] [25]
20211105_FAIRFAX_FINL_HLD-_Earnings_Call_2021-11-5_RT000000002967382918.pdf
[1] [6] [9]
20131101_FAIRFAX_FINL_HLD-_Earnings_Call_2013-11-1_SD000000002765841405.pdf
[1] [5] [9]
20150731_F

[1] [4] [14]
20140226_DIRECT_LINE_INSU-_Earnings_Call_2014-2-26_SD000000002797425799.pdf
[2] [5] [16]
20150506_DIRECT_LINE_INSU-_Sales_Results_Call_2015-5-6_FS000000002209596861.pdf
[2] [5] [13]
20210308_DIRECT_LINE_INSU-_Earnings_Call_2021-3-8_RT000000002953441078.pdf
[2] [5] [17]
20140801_DIRECT_LINE_INSU-_Earnings_Call_2014-8-1_FS000000002435732584.pdf
[2] [5] [17]
20150303_DIRECT_LINE_INSU-_Earnings_Call_2015-3-3_FS000000002421837526.pdf
[1] [4] [15]
20140502_DIRECT_LINE_INSU-_Sales_Results_Call_2014-5-6_FS000000002140834525.pdf
[2] [5] [13]
20180502_DIRECT_LINE_INSU-_Sales_Results_Call_2018-5-2_FS000000002424994470.pdf
[2] [5] [14]
20200303_DIRECT_LINE_INSU-_Earnings_Call_2020-3-3_RT000000002802387801.pdf
[1] [6] [19]
20170503_DIRECT_LINE_INSU-_Sales_Results_Call_2017-5-3_FS000000002352648498.pdf
[1] [4] [15]
20170801_DIRECT_LINE_INSU-_Earnings_Call_2017-8-1_FS000000002620927942.pdf
[2] [7] [15]
20210803_DIRECT_LINE_INSU-_Earnings_Call_2021-8-3_DN000000002961644574.pdf
[2] [5] [12

[2] [7] [13]
20190306_Helvetia_Holding_AG-_Earnings_Call_2019-3-6_RT000000002903072582.pdf
[2] [6] [12]
20160905_Helvetia_Holding_AG-_Earnings_Call_2016-9-5_FS000000002406735251.pdf
[2] [6] [14]
20180904_Helvetia_Holding_AG-_Earnings_Call_2018-9-4_FS000000002463951543.pdf
[2] [6] [10]
20220324_Helvetia_Holding_AG-_Earnings_Call_2022-3-24_RT000000002977049730.pdf
[1] [4] [11]
20160314_Helvetia_Holding_AG-_Earnings_Call_2016-3-14_FS000000002406735209.pdf
[2] [6] [12]
20170313_Helvetia_Holding_AG-_Earnings_Call_2017-3-13_FS000000002406735269.pdf
[1] [5] [14]
20120312_Helvetia_Holding_AG-_Earnings_Call_2012-3-12_SD000000002773341235.pdf
[1] [7] [15]
20170830_Baloise_Holding_AG-_Earnings_Call_2017-8-30_FS000000002370438556.pdf
[2] [7] [15]
.DS_Store
20180828_Baloise_Holding_AG-_Earnings_Call_2018-8-28_FS000000002462703395.pdf
[2] [6] [14]
20170323_Baloise_Holding_AG-_Earnings_Call_2017-3-23_FS000000002339844490.pdf
[2] [7] [15]
20200827_Baloise_Holding_AG-_Earnings_Call_2020-8-27_RT00000000

[1] [5] [15]
20181025_GJENSIDIGE_FORSI-_Earnings_Call_2018-10-25_FS000000002478230517.pdf
[2] [6] [16]
20130214_GJENSIDIGE_FORSI-_Earnings_Call_2013-2-14_SD000000002774161326.pdf
[2] [9] [15]
20190712_GJENSIDIGE_FORSI-_Earnings_Call_2019-7-12_RT000000002897842392.pdf
[1] [5] [11]
20160203_GJENSIDIGE_FORSI-_Earnings_Call_2016-2-8_DN000000002251680645.pdf
[2] [6] [14]
.DS_Store
20220427_GJENSIDIGE_FORSI-_Earnings_Call_2022-4-27_RT000000002979809129.pdf
[2] [6] [15]
20211124_GJENSIDIGE_FORSI-_Shareholder_Mtg_Call_2021-11-24_SD000000002968858983.pdf
[1] [10] [10]
20190425_GJENSIDIGE_FORSI-_Earnings_Call_2019-4-25_DN000000002625936944.pdf
[2] [6] [13]
20170209_GJENSIDIGE_FORSI-_Earnings_Call_2017-2-9_FS000000002328088208.pdf
[1] [5] [16]
20121024_GJENSIDIGE_FORSI-_Earnings_Call_2012-10-24_SD000000002763031043.pdf
[1] [7] [12]
20191023_GJENSIDIGE_FORSI-_Earnings_Call_2019-10-23_DN000000002730117838.pdf
[2] [6] [15]
20150506_GJENSIDIGE_FORSI-_Earnings_Call_2015-5-6_FS000000002213534596.pdf
[2

[1] [5] [11]
20170713_Storebrand_ASA-_Earnings_Call_2017-7-13_FS000000002360099560.pdf
[2] [6] [11]
20161026_Storebrand_ASA-_Earnings_Call_2016-10-26_DN000000002309770514.pdf
[2] [6] [12]
20130213_Storebrand_ASA-_Earnings_Call_2013-2-13_SD000000002756817622.pdf
[1] [5] [11]
20120307_Storebrand_ASA-_Conf-Presentation_Call_2012-3-7_SD000000002884377964.pdf
[2] [6] [9]
20190213_Storebrand_ASA-_Earnings_Call_2019-2-13_DN000000002584023243.pdf
[2] [7] [14]
20171025_Storebrand_ASA-_Earnings_Call_2017-10-25_FS000000002378676087.pdf
[2] [6] [12]
20201210_Storebrand_ASA-_Shareholder_Mtg_Call_2020-12-10_RT000000002945678378.pdf
[2] [13] [21]
20160427_Storebrand_ASA-_Earnings_Call_2016-4-27_FS000000002271042864.pdf
[2] [6] [12]
20151028_Storebrand_ASA-_Earnings_Call_2015-10-28_DN000000002236289356.pdf
[2] [8] [13]
20160714_Storebrand_ASA-_Earnings_Call_2016-7-14_DN000000002293349884.pdf
[1] [6] [12]
20190508_Storebrand_ASA-_Earnings_Call_2019-5-8_DN000000002653572889.pdf
[2] [6] [12]
20130424_Sto

[2] [7] [18]
20151116_Topdanmark_AS-_Earnings_Call_2015-11-16_FS000000002240497302.pdf
[2] [5] [11]
20170523_Topdanmark_AS-_Earnings_Call_2017-5-23_DN000000002351966504.pdf
[2] [5] [12]
20190718_Topdanmark_AS-_Earnings_Call_2019-7-18_DN000000002671490054.pdf
[2] [5] [13]
.DS_Store
20210716_Topdanmark_AS-_Earnings_Call_2021-7-16_DN000000002960591572.pdf
[1] [5] [12]
20171026_Topdanmark_AS-_Earnings_Call_2017-10-26_FS000000002379641872.pdf
[1] [4] [12]
20190124_Topdanmark_AS-_Earnings_Call_2019-1-24_FS000000002576327171.pdf
[2] [6] [12]
20211021_Topdanmark_AS-_Earnings_Call_2021-10-21_RT000000002966240672.pdf
[1] [4] [16]
20200123_Topdanmark_AS-_Earnings_Call_2020-1-23_RT000000002896792329.pdf
[2] [5] [12]
20200424_Topdanmark_AS-_Earnings_Call_2020-4-24_DN000000002828024078.pdf
[1] [5] [16]
20180423_Topdanmark_AS-_Earnings_Call_2018-4-23_FS000000002563315832.pdf
[1] [4] [14]
20200717_Topdanmark_AS-_Earnings_Call_2020-7-17_RT000000002871125053.pdf
[1] [3] [3]
20160211_Topdanmark_AS-_Earni

[1] [3] [5]
20200213_AMERICAN_INTERNA-_Earnings_Call_2020-2-13_DN000000002791462478.pdf
[1] [7] [16]
20190904_AMERICAN_INTERNA-_Conf-Presentation_Call_2019-9-4_DN000000002707398964.pdf
[1] [5] [7]
20180607_AMERICAN_INTERNA-_Shareholder_Mtg_Call_2018-6-7_FS000000002436258918.pdf
[1] [5] [17]
20190214_AMERICAN_INTERNA-_Earnings_Call_2019-2-14_FS000000002587918212.pdf
[13] [26] [43]
20171103_AMERICAN_INTERNA-_Earnings_Call_2017-11-3_FS000000002380381118.pdf
[13] [24] [41]
20200213_AMERICAN_INTERNA-_Conf-Presentation_Call_2020-2-13_RT000000002792406244.pdf
[2] [7] [10]
20200401_AMERICAN_INTERNA-_Conf-Presentation_Call_2020-4-1_DN000000002820532213.pdf
[0] [5] [5]
20190528_AMERICAN_INTERNA-_Conf-Presentation_Call_2019-5-31_RT000000002649547563.pdf
[1] [3] [6]
20210913_AMERICAN_INTERNA-_Conf-Presentation_Call_2021-9-13_SD000000002964327152.pdf
[1] [4] [4]
20210520_AMERICAN_INTERNA-_Conf-Presentation_Call_2021-5-20_RT000000002957634799.pdf
[1] [4] [6]
20200514_AMERICAN_INTERNA-_Conf-Presentat

[1] [6] [14]
20130808_Markel_Corp-_Earnings_Call_2013-8-8_SD000000002794232651.pdf
[1] [6] [12]
20130501_Markel_Corp-_Earnings_Call_2013-5-1_SD000000002713499472.pdf
[1] [6] [13]
20110506_Markel_Corp-_Earnings_Call_2011-5-6_SD000000002792411587.pdf
[1] [6] [11]
20110809_Markel_Corp-_Earnings_Call_2011-8-9_SD000000002792416822.pdf
[1] [6] [13]
20130205_Markel_Corp-_Earnings_Call_2013-2-5_SD000000002713496742.pdf
[1] [6] [14]
20150212_Markel_Corp-_Earnings_Call_2015-2-12_FS000000002193924088.pdf
[1] [6] [12]
20201028_Markel_Corp-_Earnings_Call_2020-10-28_DN000000002923866902.pdf
[1] [5] [12]
20110203_Markel_Corp-_Earnings_Call_2011-2-3_SD000000002713474764.pdf
[1] [6] [17]
20200205_Markel_Corp-_Earnings_Call_2020-2-5_RT000000002787564554.pdf
[1] [5] [11]
20160803_Markel_Corp-_Earnings_Call_2016-8-3_FS000000002295888288.pdf
[1] [6] [10]
20141106_Markel_Corp-_Earnings_Call_2014-11-6_FS000000002183184678.pdf
[1] [6] [10]
20190206_Markel_Corp-_Earnings_Call_2019-2-6_FS000000002580718668.pdf


[1] [3] [5]
20151020_TRAVELERS_COS_IN-_Earnings_Call_2015-10-20_FS000000002436721135.pdf
[1] [9] [19]
20170420_TRAVELERS_COS_IN-_Earnings_Call_2017-4-20_FS000000002343383103.pdf
[13] [26] [51]
20180123_TRAVELERS_COS_IN-_Earnings_Call_2018-1-23_FS000000002394433463.pdf
[13] [30] [55]
20160121_TRAVELERS_COS_IN-_Earnings_Call_2016-1-21_FS000000002248807791.pdf
[1] [9] [18]
20130911_TRAVELERS_COS_IN-_Conf-Presentation_Call_2013-9-11_SD000000002791195913.pdf
[1] [3] [6]
20190131_AXIS_CAPITAL-_Earnings_Call_2019-1-31_FS000000002579857335.pdf
[1] [5] [12]
20170202_AXIS_CAPITAL-_Earnings_Call_2017-2-2_FS000000002326307963.pdf
[1] [5] [12]
20180905_AXIS_CAPITAL-_Conf-Presentation_Call_2018-9-5_FS000000002466091307.pdf
[1] [4] [4]
20181025_AXIS_CAPITAL-_Earnings_Call_2018-10-25_FS000000002479087283.pdf
[1] [5] [14]
.DS_Store
20140730_AXIS_CAPITAL-_Earnings_Call_2014-7-30_FS000000002163645404.pdf
[1] [5] [14]
20170706_AXIS_CAPITAL-_M-A_Call_2017-7-6_FS000000002359539447.pdf
[2] [7] [12]
20150729_

[1] [6] [18]
.DS_Store
20210501_BERKSHIRE_HATH_A-_Shareholder_Mtg_Call_2021-5-1_DN000000002956245675.pdf
[1] [21] [21]
20160510_BERKSHIRE_HATH_A-_Conf-Presentation_Call_2016-5-11_DN000000002276541690.pdf
[1] [4] [4]
20200401_BERKSHIRE_HATH_A-_Conf-Presentation_Call_2020-4-1_DN000000002820646363.pdf
[0] [5] [5]
20171103_ARGO_GROUP_INTER-_Earnings_Call_2017-11-3_FS000000002448163152.pdf
[1] [6] [10]
20200804_ARGO_GROUP_INTER-_Earnings_Call_2020-8-4_DN000000002879610471.pdf
[1] [5] [8]
20160803_ARGO_GROUP_INTER-_Earnings_Call_2016-8-3_FS000000002448161772.pdf
[1] [5] [10]
20210312_ARGO_GROUP_INTER-_Shareholder_Mtg_Call_2021-3-12_RT000000002952706626.pdf
[1] [6] [10]
20180807_ARGO_GROUP_INTER-_Earnings_Call_2018-8-7_FS000000002453364806.pdf
[1] [7] [12]
20170214_ARGO_GROUP_INTER-_Earnings_Call_2017-2-14_FS000000002448162042.pdf
[1] [5] [10]
20210504_ARGO_GROUP_INTER-_Earnings_Call_2021-5-4_DN000000002956270895.pdf
[1] [5] [10]
.DS_Store
20161102_ARGO_GROUP_INTER-_Earnings_Call_2016-11-2_FS

[1] [5] [13]
20130207_RENAISSANCERE-_Earnings_Call_2013-2-7_SD000000002763382581.pdf
[1] [6] [15]
20180725_RENAISSANCERE-_Earnings_Call_2018-7-25_FS000000002447156303.pdf
[1] [5] [13]
20160203_RENAISSANCERE-_Earnings_Call_2016-2-3_FS000000002252131839.pdf
[1] [5] [16]
20130502_RENAISSANCERE-_Earnings_Call_2013-5-2_SD000000002764175854.pdf
[1] [6] [16]
20170726_RENAISSANCERE-_Earnings_Call_2017-7-26_FS000000002362516046.pdf
[1] [5] [12]
20190724_RENAISSANCERE-_Earnings_Call_2019-7-24_DN000000002674816592.pdf
[1] [5] [15]
.DS_Store
20211026_RENAISSANCERE-_Earnings_Call_2021-10-26_DN000000002966489222.pdf
[1] [5] [12]
20130731_RENAISSANCERE-_Earnings_Call_2013-7-31_SD000000002764725044.pdf
[1] [5] [14]
20130214_RENAISSANCERE-_Conf-Presentation_Call_2013-2-14_SD000000002790343022.pdf
[1] [3] [5]
20140205_RENAISSANCERE-_Earnings_Call_2014-2-5_SD000000002765850369.pdf
[1] [5] [16]
20200729_RENAISSANCERE-_Earnings_Call_2020-7-29_DN000000002876429382.pdf
[1] [5] [12]
20120910_RENAISSANCERE-_Co

Unnamed: 0,20190312_Quilter_PLC-_Earnings_Call_2019-3-12_RT000000002903022737.pdf,20210811_Quilter_PLC-_Earnings_Call_2021-8-11_RT000000002962310912.pdf,20200311_Quilter_PLC-_Earnings_Call_2020-3-11_DN000000002822346246.pdf,20200811_Quilter_PLC-_Earnings_Call_2020-8-11_DN000000002883967732.pdf,20190805_Quilter_PLC-_Earnings_Call_2019-8-5_RT000000002897819838.pdf,20180808_Quilter_PLC-_Earnings_Call_2018-8-8_FS000000002459369907.pdf,20210401_Quilter_PLC-_M-A_Call_2021-4-1_RT000000002954187686.pdf,20171219_Quilter_PLC-_M-A_Call_2017-12-19_SD000000002919419964.pdf,20210310_Quilter_PLC-_Earnings_Call_2021-3-10_RT000000002952420637.pdf,20140904_Sanlam_Ltd-_Earnings_Call_2014-9-4_DN000000002170588806.pdf,...,20140424_Everest_Re_Group_Ltd-_Earnings_Call_2014-4-24_SD000000002701577115.pdf,20211028_Everest_Re_Group_Ltd-_Earnings_Call_2021-10-28_DN000000002966766153.pdf,20170207_Everest_Re_Group_Ltd-_Earnings_Call_2017-2-7_FS000000002327179820.pdf,20160204_Everest_Re_Group_Ltd-_Earnings_Call_2016-2-4_FS000000002251252548.pdf,20140724_Everest_Re_Group_Ltd-_Earnings_Call_2014-7-24_FS000000002439093631.pdf,20171031_Everest_Re_Group_Ltd-_Earnings_Call_2017-10-31_DN000000002379633496.pdf,20150428_Everest_Re_Group_Ltd-_Earnings_Call_2015-4-28_FS000000002207058997.pdf,20120726_Everest_Re_Group_Ltd-_Earnings_Call_2012-7-26_SD000000002701554249.pdf,20121025_Everest_Re_Group_Ltd-_Earnings_Call_2012-10-25_SD000000002719409327.pdf,20130724_Everest_Re_Group_Ltd-_Earnings_Call_2013-7-24_SD000000002719418375.pdf
0,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul W. Feeney,Operator,Management Business of Old Mutual Wealth to TA,Operator,S1 2014 Earnings Call,...,Operator,Operator,Operator,Operator,Operator,Operator,Operator,Operator,Operator,Operator
1,Good morning everybody. Welcome to our First F...,"Hello, and good morning, everyone. Thank you f...","Good morning, everybody. We'll follow the usua...","Good morning, everyone. And you'll understand ...","Good morning, everyone. Welcome to those of yo...",Welcome to our first results presentation as a...,(Call Starts Abruptly) This conference call is...,Associates,"Good morning, everyone. Given the circumstance...",Johan van Zyl,...,"Good morning, everyone and welcome to the Ever...",Welcome to the Everest Re Group Earnings Confe...,"Good day, everyone. Welcome to the Fourth Quar...","Good day, and welcome to the fourth quarter 20...","Good day, everyone and welcome to the Second Q...","Good day, everyone. Welcome to the Third Quart...","Good day, everyone, and welcome to the First Q...","Good day, everyone. Welcome to the Everest Re ...",Good day everyone and welcome to the Everest R...,"Good day, everyone. Welcome to the Second Quar..."
2,"know, we announced the CFO transition back in ...",for our Interim Results Presentation. Hopefull...,summary of 2019 and I'll spend a bit of time o...,today. But the broad format is similar to what...,and to those of you who have joined us on the ...,a bit early this morning. We know that one of ...,Paul Feeney. Please begin your meeting.,Operator,advance. I'll cover the business highlights an...,Welcome to our Interim Results Presentation. A...,...,2014 earnings call. Today's conference is bein...,are in a listen-only mode. After the speaker p...,Group Ltd. Today's conference is being recorded.,Limited. Today's conference is being recorded.,Re Group. Today's conference is being recorded...,Group Limited. Today's conference is being rec...,Group Limited. Today's conference is being rec...,earnings release call. Today's conference is b...,earnings conference release call.,Group. Today's conference is being recorded. A...
3,Tim Tookey. So you'll hear from both of them t...,"to a virtual format, and I look forward to mee...",to better position us for the growth opportuni...,"the business highlights, Mark will then talk t...",follow our usual format this morning. I'll giv...,and we wanted to make sure that those of you w...,Paul Feeney,Welcome to the Old Mutual Wealth update. (Oper...,talk to the financial performance and then we'...,"Johannesburg. More interest, more money here i...",...,remarks and introductions I would like to turn...,"Levenson, Head of Investor Relations. Please g...","At this time, for opening remarks and introduc...","At this time, I would like to turn the confere...","introductions, I would like to turn the confer...","and introductions, I'd like to turn the confer...","and introductions, I'd like to turn the confer...","remarks and introductions, I would like to tur...",[Operator Instructions],"introductions, I would like to turn the confer..."
4,"financials, and Mark will go through our optim...",you and your families are keeping safe and well.,through the financials and our progress on opt...,take questions.,performance and on the proposed sale of Quilte...,make both presentations.,Good morning all. Many thanks for joining us a...,the call is being recorded. I'm now pleased to...,"take your questions. And for the Q&A, I'll als...","Before we start with the presentation, I'd lik...",...,President of Investor Relations. Please go ahe...,Jon Levenson,"over to Ms. Beth Farrell, Vice President of In...",Investor Relations. Please go ahead.,Investor Relations. Please go ahead.,Investor Relations. Please go ahead.,Investor Relations. Please go ahead.,Beth Farrell,"As a reminder, today's presentation is being r...","Investor Relations. Please go ahead, ma'am."


## All participants

In [4]:
# get the value inside the all_participants 
all_participants = [item for sublist in all_participants for item in sublist]
all_participants = [i[0] for i in all_participants]
# print(all_participants)
# %%
# exclude the title of the participants, i.e.'Roland Vogel, CFO' to 'Roland Vogel" by using re
all_participants = [re.sub(r'\,.*', '', participant) for participant in all_participants]
# exclude the 'Property & Casualty Reinsurance'
all_participants = [re.sub(r'Property & Casualty Reinsurance', '', participant) for participant in all_participants]
# exclude the '[0682QB-E Ulrich Wallin]'
all_participants = [re.sub(r'\[0682QB-E Ulrich Wallin\]', '', participant) for participant in all_participants]
# drop duplicated participants
# all_participants = [i[0] for i in all_participants]
# drop the empty string
all_participants = [participant for participant in all_participants if participant != '']
# remove the sapce in the string
all_participants = [participant.strip() for participant in all_participants]
# add the 'Operator' to the list
all_participants.append('Operator')

# drop the duplicated participants
all_participants_copy = all_participants.copy()
all_participants = []
# drop the duplicated participants
for i in all_participants_copy: 
    if i not in all_participants: 
        all_participants.append(i) 

all_participants = sorted(all_participants)

## new_df

In [5]:
new_df = pd.DataFrame()
# identify the len before NaN of each column
for column in df_clean_na.columns:
    # end_index = len(df_clean_na[column])-df_clean_na.isnull().sum(axis = 0)[column]-1
    # identify all the rows in df with all_participants in it
    both_participants_row_index = df_clean_na[df_clean_na[column].isin(all_participants)].index.tolist()
    # # append the end_index to the end of both_participants_row_index
    # both_participants_row_index.append(end_index)
    # apply the both_participants_row_index to the df_clean_na['participants']
    new_df[column] = df_clean_na[column]
    new_df[f"participants_{column}"] = df_clean_na[column].apply(lambda x: x if x in all_participants else np.nan)
    # fill the NaN with the value of the previous row
    new_df[f"participants_{column}"] = new_df[f"participants_{column}"].fillna(method='ffill')
    # # exclude the row if pure_df[column]==pure_df[f"participants_{column}"]
    # pure_df = pure_df[pure_df[column] != pure_df[f"participants_{column}"]]
new_df

  new_df[column] = df_clean_na[column]
  new_df[f"participants_{column}"] = df_clean_na[column].apply(lambda x: x if x in all_participants else np.nan)


Unnamed: 0,20190312_Quilter_PLC-_Earnings_Call_2019-3-12_RT000000002903022737.pdf,participants_20190312_Quilter_PLC-_Earnings_Call_2019-3-12_RT000000002903022737.pdf,20210811_Quilter_PLC-_Earnings_Call_2021-8-11_RT000000002962310912.pdf,participants_20210811_Quilter_PLC-_Earnings_Call_2021-8-11_RT000000002962310912.pdf,20200311_Quilter_PLC-_Earnings_Call_2020-3-11_DN000000002822346246.pdf,participants_20200311_Quilter_PLC-_Earnings_Call_2020-3-11_DN000000002822346246.pdf,20200811_Quilter_PLC-_Earnings_Call_2020-8-11_DN000000002883967732.pdf,participants_20200811_Quilter_PLC-_Earnings_Call_2020-8-11_DN000000002883967732.pdf,20190805_Quilter_PLC-_Earnings_Call_2019-8-5_RT000000002897819838.pdf,participants_20190805_Quilter_PLC-_Earnings_Call_2019-8-5_RT000000002897819838.pdf,...,20171031_Everest_Re_Group_Ltd-_Earnings_Call_2017-10-31_DN000000002379633496.pdf,participants_20171031_Everest_Re_Group_Ltd-_Earnings_Call_2017-10-31_DN000000002379633496.pdf,20150428_Everest_Re_Group_Ltd-_Earnings_Call_2015-4-28_FS000000002207058997.pdf,participants_20150428_Everest_Re_Group_Ltd-_Earnings_Call_2015-4-28_FS000000002207058997.pdf,20120726_Everest_Re_Group_Ltd-_Earnings_Call_2012-7-26_SD000000002701554249.pdf,participants_20120726_Everest_Re_Group_Ltd-_Earnings_Call_2012-7-26_SD000000002701554249.pdf,20121025_Everest_Re_Group_Ltd-_Earnings_Call_2012-10-25_SD000000002719409327.pdf,participants_20121025_Everest_Re_Group_Ltd-_Earnings_Call_2012-10-25_SD000000002719409327.pdf,20130724_Everest_Re_Group_Ltd-_Earnings_Call_2013-7-24_SD000000002719418375.pdf,participants_20130724_Everest_Re_Group_Ltd-_Earnings_Call_2013-7-24_SD000000002719418375.pdf
0,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,Paul Feeney,...,Operator,Operator,Operator,Operator,Operator,Operator,Operator,Operator,Operator,Operator
1,Good morning everybody. Welcome to our First F...,Paul Feeney,"Hello, and good morning, everyone. Thank you f...",Paul Feeney,"Good morning, everybody. We'll follow the usua...",Paul Feeney,"Good morning, everyone. And you'll understand ...",Paul Feeney,"Good morning, everyone. Welcome to those of yo...",Paul Feeney,...,"Good day, everyone. Welcome to the Third Quart...",Operator,"Good day, everyone, and welcome to the First Q...",Operator,"Good day, everyone. Welcome to the Everest Re ...",Operator,Good day everyone and welcome to the Everest R...,Operator,"Good day, everyone. Welcome to the Second Quar...",Operator
2,"know, we announced the CFO transition back in ...",Paul Feeney,for our Interim Results Presentation. Hopefull...,Paul Feeney,summary of 2019 and I'll spend a bit of time o...,Paul Feeney,today. But the broad format is similar to what...,Paul Feeney,and to those of you who have joined us on the ...,Paul Feeney,...,Group Limited. Today's conference is being rec...,Operator,Group Limited. Today's conference is being rec...,Operator,earnings release call. Today's conference is b...,Operator,earnings conference release call.,Operator,Group. Today's conference is being recorded. A...,Operator
3,Tim Tookey. So you'll hear from both of them t...,Paul Feeney,"to a virtual format, and I look forward to mee...",Paul Feeney,to better position us for the growth opportuni...,Paul Feeney,"the business highlights, Mark will then talk t...",Paul Feeney,follow our usual format this morning. I'll giv...,Paul Feeney,...,"and introductions, I'd like to turn the confer...",Operator,"and introductions, I'd like to turn the confer...",Operator,"remarks and introductions, I would like to tur...",Operator,[Operator Instructions],Operator,"introductions, I would like to turn the confer...",Operator
4,"financials, and Mark will go through our optim...",Paul Feeney,you and your families are keeping safe and well.,Paul Feeney,through the financials and our progress on opt...,Paul Feeney,take questions.,Paul Feeney,performance and on the proposed sale of Quilte...,Paul Feeney,...,Investor Relations. Please go ahead.,Operator,Investor Relations. Please go ahead.,Operator,Beth Farrell,Beth Farrell,"As a reminder, today's presentation is being r...",Operator,"Investor Relations. Please go ahead, ma'am.",Operator
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,,Paul Feeney,,Paul Feeney,,Paul Feeney,,Jens Ehrenberg,,Andy McGlone,...,,John Doucette,,Amit Kumar,,Operator,,Matt Heimermann,,Operator
2496,,Paul Feeney,,Paul Feeney,,Paul Feeney,,Jens Ehrenberg,,Andy McGlone,...,,John Doucette,,Amit Kumar,,Operator,,Matt Heimermann,,Operator
2497,,Paul Feeney,,Paul Feeney,,Paul Feeney,,Jens Ehrenberg,,Andy McGlone,...,,John Doucette,,Amit Kumar,,Operator,,Matt Heimermann,,Operator
2498,,Paul Feeney,,Paul Feeney,,Paul Feeney,,Jens Ehrenberg,,Andy McGlone,...,,John Doucette,,Amit Kumar,,Operator,,Matt Heimermann,,Operator


## pure_df

In [6]:
pure_df = pd.DataFrame()
# identify the len before NaN of each column
for column in df_clean_na.columns:
    # exclude the row if pure_df[column]==pure_df[f"participants_{column}"]
    pure_df = new_df[new_df[column] != new_df[f"participants_{column}"]]
# drop the column if the column start with participants
pure_df = pure_df.drop(pure_df.columns[pure_df.columns.str.startswith('participants_')], axis=1).T

# append the text of each roll into one string by using s.str.cat(sep='. ')
pure_df = pure_df.apply(lambda x: x.str.cat(sep='. '), axis=1)
# change the pure_df to dataframe
pure_df = pd.DataFrame(pure_df)
# rename the column
pure_df.columns = ['meeting_text']
# extract the index as column from the text
pure_df['file_name'] = pure_df.index
# extract the date from the index column
pure_df['date'] = pure_df['file_name'].apply(lambda x: x.split('_')[0])
# change the date column to datetime
pure_df['date'] = pd.to_datetime(pure_df['date'])
# reset the index
pure_df = pure_df.reset_index(drop=True)
pure_df


Unnamed: 0,meeting_text,file_name,date
0,Good morning everybody. Welcome to our First F...,20190312_Quilter_PLC-_Earnings_Call_2019-3-12_...,2019-03-12
1,"Hello, and good morning, everyone. Thank you f...",20210811_Quilter_PLC-_Earnings_Call_2021-8-11_...,2021-08-11
2,"Good morning, everybody. We'll follow the usua...",20200311_Quilter_PLC-_Earnings_Call_2020-3-11_...,2020-03-11
3,"Good morning, everyone. And you'll understand ...",20200811_Quilter_PLC-_Earnings_Call_2020-8-11_...,2020-08-11
4,"Good morning, everyone. Welcome to those of yo...",20190805_Quilter_PLC-_Earnings_Call_2019-8-5_R...,2019-08-05
...,...,...,...
1830,"Good day, everyone. Welcome to the Third Quart...",20171031_Everest_Re_Group_Ltd-_Earnings_Call_2...,2017-10-31
1831,"Good day, everyone, and welcome to the First Q...",20150428_Everest_Re_Group_Ltd-_Earnings_Call_2...,2015-04-28
1832,"Good day, everyone. Welcome to the Everest Re ...",20120726_Everest_Re_Group_Ltd-_Earnings_Call_2...,2012-07-26
1833,Good day everyone and welcome to the Everest R...,20121025_Everest_Re_Group_Ltd-_Earnings_Call_2...,2012-10-25


## take a sample

In [7]:
df_sample = pure_df.sample(frac=0.50)

In [8]:
df_sample

Unnamed: 0,meeting_text,file_name,date
749,"Ladies and gentlemen. Good afternoon. Welcome,...",20190424_AXA_SA-_Shareholder_Mtg_Call_2019-4-2...,2019-04-24
1492,Greetings. Welcome to the Markel Corporation F...,20120510_Markel_Corp-_Earnings_Call_2012-5-10_...,2012-05-10
446,Good morning or good afternoon. Welcome to Swi...,20190731_Swiss_Re_AG-_Earnings_Call_2019-7-31_...,2019-07-31
1438,"Market Cap: 69,035.17. Current PX: 148.33. YTD...",20170726_Chubb_Ltd-_Earnings_Call_2017-7-26_FS...,2017-07-26
337,"Yasuyoshi Karasawa. Good afternoon, ladies and...",20190524_MS-AD_INSURANCE-_Guidance_Call_2019-5...,2019-05-24
...,...,...,...
423,"Ladies and gentlemen, good morning or good aft...",20160429_Swiss_Re_AG-_Earnings_Call_2016-4-29_...,2016-04-29
96,"Welcome, everyone, to our presentation today. ...",20200526_Aviva_PLC-_Shareholder_Mtg_Call_2020-...,2020-05-26
40,"Operator. (Operator Instructions) Good day, la...",20191204_Sanlam_Ltd-_Sales_Results_Call_2019-1...,2019-12-04
605,"Greetings, and welcome to the Lancashire Holdi...",20150729_LANCASHIRE_HOLDI-_Earnings_Call_2015-...,2015-07-29


In [10]:
df_sample.to_csv('df_sample.csv')