# QA files for regression prepration

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import re
import string 

import nltk 
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords          # module for stop words that come with NLTK
nltk.download('stopwords')
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

# ignore warning
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.filterwarnings('ignore')

# for stock price
import yfinance as yf
from yahoofinancials import YahooFinancials


[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/timliu/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# read the csv file 
topic_modelling_df_QA = pd.read_csv('./main_df_output/QA14_topic_modelling_df.csv')

# Define Functions

In [3]:
# clean text
def process_text(text):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    #text = text.str
    text = str(text)
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'#', '', text)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    text_tokens = tokenizer.tokenize(text)

    text_clean = []
    for word in text_tokens:
        if (word not in stopwords_english and  
                word not in string.punctuation): 
            stem_word = stemmer.stem(word)  # stemming word
            text_clean.append(stem_word)
            
    sentence = ' '.join(text_clean)
    
    return sentence

In [4]:
# calculate the stock price change
def stock_price_change(stock_list, start_date, end_date):
    # function for find price change
    def price_change(aapl_df, n_day):
        before = []
        after = []
        for i in range(len(aapl_df)):
            # price on that date
            prc_tdy = aapl_df['Close'][i]

            if ((i >= n_day) & 
                (i < (len(aapl_df)-n_day))):
                # price change before n_day
                prc_before = aapl_df['Close'][i-n_day]
                prc_change_before = (prc_tdy - prc_before)/prc_before
                # price change after n_day
                prc_after = aapl_df['Close'][i+n_day]
                prc_change_after = (prc_after - prc_tdy)/prc_tdy
            else:
                prc_change_before = 0
                prc_change_after = 0
            # append into list
            before.append(prc_change_before)
            after.append(prc_change_after)
        return before, after
        
    # tickers and its closing stock price
    stock_df = pd.DataFrame()
    for i in stock_list:
        aapl_df = yf.download(i, 
                        start= start_date,  # start='2010-12-01', # start 1 month before
                        end = end_date,   # end='2022-01-30', # end 1 month later
                        progress=False,)
        aapl_df = aapl_df.reset_index(drop=False)
        # add ticker name
        aapl_df['ticker']=i
        # add price change
        # aapl_df['D0'] = aapl_df['Close']
        aapl_df['D-1'],aapl_df['D+1'] = price_change(aapl_df, 1)
        aapl_df['D-2'],aapl_df['D+2'] = price_change(aapl_df, 2)
        aapl_df['D-3'],aapl_df['D+3'] = price_change(aapl_df, 3)
        aapl_df['D-5'],aapl_df['D+5'] = price_change(aapl_df, 5)
        aapl_df['D-10'],aapl_df['D+10'] = price_change(aapl_df, 10)
        aapl_df['D-15'],aapl_df['D+15'] = price_change(aapl_df, 15)
        # append into one dataframe 
        stock_df = stock_df.append(aapl_df)
        
    # drop redundancy columns
    stock_price_df = stock_df.drop(columns=['Open', 'High','Low','Adj Close','Volume'])
    stock_price_df = stock_price_df.rename(columns={'Date': 'date'})
    # drop duplicated rows
    stock_price_df = stock_price_df.drop_duplicates()
    return stock_price_df

# Clean text and splitting to sentence

In [5]:
df = topic_modelling_df_QA.drop(['participants','idx','company_paticipants_yes','other_paticipants_yes',
                'paraghrph_noun', 'word_count', 'char_count', 'sentence_count',
                'avg_word_length', 'avg_sentence_length'], axis = 1)
df = df.rename(columns = {'paraghrph':'paraghraph','paraghrph_clean':'paraghraph_clean'})
df = df.sort_values(by=['file_name'])
df = df.reset_index(drop=True)

# splotting into the sentences
df['paraghraph'] = df['paraghraph'].apply(lambda r: r.replace("Mr. ","Mr."))
df['sentence'] = df['paraghraph'].apply(lambda r: r.split(". "))
df = df.explode('sentence')

# Apply the function to clean the text
df['clean_text'] = df['sentence'].apply(lambda r: process_text(r))
df.head(5)

df.head(5)

Unnamed: 0,file_name,date,company_name,paraghraph,paraghraph_clean,topic_1,topic_2,topic_3,topic_4,topic_5,...,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,sentence,clean_text
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.114488,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,"James Quin, Citigroup",jame quin citigroup
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.114488,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,James Quin 3878205 Three numbers questions p...,jame quin 3878205 three number question pleas
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.114488,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,The first one is,first one
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.114488,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,And I know this is something that's come up on...,know someth that' come previou call
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.114488,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,"But I'm going to ask you anyway, is just could...",i'm go ask anyway could give us sens prior rel...


# Weighted topic Probability by files

In [11]:
topic_df = df.copy()
# cal len paragraph
topic_df['len_para'] = topic_df['paraghraph'].apply(lambda r: len(r))

In [12]:
cal_len_docs = topic_df[['file_name','len_para']]
# Use GroupBy() to compute the sum of the document
cal_len_docs = cal_len_docs.groupby('file_name').sum()
cal_len_docs = cal_len_docs.rename(columns={"len_para":"len_docs"})
cal_len_docs = cal_len_docs.reset_index(drop=False)
cal_len_docs.head(5)

Unnamed: 0,file_name,len_docs
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2003760
1,20110202_LEGAL_-_GEN_GRP-_Guidance_Call_2011-2...,494566
2,20110203_Markel_Corp-_Earnings_Call_2011-2-3_S...,1775115
3,20110209_INTACT_FINANCIAL-_Earnings_Call_2011-...,1178324
4,20110209_Sampo_Oyj-_Earnings_Call_2011-2-9_SD0...,1372839


In [13]:
# calculate weighted average for topic probability
# -----> merge the cal_len_docs
merge_df = cal_len_docs.merge(topic_df, how='inner', on='file_name')
# -----> caculate the weight of the sentiment
merge_df['weighted']=merge_df['len_para']/merge_df['len_docs']


In [16]:
merge_df.head(1)

Unnamed: 0,file_name,len_docs,date,company_name,paraghraph,paraghraph_clean,topic_1,topic_2,topic_3,topic_4,...,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,sentence,clean_text,len_para,weighted
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2003760,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,...,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,"James Quin, Citigroup",jame quin citigroup,7785,0.003885


In [17]:
# -----> give the weighted average of the probability based on the paragraph length
for i in range (1,15):
    merge_df['topic_'+str(i)] = merge_df['topic_'+str(i)]*merge_df['weighted']
# -----> sum WA sentiment of each files
topic_df = merge_df.copy()
topic_df = topic_df.groupby(['file_name','date','company_name']).sum().reset_index(drop=False)
# drop the column: 'len_docs', 'len_para', 'weighted'
topic_df = topic_df.drop(['len_docs','len_para','weighted'], axis = 1)
topic_df

Unnamed: 0,file_name,date,company_name,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,0.096250,0.132992,0.020148,0.033265,0.117693,0.168051,0.048251,0.113200,0.018714,0.020129,0.152547,0.016333,0.053487,0.008941
1,20110202_LEGAL_-_GEN_GRP-_Guidance_Call_2011-2...,2011-02-02,LEGAL -,0.127617,0.076979,0.022573,0.029721,0.041253,0.094814,0.009192,0.041482,0.129323,0.013827,0.130798,0.062614,0.198993,0.020814
2,20110203_Markel_Corp-_Earnings_Call_2011-2-3_S...,2011-02-03,Markel Corp-,0.050926,0.004667,0.031020,0.095309,0.236571,0.030839,0.035759,0.031255,0.109916,0.229294,0.060519,0.019064,0.045202,0.019659
3,20110209_INTACT_FINANCIAL-_Earnings_Call_2011-...,2011-02-09,INTACT FINANCIAL-,0.038856,0.068618,0.031789,0.034245,0.084972,0.014419,0.213693,0.098281,0.062604,0.111896,0.041733,0.025447,0.162875,0.010570
4,20110209_Sampo_Oyj-_Earnings_Call_2011-2-9_SD0...,2011-02-09,Sampo Oyj-,0.174198,0.008258,0.010340,0.023951,0.019675,0.136340,0.060804,0.073679,0.014802,0.021470,0.092501,0.207364,0.141348,0.015269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429,20220310_Sanlam_Ltd-_Earnings_Call_2022-3-10_D...,2022-03-10,Sanlam Ltd-,0.032937,0.030364,0.128885,0.004561,0.034618,0.107990,0.204698,0.181052,0.006860,0.026012,0.055976,0.161205,0.006686,0.018156
1430,20220323_Poste_Italiane_SpA-_Earnings_Call_202...,2022-03-23,Poste Italiane,0.100787,0.010836,0.018120,0.058726,0.067110,0.104778,0.001195,0.175167,0.001515,0.008475,0.076943,0.062321,0.113932,0.200095
1431,20220324_Helvetia_Holding_AG-_Earnings_Call_20...,2022-03-24,Helvetia Holding,0.104347,0.014689,0.020236,0.164458,0.124871,0.137107,0.022293,0.031143,0.021055,0.038728,0.023075,0.037544,0.041679,0.218777
1432,20220426_Tryg_A-S-_Earnings_Call_2022-4-26_DN0...,2022-04-26,Tryg A-S-,0.149920,0.003449,0.017675,0.010974,0.032678,0.029958,0.554558,0.004351,0.038974,0.014372,0.029776,0.058212,0.030452,0.024651


# Weighted Sentiment Score by files

In [18]:
import pickle
sentiment = pickle.load(open('sentiment_model.pkl', 'rb'))
df['sentiment'] = sentiment.predict(df['clean_text'])
df.head(3)

Unnamed: 0,file_name,date,company_name,paraghraph,paraghraph_clean,topic_1,topic_2,topic_3,topic_4,topic_5,...,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,sentence,clean_text,sentiment
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,"James Quin, Citigroup",jame quin citigroup,1
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,James Quin 3878205 Three numbers questions p...,jame quin 3878205 three number question pleas,1
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.146916,0.003557,0.004218,0.162122,0.003528,0.003467,0.002503,The first one is,first one,1


In [19]:
# Weight of each sentimentc
# cal len paragraph
df['len_para'] = df['paraghraph'].apply(lambda r: len(r))
# cal len sentence 
df['len_sent'] = df['sentence'].apply(lambda r: len(r))
df.head(3)

Unnamed: 0,file_name,date,company_name,paraghraph,paraghraph_clean,topic_1,topic_2,topic_3,topic_4,topic_5,...,topic_10,topic_11,topic_12,topic_13,topic_14,sentence,clean_text,sentiment,len_para,len_sent
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.004218,0.162122,0.003528,0.003467,0.002503,"James Quin, Citigroup",jame quin citigroup,1,7785,25
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.004218,0.162122,0.003528,0.003467,0.002503,James Quin 3878205 Three numbers questions p...,jame quin 3878205 three number question pleas,1,7785,51
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,"James Quin, Citigroup. James Quin 3878205...",James Quin Citigroup James Quin Three n...,0.146993,0.001922,0.033852,0.066184,0.307268,...,0.004218,0.162122,0.003528,0.003467,0.002503,The first one is,first one,1,7785,16


In [20]:
cal_len_docs = df[['file_name','len_para']]
# Use GroupBy() to compute the sum of the document
cal_len_docs = cal_len_docs.groupby('file_name').sum()
cal_len_docs = cal_len_docs.rename(columns={"len_para":"len_docs"})
cal_len_docs = cal_len_docs.reset_index(drop=False)
cal_len_docs.head(5)

Unnamed: 0,file_name,len_docs
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2003760
1,20110202_LEGAL_-_GEN_GRP-_Guidance_Call_2011-2...,494566
2,20110203_Markel_Corp-_Earnings_Call_2011-2-3_S...,1775115
3,20110209_INTACT_FINANCIAL-_Earnings_Call_2011-...,1178324
4,20110209_Sampo_Oyj-_Earnings_Call_2011-2-9_SD0...,1372839


In [21]:
# calculate weighted average for sentiment
# -----> merge the cal_len_docs
merge_df = cal_len_docs.merge(df, how='inner', on='file_name')
# -----> caculate the weight of the sentiment
merge_df['weighted']=merge_df['len_sent']/merge_df['len_docs']
# -----> give the weighted average of the sentiment score based on the sentence
merge_df['WA_sentiment'] = merge_df['weighted']*merge_df['sentiment']
# -----> sum WA sentiment of each files
sentiment_df = merge_df[['file_name','date','company_name','WA_sentiment']]
sentiment_df = sentiment_df.groupby(['file_name','date','company_name']).sum().reset_index(drop=False)
sentiment_df

Unnamed: 0,file_name,date,company_name,WA_sentiment
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,0.003787
1,20110202_LEGAL_-_GEN_GRP-_Guidance_Call_2011-2...,2011-02-02,LEGAL -,0.014049
2,20110203_Markel_Corp-_Earnings_Call_2011-2-3_S...,2011-02-03,Markel Corp-,0.007116
3,20110209_INTACT_FINANCIAL-_Earnings_Call_2011-...,2011-02-09,INTACT FINANCIAL-,0.003145
4,20110209_Sampo_Oyj-_Earnings_Call_2011-2-9_SD0...,2011-02-09,Sampo Oyj-,0.000196
...,...,...,...,...
1429,20220310_Sanlam_Ltd-_Earnings_Call_2022-3-10_D...,2022-03-10,Sanlam Ltd-,0.005986
1430,20220323_Poste_Italiane_SpA-_Earnings_Call_202...,2022-03-23,Poste Italiane,0.002872
1431,20220324_Helvetia_Holding_AG-_Earnings_Call_20...,2022-03-24,Helvetia Holding,0.005447
1432,20220426_Tryg_A-S-_Earnings_Call_2022-4-26_DN0...,2022-04-26,Tryg A-S-,0.004821


# merge the sentiment_df and topic_df together

In [22]:
# merge the sentiment_df and topic_df together
QA_model_df = topic_df.merge(sentiment_df, how='inner', on=['file_name','date','company_name'])
QA_model_df

Unnamed: 0,file_name,date,company_name,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,WA_sentiment
0,20110202 _Mapfre_SA_Earnings_Call_SD0000000027...,2011-02-02,Mapfre SA,0.096250,0.132992,0.020148,0.033265,0.117693,0.168051,0.048251,0.113200,0.018714,0.020129,0.152547,0.016333,0.053487,0.008941,0.003787
1,20110202_LEGAL_-_GEN_GRP-_Guidance_Call_2011-2...,2011-02-02,LEGAL -,0.127617,0.076979,0.022573,0.029721,0.041253,0.094814,0.009192,0.041482,0.129323,0.013827,0.130798,0.062614,0.198993,0.020814,0.014049
2,20110203_Markel_Corp-_Earnings_Call_2011-2-3_S...,2011-02-03,Markel Corp-,0.050926,0.004667,0.031020,0.095309,0.236571,0.030839,0.035759,0.031255,0.109916,0.229294,0.060519,0.019064,0.045202,0.019659,0.007116
3,20110209_INTACT_FINANCIAL-_Earnings_Call_2011-...,2011-02-09,INTACT FINANCIAL-,0.038856,0.068618,0.031789,0.034245,0.084972,0.014419,0.213693,0.098281,0.062604,0.111896,0.041733,0.025447,0.162875,0.010570,0.003145
4,20110209_Sampo_Oyj-_Earnings_Call_2011-2-9_SD0...,2011-02-09,Sampo Oyj-,0.174198,0.008258,0.010340,0.023951,0.019675,0.136340,0.060804,0.073679,0.014802,0.021470,0.092501,0.207364,0.141348,0.015269,0.000196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1429,20220310_Sanlam_Ltd-_Earnings_Call_2022-3-10_D...,2022-03-10,Sanlam Ltd-,0.032937,0.030364,0.128885,0.004561,0.034618,0.107990,0.204698,0.181052,0.006860,0.026012,0.055976,0.161205,0.006686,0.018156,0.005986
1430,20220323_Poste_Italiane_SpA-_Earnings_Call_202...,2022-03-23,Poste Italiane,0.100787,0.010836,0.018120,0.058726,0.067110,0.104778,0.001195,0.175167,0.001515,0.008475,0.076943,0.062321,0.113932,0.200095,0.002872
1431,20220324_Helvetia_Holding_AG-_Earnings_Call_20...,2022-03-24,Helvetia Holding,0.104347,0.014689,0.020236,0.164458,0.124871,0.137107,0.022293,0.031143,0.021055,0.038728,0.023075,0.037544,0.041679,0.218777,0.005447
1432,20220426_Tryg_A-S-_Earnings_Call_2022-4-26_DN0...,2022-04-26,Tryg A-S-,0.149920,0.003449,0.017675,0.010974,0.032678,0.029958,0.554558,0.004351,0.038974,0.014372,0.029776,0.058212,0.030452,0.024651,0.004821


# Stock Price

In [23]:
# List all yahoo tickers
yahoo_ticker_list = [
    # Motor/Personal
    'ADM.L','DLG.L','SBRE.L','SAGA.L','AGS.BR',
    # Global Commercial
    'ALV.DE','CS.PA','ZURN.SW','G.MI',
    # London Market 
    'BEZ.L','HSX.L','LRE.L',
    # LN Equity 
    'LLOY.L',
    # US - Specialty/P&C/Reinsurance
    'AIG','AXS','TRV','ACGL','RNR','RE','MKL','HIG','ARGO','BRK-B','CB',
    # European (Re)Insurers
    'SCR.PA','MUV2.DE','SREN.SW','HNR1.DE',
    # Japanese & Pacific
    '8766.T','8630.T','8725.T','QBE.AX',
    # Run-off
    'ESGR','FFH.TO','RQIH.L',
    # Life Groups & Retail Life
    'PRU.L','MNG.L','LGEN.L','AV.L','PHNX.L','QLT.L','JUST.L','STJ.L','AGN.AS','DSY.JO','SLM.JO',
    # Other insurers
    'STB.OL','CNP.PA','GJF.OL','PST.MI','NN.AS','TOP.CO','BALN.SW',
    'SAMPO.HE','MAP.MC','TRYG.CO','0RHS.IL','HELN.SW','IFC.TO'
]

# Match it with company names from BoE
Insurer_Names_df = pd.read_excel('./input/Insurer_Names_for_possible_NLP_analysis.xlsx')  
Insurer_Names = Insurer_Names_df.drop(['Unnamed: 0', 'Unnamed: 4'], axis=1)
Insurer_Names = Insurer_Names.drop([0])
Insurer_Names.rename(columns={"Unnamed: 1": "Company", "Unnamed: 2": "bb ticker", "Unnamed: 3":"Group"}, inplace = True)
Insurer_Names['yahoo ticker'] = yahoo_ticker_list
Insurer_Names.head(5)
# Exported as CSV to manually matched with the extracted file names (column 'company_name' in model_df)
# Insurer_Names[['Company']].to_csv('./main_df_output/company_name_match.csv', index = False)

Unnamed: 0,Company,bb ticker,Group,yahoo ticker
1,ADMIRAL GROUP,ADM LN,Motor/Personal,ADM.L
2,DIRECT LINE INSU,DLG LN,Motor/Personal,DLG.L
3,SABRE INSUR,SBRE LN,Motor/Personal,SBRE.L
4,SAGA PLC,SAGA LN,Motor/Personal,SAGA.L
5,AGEAS,AGS BB,Motor/Personal,AGS.BR


In [24]:
company_name_match = pd.read_csv('./input/company_name_match.csv')
company_name_match.rename(columns={"Company": "company_name"}, inplace = True)
company_name_match

Groups = {
        "ADMIRAL GROUP" : "Motor/Personal", "DIRECT LINE INSU" : "Motor/Personal", 
        "SABRE INSUR" : "Motor/Personal", "SAGA PLC" : "Motor/Personal",
        "AGEAS" : "Motor/Personal",

        "ALLIANZ SE-REG" : "Global Commercial", "AXA" : "Global Commercial", 
        "ZURICH INSURANCE" : "Global Commercial", "GENERALI ASSIC" : "Global Commercial",

        "BEAZLEY PLC" : "London Market", "HISCOX LTD" : "London Market", 
        "LANCASHIRE HOLDI" : "London Market",
        
        "Society of Lloyd’s" : "LN Equity",

        "AMERICAN INTERNA" : "US - Specialty/P&C/Reinsurance", "AXIS CAPITAL" : "US - Specialty/P&C/Reinsurance",
        "TRAVELERS COS IN" : "US - Specialty/P&C/Reinsurance", "ARCH CAPITAL GRP" : "US - Specialty/P&C/Reinsurance",
        "RENAISSANCERE" : "US - Specialty/P&C/Reinsurance", "EVEREST RE GROUP" : "US - Specialty/P&C/Reinsurance",
        "MARKEL CORP" : "US - Specialty/P&C/Reinsurance", "HARTFORD FINL SV" : "US - Specialty/P&C/Reinsurance",
        "ARGO GROUP INTER" : "US - Specialty/P&C/Reinsurance", "BERKSHIRE HATH-B" : "US - Specialty/P&C/Reinsurance",
        "CHUBB LTD" : "US - Specialty/P&C/Reinsurance",

        "SCOR SE" : "European (Re)Insurers", "MUENCHENER RUE-R" : "European (Re)Insurers", 
        "SWISS RE AG" : "European (Re)Insurers", "HANNOVER RUECK S" : "European (Re)Insurers",

        "TOKIO MARINE HD" : "Japanese & Pacific", "SOMPO HOLDINGS I" : "Japanese & Pacific",
        "MS&AD INSURANCE" : "Japanese & Pacific", "QBE INSURANCE" : "Japanese & Pacific",

        "ENSTAR GROUP LTD" : "Run-off", "FAIRFAX FINL HLD" : "Run-off", "RANDALL & QUILTE" : "Run-off",

        "PRUDENTIAL PLC" : "Life Groups & Retail Life", "M&G PLC" : "Life Groups & Retail Life",
        "LEGAL & GEN GRP" : "Life Groups & Retail Life", "AVIVA PLC" : "Life Groups & Retail Life",
        "PHOENIX GROUP HO" : "Life Groups & Retail Life", "QUILTER PLC" : "Life Groups & Retail Life",
        "JUST GROUP" : "Life Groups & Retail Life", "ST JAMES'S PLACE" : "Life Groups & Retail Life",
        "AEGON NV" : "Life Groups & Retail Life", "DISCOVERY LTD" : "Life Groups & Retail Life",
        "SANLAM LTD" : "Life Groups & Retail Life",

        "STOREBRAND ASA" : "Other insurers", "CNP ASSURANCES" : "Other insurers",
        "GJENSIDIGE FORSI" : "Other insurers", "POSTE ITALIANE" : "Other insurers",
        "NN GROUP" : "Other insurers", "TOPDANMARK A/S" : "Other insurers",
        "BALOISE HOL-REG" : "Other insurers", "SAMPO OYJ-A SHS" : "Other insurers",
        "MAPFRE SA" : "Other insurers", "TRYG A/S" : "Other insurers",
        "ASR NEDERLAND NV" : "Other insurers", "HELVETIA HOL-REG" : "Other insurers",
        "INTACT FINANCIAL" : "Other insurers"
}

yahoo_ticker = {
        "ADMIRAL GROUP" : "ADM.L", "DIRECT LINE INSU" : "DLG.L", 
        "SABRE INSUR" : "SBRE.L", "SAGA PLC" : "SAGA.L",
        "AGEAS" : "AGS.BR", 
        
         "ALLIANZ SE-REG" : "ALV.DE", "AXA" : "CS.PA", 
        "ZURICH INSURANCE" : "ZURN.SW", "GENERALI ASSIC" : "G.MI",

        "BEAZLEY PLC" : "BEZ.L", "HISCOX LTD" : "HSX.L", 
        "LANCASHIRE HOLDI" : "LRE.L",

        "Society of Lloyd’s" : "LLOY.L",

        "AMERICAN INTERNA" : "AIG", "AXIS CAPITAL" : "AXS",
        "TRAVELERS COS IN" : "TRV", "ARCH CAPITAL GRP" : "ACGL",
        "RENAISSANCERE" : "RNR", "EVEREST RE GROUP" : "RE",
        "MARKEL CORP" : "MKL", "HARTFORD FINL SV" : "HIG",
        "ARGO GROUP INTER" : "ARGO", "BERKSHIRE HATH-B" : "BRK-B",
        "CHUBB LTD" : "CB",

        "SCOR SE" : "SCR.PA", "MUENCHENER RUE-R" : "MUV2.DE", 
        "SWISS RE AG" : "SREN.SW", "HANNOVER RUECK S" : "HNR1.DE",

        "TOKIO MARINE HD" : "8766.T", "SOMPO HOLDINGS I" : "8630.T",
        "MS&AD INSURANCE" : "8725.T", "QBE INSURANCE" : "QBE.AX",

        "ENSTAR GROUP LTD" : "ESGR", "FAIRFAX FINL HLD" : "FFH.TO", "RANDALL & QUILTE" : "RQIH.L",

        "PRUDENTIAL PLC" : "PRU.L", "M&G PLC" : "MNG.L",
        "LEGAL & GEN GRP" : "LGEN.L", "AVIVA PLC" : "AV.L",
        "PHOENIX GROUP HO" : "PHNX.L", "QUILTER PLC" : "QLT.L",
        "JUST GROUP" : "JUST.L", "ST JAMES'S PLACE" : "STJ.L",
        "AEGON NV" : "AGN.AS", "DISCOVERY LTD" : "DSY.JO",
        "SANLAM LTD" : "SLM.JO",

        "STOREBRAND ASA" : "STB.OL", "CNP ASSURANCES" : "CNP.PA",
        "GJENSIDIGE FORSI" : "GJF.OL", "POSTE ITALIANE" : "PST.MI",
        "NN GROUP" : "NN.AS", "TOPDANMARK A/S" : "TOP.CO",
        "BALOISE HOL-REG" : "BALN.SW", "SAMPO OYJ-A SHS" : "SAMPO.HE",
        "MAPFRE SA" : "MAP.MC", "TRYG A/S" : "TRYG.CO",
        "ASR NEDERLAND NV" : "0RHS.IL", "HELVETIA HOL-REG" : "HELN.SW",
        "INTACT FINANCIAL" : "IFC.TO"
}

company_name_match['Group'] =company_name_match['Real_Company_Name'].map(Groups)
company_name_match['yfiance_ticker'] =company_name_match['Real_Company_Name'].map(yahoo_ticker)
company_name_match

Unnamed: 0,Real_Company_Name,company_name,Group,yfiance_ticker
0,ADMIRAL GROUP,Admiral Group,Motor/Personal,ADM.L
1,DIRECT LINE INSU,DIRECT LINE,Motor/Personal,DLG.L
2,SABRE INSUR,SABRE INSUR-,Motor/Personal,SBRE.L
3,SAGA PLC,Saga PLC-,Motor/Personal,SAGA.L
4,AGEAS,Ageas SA-NV-,Motor/Personal,AGS.BR
...,...,...,...,...
62,MAPFRE SA,,Other insurers,MAP.MC
63,TRYG A/S,Tryg A-S-,Other insurers,TRYG.CO
64,ASR NEDERLAND NV,ASR Nederland,Other insurers,0RHS.IL
65,HELVETIA HOL-REG,Helvetia Holding,Other insurers,HELN.SW


In [25]:
df_company = company_name_match[['company_name','Group','yfiance_ticker']]
QA_model_df = df_company.merge(QA_model_df, how='inner', on='company_name')
QA_model_df = QA_model_df.rename(columns={'yfiance_ticker':'ticker'})
QA_model_df

Unnamed: 0,company_name,Group,ticker,file_name,date,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,WA_sentiment
0,Admiral Group,Motor/Personal,ADM.L,20110302_Admiral_Group_PLC-_Earnings_Call_2011...,2011-03-02,0.031949,0.071316,0.001887,0.044913,0.173669,0.000872,0.354589,0.062881,0.107171,0.067565,0.000753,0.027564,0.005498,0.049375,0.000826
1,Admiral Group,Motor/Personal,ADM.L,20110824_Admiral_Group_PLC-_Earnings_Call_2011...,2011-08-24,0.036994,0.094859,0.000492,0.092420,0.268424,0.033912,0.102734,0.103290,0.000900,0.066103,0.004690,0.005994,0.056138,0.133051,0.000971
2,Admiral Group,Motor/Personal,ADM.L,20111109_Admiral_Group_PLC-_Guidance_Call_2011...,2011-11-09,0.002439,0.032775,0.002953,0.001697,0.312097,0.030638,0.151864,0.124540,0.121835,0.129073,0.023103,0.030216,0.034974,0.001796,0.001851
3,Admiral Group,Motor/Personal,ADM.L,20120307_Admiral_Group_PLC-_Earnings_Call_2012...,2012-03-07,0.000834,0.068388,0.001439,0.004353,0.163258,0.000731,0.430827,0.004588,0.038603,0.110456,0.023270,0.031050,0.068767,0.053438,0.001353
4,Admiral Group,Motor/Personal,ADM.L,20130306_Admiral_Group_PLC-_Earnings_Call_2013...,2013-03-06,0.022024,0.012450,0.735922,0.015318,0.026735,0.019312,0.018163,0.021524,0.023038,0.027322,0.016677,0.022849,0.022453,0.016212,-0.046780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365,INTACT FINANCIAL-,Other insurers,IFC.TO,20210210_INTACT_FINANCIAL-_Earnings_Call_2021-...,2021-02-10,0.039382,0.003099,0.015362,0.017073,0.443133,0.067375,0.008211,0.119520,0.058851,0.155070,0.023832,0.008830,0.029229,0.011034,0.007517
1366,INTACT FINANCIAL-,Other insurers,IFC.TO,20210512_INTACT_FINANCIAL-_Earnings_Call_2021-...,2021-05-12,0.077022,0.003349,0.017053,0.026311,0.456877,0.003710,0.185016,0.050716,0.052432,0.069368,0.019552,0.012051,0.004313,0.022230,0.003768
1367,INTACT FINANCIAL-,Other insurers,IFC.TO,20210728_INTACT_FINANCIAL-_Earnings_Call_2021-...,2021-07-28,0.019933,0.004280,0.008319,0.008932,0.298128,0.043115,0.058892,0.287943,0.021054,0.097541,0.028871,0.043148,0.061263,0.018581,0.009691
1368,INTACT FINANCIAL-,Other insurers,IFC.TO,20211110_INTACT_FINANCIAL-_Earnings_Call_2021-...,2021-11-10,0.017290,0.003103,0.016695,0.005882,0.292182,0.003791,0.278049,0.157903,0.002960,0.135480,0.017845,0.008058,0.022787,0.037975,0.000850


In [26]:
start_date='2010-12-01'
end_date='2022-01-30'
stock_list = company_name_match['yfiance_ticker']
df_stock = stock_price_change(stock_list, start_date, end_date)

# limit the dat till 2021-12-31
df_stock = df_stock[df_stock['date']<='2021-12-31']

df_stock


1 Failed download:
- CNP.PA: No data found, symbol may be delisted


Unnamed: 0,date,Close,ticker,D-1,D+1,D-2,D+2,D-3,D+3,D-5,D+5,D-10,D+10,D-15,D+15
0,2010-12-01,1561.000000,ADM.L,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2010-12-02,1589.000000,ADM.L,0.017937,0.010699,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2010-12-03,1606.000000,ADM.L,0.010699,0.002491,0.028828,0.005604,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,2010-12-06,1610.000000,ADM.L,0.002491,0.003106,0.013216,-0.021739,0.031390,-0.029193,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2010-12-07,1615.000000,ADM.L,0.003106,-0.024768,0.005604,-0.032198,0.016362,-0.036533,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2776,2021-12-23,163.240005,IFC.TO,-0.001101,0.001470,0.001841,0.008025,0.012090,0.006493,0.005049,0.002940,0.012153,0.010475,0.004678,-0.010843
2777,2021-12-24,163.479996,IFC.TO,0.001470,0.006545,0.000367,0.005016,0.003314,0.005750,0.003684,0.000856,0.013138,0.004037,0.006774,-0.013335
2778,2021-12-29,164.550003,IFC.TO,0.006545,-0.001519,0.008025,-0.000790,0.006915,-0.005044,0.020212,0.011972,0.020022,-0.002492,0.013114,-0.005409
2779,2021-12-30,164.300003,IFC.TO,-0.001519,0.000730,0.005016,-0.003530,0.006493,-0.004139,0.008347,0.003956,0.033984,-0.022581,0.004586,0.000000


In [27]:
df_stock['date'] = df_stock['date'].astype(str)
# merged company tickers into big dataframe
QA_model_df = QA_model_df.join(df_stock.set_index(["date","ticker"]), 
                         on=["date","ticker"],
                         how='left'
                        )
QA_model_df = QA_model_df.dropna()
QA_model_df = QA_model_df.reset_index(drop=True)
QA_model_df

Unnamed: 0,company_name,Group,ticker,file_name,date,topic_1,topic_2,topic_3,topic_4,topic_5,...,D-2,D+2,D-3,D+3,D-5,D+5,D-10,D+10,D-15,D+15
0,Admiral Group,Motor/Personal,ADM.L,20110302_Admiral_Group_PLC-_Earnings_Call_2011...,2011-03-02,0.031949,0.071316,0.001887,0.044913,0.173669,...,-0.020130,0.015106,-0.003612,-0.000604,-0.011350,-0.003021,-0.029326,-0.060423,-0.030463,-0.045317
1,Admiral Group,Motor/Personal,ADM.L,20110824_Admiral_Group_PLC-_Earnings_Call_2011...,2011-08-24,0.036994,0.094859,0.000492,0.092420,0.268424,...,-0.110454,-0.025868,-0.094983,-0.005913,-0.125969,0.011086,-0.031496,0.008130,-0.117417,-0.028825
2,Admiral Group,Motor/Personal,ADM.L,20111109_Admiral_Group_PLC-_Guidance_Call_2011...,2011-11-09,0.002439,0.032775,0.002953,0.001697,0.312097,...,-0.247881,-0.053521,-0.251686,-0.061972,-0.234914,-0.098028,-0.276691,-0.025352,-0.275510,0.039437
3,Admiral Group,Motor/Personal,ADM.L,20120307_Admiral_Group_PLC-_Earnings_Call_2012...,2012-03-07,0.000834,0.068388,0.001439,0.004353,0.163258,...,0.095785,0.012238,0.080264,0.022727,0.062210,0.036713,0.103182,0.020979,0.179990,0.014860
4,Admiral Group,Motor/Personal,ADM.L,20130306_Admiral_Group_PLC-_Earnings_Call_2013...,2013-03-06,0.022024,0.012450,0.735922,0.015318,0.026735,...,0.045455,0.014243,0.069767,0.018741,0.066347,0.006747,0.033308,-0.001499,0.044636,-0.008996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1318,INTACT FINANCIAL-,Other insurers,IFC.TO,20201104_INTACT_FINANCIAL-_Earnings_Call_2020-...,2020-11-04,0.058029,0.002024,0.010476,0.079064,0.277344,...,0.069258,-0.019425,0.073609,-0.060778,0.063179,-0.080203,0.031990,0.000203,0.026255,-0.019628
1319,INTACT FINANCIAL-,Other insurers,IFC.TO,20210210_INTACT_FINANCIAL-_Earnings_Call_2021-...,2021-02-10,0.039382,0.003099,0.015362,0.017073,0.443133,...,0.041432,-0.016994,0.039294,-0.032275,0.050876,-0.041101,0.061678,-0.054802,0.054013,-0.033263
1320,INTACT FINANCIAL-,Other insurers,IFC.TO,20210512_INTACT_FINANCIAL-_Earnings_Call_2021-...,2021-05-12,0.077022,0.003349,0.017053,0.026311,0.456877,...,-0.027909,-0.011521,-0.015945,-0.015218,-0.009157,-0.027663,-0.003744,0.002403,-0.002704,0.041341
1321,INTACT FINANCIAL-,Other insurers,IFC.TO,20210728_INTACT_FINANCIAL-_Earnings_Call_2021-...,2021-07-28,0.019933,0.004280,0.008319,0.008932,0.298128,...,0.008669,0.000706,0.007592,-0.002826,0.000294,0.000530,0.001238,0.026725,-0.010427,0.036320


In [28]:
QA_model_df.to_csv("./regression_df_input/QA_model_df.csv", index = False)