In [1]:
import os
import spacy
import pickle
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from scipy.optimize import linear_sum_assignment
from tqdm import tqdm
from rich import print
from itertools import product

spacy.require_gpu()
NER = spacy.load("en_core_web_trf")

## Format the DF

### Simple Rule Functions

In [2]:
# 0. remove the question if it is too short (< 15 words)
def remove_too_short_question(txt, len_limit=15):
    return '' if len(txt.split()) < len_limit else txt
    
# 1. remove sentences if it is too short (< 5 words)
# def remove_too_short_first_sentence(txt, len_limit=10):
#     first_setence = txt.split('.')[0]
#     if len(first_setence.split()) < len_limit:
#         return '.'.join(txt.split('.')[1:])
#     else:
#         return txt
def remove_too_short_sentence_in_question(txt, len_limit=5):
    setences = txt.split('.')
    ret_setences = [cur_setence for cur_setence in setences if len(cur_setence.split()) >= len_limit]

    return '.'.join(ret_setences)

# 2. remove the setence if it contains thank you, good after noon, good morning, thank you very much, thanks
def remove_setence_contrains_special_words(txt, words_to_remove=['thank you', 'good afternoon', 'good morning', 'thank you very much', 'thanks', 'hey']):
    setences = txt.split('.')
    ret_setences = [cur_setence for cur_setence in setences if all(word not in cur_setence.lower() for word in words_to_remove)]

    return '.'.join(ret_setences)

# def identify_setence_contrains_special_words(txt, words_to_remove=['thank you', 'good afternoon', 'good morning', 'thank you very much', 'thanks']):
#     setences = txt.split('.')
#     for cur_setence in setences:
#         return any((word in cur_setence.lower() for word in words_to_remove))

# 3. remove the question if it is too long ( > 200 words)
def remove_too_long_question(txt, len_limit=200):
    return '' if len(txt.split()) >= len_limit else txt

# 4. replace all the name with 'Tim'
def replace_name_with(txt, replace_with_name='John'):
    parsed_txt = NER(txt)
    for word in parsed_txt.ents:
        if word.label_ == 'PERSON':
            txt = txt.replace(word.text, replace_with_name)
    return txt

### Setences new parsed

In [3]:
setence_new_parsed = pd.read_csv(os.path.join(os.getcwd(), 'data', 'processed', 'Conf_transcripts_new_parsed_processed.csv'))
setence_new_parsed.head()

  setence_new_parsed = pd.read_csv(os.path.join(os.getcwd(), 'data', 'processed', 'Conf_transcripts_new_parsed_processed.csv'))


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
0,1,"george paleologou, will kalutycz","Yes, generally speaking, a lot of the rebalanc...",sabahat khan,Thanks and good afternoon. Can you maybe talk ...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
1,2,will kalutycz,"Yes, it’s kind of a mixture, the Sandwich plan...",sabahat khan,"Alright, thanks. And then you mentioned that H...",Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
2,3,will kalutycz,"Yes, again, I mean we deal in all commodities ...",sabahat khan,Great. And then just if you could give us upda...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
3,5,george paleologou,Thanks George.,george doumet,"Hey, good afternoon guys and congrats on the q...",Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
4,6,george paleologou,"Yes, actually, it was more -- it was not too m...",george doumet,Just looking at Alberta first year-over-year g...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017


### AAPL: (2018-Q1, 2018-Q2)

All Years

In [4]:
# years
np.sort(setence_new_parsed[setence_new_parsed['equity'] == 'AAPL'].year.unique())

array([2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018], dtype=object)

Get only one year data

In [5]:
# subset
filter_equity = setence_new_parsed['equity'] == 'AAPL'
aapl_raw = setence_new_parsed[filter_equity]
filter_year = aapl_raw['year'].apply(lambda x: x in ['2018', 2018])
aapl_raw = aapl_raw[filter_year].copy()
print(aapl_raw.shape)

Apply Rules to filter the data

In [6]:
aapl_raw['question'] = aapl_raw['question'].apply(remove_too_short_question)
aapl_raw = aapl_raw[aapl_raw['question'] != ''].copy()
aapl_raw = aapl_raw[aapl_raw['question'].apply(lambda x: x is not None)].copy()
print('Too short questions', aapl_raw.shape)
aapl_raw['question'] = aapl_raw['question'].apply(remove_too_long_question)
aapl_raw = aapl_raw[aapl_raw['question'] != ''].copy()
aapl_raw = aapl_raw[aapl_raw['question'].apply(lambda x: x is not None)].copy()
print('Too long questions', aapl_raw.shape)
aapl_raw['question'] = aapl_raw['question'].apply(remove_too_short_sentence_in_question).copy()
aapl_raw = aapl_raw[aapl_raw['question'] != ''].copy()
aapl_raw = aapl_raw[aapl_raw['question'].apply(lambda x: x is not None)].copy()
print('remove too short fist setences', aapl_raw.shape)
aapl_raw['question'] = aapl_raw['question'].apply(remove_setence_contrains_special_words).copy()
aapl_raw = aapl_raw[aapl_raw['question'] != ''].copy()
aapl_raw = aapl_raw[aapl_raw['question'].apply(lambda x: x is not None)].copy()
print('remove_setence_contrains_special_words', aapl_raw.shape)
aapl_raw['question'] = aapl_raw['question'].apply(replace_name_with).copy()
print('replace name', aapl_raw.shape)
aapl_raw['question'] = aapl_raw['question'].apply(remove_too_short_question)
aapl_raw = aapl_raw[aapl_raw['question'] != ''].copy()
aapl_raw = aapl_raw[aapl_raw['question'].apply(lambda x: x is not None)].copy()
print('Too short questions', aapl_raw.shape)
aapl_raw

Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
1857862,1,"timothy donald cook - apple, inc.","Sure. Shannon, it's Tim. As Luca mentioned ear...",shannon s. cross - cross research llc,I wanted to ask about your thoughts on sort o...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857863,3,"timothy donald cook - apple, inc.","Hi, Katy. It's Tim. The services grew 31%. We ...",kathryn lynn huberty - morgan stanley & co. llc,The services growth acceleration is really th...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857864,4,"timothy donald cook - apple, inc.","Yeah, I think my own view is that China and th...",kathryn lynn huberty - morgan stanley & co. llc,And it doesn't look like the threat of a trade...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857865,5,"timothy donald cook - apple, inc.","Well, Mike, it's Tim. Again, the great thing a...",michael j. olson - piper jaffray & co.,"Just following on the services question, I'd ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857866,6,"timothy donald cook - apple, inc.",It's a good question. iPhone X was the most po...,michael j. olson - piper jaffray & co.,"And then any potential tariff issues aside, w...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857869,9,"timothy donald cook - apple, inc.","Yeah, good question. Let me start with India, ...","brian j. white - monness, crespi, hardt & co.,...","Yes, John, I think there is China numbers are ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857870,11,"timothy donald cook - apple, inc.","We price for the value that we're delivering, ...",next we'll hear from wamsi mohan with bank of ...,"Tim, can you comment on the price elasticity ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857871,13,"timothy donald cook - apple, inc.","Yeah, Jim, thanks for the question. In terms o...","jim suva - citigroup global markets, inc.",And I'll ask actually both my questions at th...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1858981,1,luca maestri,"Of course, we've been talking about the import...",shannon cross,"John, I wanted to talk a little bit about, mor...","Apple, Inc. (NASDAQ:AAPL) Q1 2018 Earnings Con...",AAPL,Q1,2018
1858982,2,timcook,The revenue growth from iPhone across all the ...,shannon cross,"And then, John, maybe could you talk a little ...","Apple, Inc. (NASDAQ:AAPL) Q1 2018 Earnings Con...",AAPL,Q1,2018


Process index

In [7]:
# process index
for cur_query in tqdm(aapl_raw['time_text'].unique()):
    subset = aapl_raw[aapl_raw['time_text'] == cur_query]
    aapl_raw.loc[subset.index.tolist(), 'question_index'] = list(range(len(subset)))

print(aapl_raw['year'].unique())
print(aapl_raw['equity'].unique())
print(aapl_raw['quater'].unique())
aapl_raw

100%|██████████| 2/2 [00:00<00:00, 1007.40it/s]


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
1857862,0,"timothy donald cook - apple, inc.","Sure. Shannon, it's Tim. As Luca mentioned ear...",shannon s. cross - cross research llc,I wanted to ask about your thoughts on sort o...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857863,1,"timothy donald cook - apple, inc.","Hi, Katy. It's Tim. The services grew 31%. We ...",kathryn lynn huberty - morgan stanley & co. llc,The services growth acceleration is really th...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857864,2,"timothy donald cook - apple, inc.","Yeah, I think my own view is that China and th...",kathryn lynn huberty - morgan stanley & co. llc,And it doesn't look like the threat of a trade...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857865,3,"timothy donald cook - apple, inc.","Well, Mike, it's Tim. Again, the great thing a...",michael j. olson - piper jaffray & co.,"Just following on the services question, I'd ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857866,4,"timothy donald cook - apple, inc.",It's a good question. iPhone X was the most po...,michael j. olson - piper jaffray & co.,"And then any potential tariff issues aside, w...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857869,5,"timothy donald cook - apple, inc.","Yeah, good question. Let me start with India, ...","brian j. white - monness, crespi, hardt & co.,...","Yes, John, I think there is China numbers are ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857870,6,"timothy donald cook - apple, inc.","We price for the value that we're delivering, ...",next we'll hear from wamsi mohan with bank of ...,"Tim, can you comment on the price elasticity ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857871,7,"timothy donald cook - apple, inc.","Yeah, Jim, thanks for the question. In terms o...","jim suva - citigroup global markets, inc.",And I'll ask actually both my questions at th...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1858981,0,luca maestri,"Of course, we've been talking about the import...",shannon cross,"John, I wanted to talk a little bit about, mor...","Apple, Inc. (NASDAQ:AAPL) Q1 2018 Earnings Con...",AAPL,Q1,2018
1858982,1,timcook,The revenue growth from iPhone across all the ...,shannon cross,"And then, John, maybe could you talk a little ...","Apple, Inc. (NASDAQ:AAPL) Q1 2018 Earnings Con...",AAPL,Q1,2018


Format

In [8]:
# format
aapl = pd.DataFrame(
    {
        "fyear": aapl_raw.apply(
            lambda x: str(x["year"]) + "-" + str(x["quater"]), axis=1
        ),
        "pid": aapl_raw["question_index"],
        "text": aapl_raw["question"],
    }
)
aapl.reset_index(drop=True, inplace=True)
aapl

Unnamed: 0,fyear,pid,text
0,2018-Q2,0,I wanted to ask about your thoughts on sort o...
1,2018-Q2,1,The services growth acceleration is really th...
2,2018-Q2,2,And it doesn't look like the threat of a trade...
3,2018-Q2,3,"Just following on the services question, I'd ..."
4,2018-Q2,4,"And then any potential tariff issues aside, w..."
5,2018-Q2,5,"Yes, John, I think there is China numbers are ..."
6,2018-Q2,6,"Tim, can you comment on the price elasticity ..."
7,2018-Q2,7,And I'll ask actually both my questions at th...
8,2018-Q1,0,"John, I wanted to talk a little bit about, mor..."
9,2018-Q1,1,"And then, John, maybe could you talk a little ..."


### SKX: (2018-Q1, 2018-Q2)

All Years

In [9]:
# years
np.sort(setence_new_parsed[setence_new_parsed['equity'] == 'SKX'].year.unique())

array(['2007', '2008', '2009', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018'], dtype=object)

Get only one year data

In [10]:
# subset
filter_equity = setence_new_parsed['equity'] == 'SKX'
skx_raw = setence_new_parsed[filter_equity]
filter_year = skx_raw['year'].apply(lambda x: x in ['2017', 2017])
skx_raw = skx_raw[filter_year].copy()
filter_quater = skx_raw['quater'].apply(lambda x: x in ['Q3', 'Q4'])
skx_raw = skx_raw[filter_quater].copy()
print(skx_raw.shape)
skx_raw

Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
2988,1,"david weinberg - skechers usa, inc.",Hi.,corinna van der ghinst - citigroup global mark...,Hi. Hi David and welcome to John.,"SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2989,2,"david weinberg - skechers usa, inc.","Yeah, I think it's fair to say that we think w...",corinna van der ghinst - citigroup global mark...,It looks like you guys had a very productive f...,"SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2990,3,"david weinberg - skechers usa, inc.","You know, that's very difficult because we're ...",corinna van der ghinst - citigroup global mark...,Where along that kind of 12% to 13% spectrum t...,"SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2991,4,"david weinberg - skechers usa, inc., john m. v...","So, I'll start with the repurchase. Obviously ...",corinna van der ghinst - citigroup global mark...,"Okay, that's fair. And then, I just had a quic...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2992,5,"john m. vandemore - skechers usa, inc.",We're not giving international percentages. I ...,corinna van der ghinst - citigroup global mark...,"Okay. Great. I'm sorry, I just, if I could sne...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
...,...,...,...,...,...,...,...,...,...
3407,52,david weinberg,Thanks.,tom nikic,"All right, all right, thanks, David, good luck...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3408,53,david weinberg,"No, no.",sam poser,"Two questions, real quick, the domestic backlo...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3409,54,david weinberg,"No, because I want to get down to the detail. ...",sam poser,"Well, can you give us a range?",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3410,55,david weinberg,It's doing very well. It's probably heading th...,sam poser,"Okay. Thank you. Then secondly, there are some...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017


Appply rules to filter the data

In [11]:
skx_raw['question'] = skx_raw['question'].apply(remove_too_short_question)
skx_raw = skx_raw[skx_raw['question'] != ''].copy()
skx_raw = skx_raw[skx_raw['question'].apply(lambda x: x is not None)].copy()
print('Too short questions', skx_raw.shape)
skx_raw['question'] = skx_raw['question'].apply(remove_too_long_question)
skx_raw = skx_raw[skx_raw['question'] != ''].copy()
skx_raw = skx_raw[skx_raw['question'].apply(lambda x: x is not None)].copy()
print('Too long questions', skx_raw.shape)
skx_raw['question'] = skx_raw['question'].apply(remove_too_short_sentence_in_question).copy()
skx_raw = skx_raw[skx_raw['question'] != ''].copy()
skx_raw = skx_raw[skx_raw['question'].apply(lambda x: x is not None)].copy()
print('remove too short fist setences', skx_raw.shape)
skx_raw['question'] = skx_raw['question'].apply(remove_setence_contrains_special_words).copy()
skx_raw = skx_raw[skx_raw['question'] != ''].copy()
skx_raw = skx_raw[skx_raw['question'].apply(lambda x: x is not None)].copy()
print('remove_setence_contrains_special_words', skx_raw.shape)
skx_raw['question'] = skx_raw['question'].apply(replace_name_with).copy()
print('replace name', skx_raw.shape)
skx_raw['question'] = skx_raw['question'].apply(remove_too_short_question)
skx_raw = skx_raw[skx_raw['question'] != ''].copy()
skx_raw = skx_raw[skx_raw['question'].apply(lambda x: x is not None)].copy()
print('Too short questions', skx_raw.shape)
skx_raw

Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
2989,2,"david weinberg - skechers usa, inc.","Yeah, I think it's fair to say that we think w...",corinna van der ghinst - citigroup global mark...,It looks like you guys had a very productive f...,"SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2990,3,"david weinberg - skechers usa, inc.","You know, that's very difficult because we're ...",corinna van der ghinst - citigroup global mark...,Where along that kind of 12% to 13% spectrum t...,"SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2991,4,"david weinberg - skechers usa, inc., john m. v...","So, I'll start with the repurchase. Obviously ...",corinna van der ghinst - citigroup global mark...,"And then, I just had a quick follow-up on you...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2992,5,"john m. vandemore - skechers usa, inc.",We're not giving international percentages. I ...,corinna van der ghinst - citigroup global mark...,"I'm sorry, I just, if I could sneak in one fo...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2995,9,"david weinberg - skechers usa, inc.",Yeah. I think the biggest piece of the portfol...,scott d. krasik - the buckingham research grou...,"Just a couple of questions, so the sales shift...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
...,...,...,...,...,...,...,...,...,...
3404,49,david weinberg,I think it's most to do with the softness in t...,tom nikic,"All right, I have a question about the U. You ...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3405,50,david weinberg,Yes.,tom nikic,"Right, right, I just meant would the growth ra...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3406,51,david weinberg,"No, I mean, that will come out in the Q. So I ...",tom nikic,"Okay, and sorry if I missed this. Did you by ...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3408,53,david weinberg,"No, no.",sam poser,"Two questions, real quick, the domestic backlo...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017


Process Index

In [12]:
# process index
for cur_query in tqdm(skx_raw['year'].unique()):
    subset1 = skx_raw[skx_raw['year'] == cur_query]
    for cur_query2 in subset1['quater'].unique():
        subset2 = subset1[subset1['quater'] == cur_query2]
        skx_raw.loc[subset2.index.tolist(), 'question_index'] = list(range(len(subset2)))

print(skx_raw['year'].unique())
print(skx_raw['equity'].unique())
print(skx_raw['quater'].unique())
print(len(skx_raw))
skx_raw.head(5)
skx_raw

100%|██████████| 1/1 [00:00<00:00, 328.94it/s]


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
2989,0,"david weinberg - skechers usa, inc.","Yeah, I think it's fair to say that we think w...",corinna van der ghinst - citigroup global mark...,It looks like you guys had a very productive f...,"SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2990,1,"david weinberg - skechers usa, inc.","You know, that's very difficult because we're ...",corinna van der ghinst - citigroup global mark...,Where along that kind of 12% to 13% spectrum t...,"SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2991,2,"david weinberg - skechers usa, inc., john m. v...","So, I'll start with the repurchase. Obviously ...",corinna van der ghinst - citigroup global mark...,"And then, I just had a quick follow-up on you...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2992,3,"john m. vandemore - skechers usa, inc.",We're not giving international percentages. I ...,corinna van der ghinst - citigroup global mark...,"I'm sorry, I just, if I could sneak in one fo...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
2995,4,"david weinberg - skechers usa, inc.",Yeah. I think the biggest piece of the portfol...,scott d. krasik - the buckingham research grou...,"Just a couple of questions, so the sales shift...","SKECHERS USA, Inc. (NYSE:SKX) Q4 2017 Earnings...",SKX,Q4,2017
...,...,...,...,...,...,...,...,...,...
3404,28,david weinberg,I think it's most to do with the softness in t...,tom nikic,"All right, I have a question about the U. You ...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3405,29,david weinberg,Yes.,tom nikic,"Right, right, I just meant would the growth ra...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3406,30,david weinberg,"No, I mean, that will come out in the Q. So I ...",tom nikic,"Okay, and sorry if I missed this. Did you by ...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017
3408,31,david weinberg,"No, no.",sam poser,"Two questions, real quick, the domestic backlo...",Skechers USA (NYSE:SKX) Q3 2017 Earnings Confe...,SKX,Q3,2017


Format

In [13]:
# format
skx = pd.DataFrame(
    {
        "fyear": skx_raw.apply(
            lambda x: str(x["year"]) + "-" + str(x["quater"]), axis=1
        ),
        "pid": skx_raw["question_index"],
        "text": skx_raw["question"],
    }
)
skx.reset_index(drop=True, inplace=True)
skx

Unnamed: 0,fyear,pid,text
0,2017-Q4,0,It looks like you guys had a very productive f...
1,2017-Q4,1,Where along that kind of 12% to 13% spectrum t...
2,2017-Q4,2,"And then, I just had a quick follow-up on you..."
3,2017-Q4,3,"I'm sorry, I just, if I could sneak in one fo..."
4,2017-Q4,4,"Just a couple of questions, so the sales shift..."
...,...,...,...
64,2017-Q3,28,"All right, I have a question about the U. You ..."
65,2017-Q3,29,"Right, right, I just meant would the growth ra..."
66,2017-Q3,30,"Okay, and sorry if I missed this. Did you by ..."
67,2017-Q3,31,"Two questions, real quick, the domestic backlo..."


## Calculate the pairwise similarity

### Setence Embedding Function

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')

### Similarity Calculation Function

In [15]:
def compute_two_doc_sim(doc1_emb, doc2_emb):
    sim = np.dot(doc1_emb, doc2_emb.T)
    sim = (sim + 1) / 2
    return sim

def format_index_score(sim_score):
    iter_index = product(range(sim_score.shape[0]), range(sim_score.shape[1]))
    return [(i, j, sim_score[i, j]) for i, j in iter_index]

def compute_sim(df, bert_model):
    result = {}
    years = np.sort(df.fyear.unique())
    embeddings = bert_model.encode(
        df.text.tolist(),
        convert_to_tensor=False,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    for i in range(len(years) - 1):
        temp_dict = {}
        for j in range(i + 1, len(years)):
            doc1_index = df[df.fyear == years[i]].index.tolist()
            doc2_index = df[df.fyear == years[j]].index.tolist()
            doc1 = embeddings[doc1_index]
            doc2 = embeddings[doc2_index]
            sim = compute_two_doc_sim(doc1, doc2)
            temp_dict[years[j]] = format_index_score(sim)
        result[years[i]] = temp_dict
    return result

# post process funcs
def pair_dict(listOflist):
    temp_dict = {}
    for i in listOflist:
        pair_name = f'{str(i[0])}-{str(i[1])}'
        temp_dict[pair_name] = i[2:]
    return temp_dict

def post_process_df(m):
    # add another layer
    for key in m:
        cur_dict = m.get(key)
        for key2 in cur_dict.keys():
            cur_dict[key2] = pair_dict(cur_dict.get(key2))
    # prepare multi-index
    tuple_list = []
    for key in m:
        cur_dict = m.get(key)
        for key2 in cur_dict.keys():
            tuple_list.extend((key, key2, key3) for key3 in cur_dict.get(key2).keys())
    index = pd.MultiIndex.from_tuples(tuple_list, names=["Year1", "Year2", "MaxSim-Question-Pair"])
    # get similarity score
    similarity_score = []
    for key in m:
        cur_dict = m.get(key)
        for key2 in cur_dict.keys():
            similarity_score.extend(cur_dict.get(key2).get(key3)[0] for key3 in cur_dict.get(key2).keys())

    return pd.DataFrame({'Similarity Score': similarity_score}, index=index)

# search text
def search_text(the_df, search_df):
    try:
        search_df = search_df.copy()
        the_df = the_df.copy()
        question_from_1 = []
        question_from_2 = []
        for index, row in tqdm(the_df.iterrows()):
            # unpack
            year1, year2, question_pair = index
            year1, year1_quater = year1.split("-")
            year2, year2_quater = year2.split("-")
            question_1_index, question_2_index = question_pair.split("-")
            # locate question 1
            year_filter = search_df['year'].apply(lambda x: x in [year1, int(year1)])
            question_1 = search_df[
                year_filter
                & (search_df["quater"] == year1_quater)
                & (search_df["question_index"] == int(question_1_index))
            ]["question"]
            question_from_1.append(question_1.iloc[0])
            # locate question 2
            year_filter = search_df['year'].apply(lambda x: x in [year2, int(year2)])
            question_2 = search_df[
                year_filter
                & (search_df["quater"] == year2_quater)
                & (search_df["question_index"] == int(question_2_index))
            ]["question"]
            question_from_2.append(question_2.iloc[0])
        the_df["Question_Year1"] = question_from_1
        the_df["Question_Year2"] = question_from_2
    except IndexError:
        print(year1, year2, year1_quater, year2_quater, question_1_index, question_2_index)
    
    return the_df

### Compuate Similarity AAPL

In [16]:
aapl_sim = compute_sim(aapl, model)
aapl_sim = post_process_df(aapl_sim)
aapl_sim = search_text(aapl_sim, aapl_raw)

120it [00:00, 697.52it/s]


In [17]:
aapl_sim.loc['2018-Q1'][:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Similarity Score,Question_Year1,Question_Year2
Year2,MaxSim-Question-Pair,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-Q2,0-0,0.600629,"John, I wanted to talk a little bit about, mor...",I wanted to ask about your thoughts on sort o...
2018-Q2,0-1,0.611807,"John, I wanted to talk a little bit about, mor...",The services growth acceleration is really th...
2018-Q2,0-2,0.55965,"John, I wanted to talk a little bit about, mor...",And it doesn't look like the threat of a trade...
2018-Q2,0-3,0.564583,"John, I wanted to talk a little bit about, mor...","Just following on the services question, I'd ..."
2018-Q2,0-4,0.600771,"John, I wanted to talk a little bit about, mor...","And then any potential tariff issues aside, w..."
2018-Q2,0-5,0.571988,"John, I wanted to talk a little bit about, mor...","Yes, John, I think there is China numbers are ..."
2018-Q2,0-6,0.566988,"John, I wanted to talk a little bit about, mor...","Tim, can you comment on the price elasticity ..."
2018-Q2,0-7,0.666451,"John, I wanted to talk a little bit about, mor...",And I'll ask actually both my questions at th...
2018-Q2,1-0,0.785688,"And then, John, maybe could you talk a little ...",I wanted to ask about your thoughts on sort o...
2018-Q2,1-1,0.609269,"And then, John, maybe could you talk a little ...",The services growth acceleration is really th...


In [18]:
with open(os.path.join(os.getcwd(), 'data', 'result', 'pairwise_similarity', 'aapl_sim.pkl'), 'wb') as f:
    pickle.dump(aapl_sim, f)
aapl_sim.to_csv(os.path.join(os.getcwd(), 'data', 'result', 'pairwise_similarity', 'aapl_sim.csv'))

### Compute Similarity SKX

In [19]:
skx_sim = compute_sim(skx, model)
skx_sim = post_process_df(skx_sim)
skx_sim = search_text(skx_sim, skx_raw)

1188it [00:01, 831.47it/s]


In [20]:
skx_sim

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-Q3,2017-Q4,0-0,0.815662,I was hoping to start with just the operating ...,It looks like you guys had a very productive f...
2017-Q3,2017-Q4,0-1,0.622075,I was hoping to start with just the operating ...,Where along that kind of 12% to 13% spectrum t...
2017-Q3,2017-Q4,0-2,0.678226,I was hoping to start with just the operating ...,"And then, I just had a quick follow-up on you..."
2017-Q3,2017-Q4,0-3,0.693569,I was hoping to start with just the operating ...,"I'm sorry, I just, if I could sneak in one fo..."
2017-Q3,2017-Q4,0-4,0.663784,I was hoping to start with just the operating ...,"Just a couple of questions, so the sales shift..."
2017-Q3,2017-Q4,...,...,...,...
2017-Q3,2017-Q4,32-31,0.672904,"Then secondly, there are some new styles that...",I just wanted to touch on some of the newer i...
2017-Q3,2017-Q4,32-32,0.577031,"Then secondly, there are some new styles that...","And then, last quarter you talked a little bi..."
2017-Q3,2017-Q4,32-33,0.511471,"Then secondly, there are some new styles that...",And then what sort of comps do you have embed...
2017-Q3,2017-Q4,32-34,0.572161,"Then secondly, there are some new styles that...","Well, given all these changes with buybacks an..."


In [21]:
with open(os.path.join(os.getcwd(), 'data', 'result', 'pairwise_similarity', 'skx_sim.pkl'), 'wb') as f:
    pickle.dump(skx_sim, f)
skx_sim.to_csv(os.path.join(os.getcwd(), 'data', 'result', 'pairwise_similarity', 'skx_sim.csv'))