In [50]:
import os
import pickle
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from scipy.optimize import linear_sum_assignment
from tqdm import tqdm
from rich import print

## Match doc & Similarirty functions

In [2]:
def match_two_docs(doc1, doc2):
    # pairwise distance between sentences
    sim = np.dot(doc1, doc2.T)
    sim = (sim + 1) / 2 #! why?
    # print(sim.shape, min(sim.shape))
    # assignment is to minimize the distance of matched sentences
    # WMD may be used here (to do)
    row_ind, col_ind = linear_sum_assignment(1 - sim)
    # print(row_ind)
    # print(col_ind)
    # total similarity of matched sentences
    score = sim[row_ind, col_ind].sum()
    # print(doc_sim)
    # normalize by min. size of two docs
    score1 = score / min(sim.shape)
    # normalize by max. size of two docs
    score2 = score / max(sim.shape)
    score = [score1, score2]
    return score, row_ind, col_ind, sim

def format_index_score(index_a, index_b, score, normalized_score, sim_shape):
    normalized_by_min, normalized_by_max = normalized_score
    return [[i, j, k, normalized_by_min, normalized_by_max, sim_shape] for i, j, k in zip(index_a, index_b, score)]

def compute_sim(df, bert_model, window=1):
    # ! cik: ?
    # * window: year interval
    result = {}
    years = np.sort(df.fyear.unique())
    embeddings = bert_model.encode(
        df.text.tolist(),
        convert_to_tensor=False,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    for i in range(len(years) - 1):
        temp_dict = {}
        for j in range(i + 1, len(years)):
            if int(years[j].split('-Q')[1]) - int(years[i].split('-Q')[1]) <= window:
                doc1_index = df[df.fyear == years[i]].index.tolist()
                doc2_index = df[df.fyear == years[j]].index.tolist()
                doc1 = embeddings[doc1_index]
                doc2 = embeddings[doc2_index]
                score, row_ind, col_ind, sim = match_two_docs(doc1, doc2)
                # get matched pair similarity
                matched_sim = sim[row_ind, col_ind]
                temp_dict[years[j]] = format_index_score(row_ind, col_ind, matched_sim, score, sim.shape)
        result[years[i]] = temp_dict
    return result

# structure example
# m = {
#     "2017-Q1": {
#         "2017-Q2": [[0, 0, 0.5], [1, 9, 0.6]],
#         "2017-Q3": [[2, 8, 0.9], [10, 11, 0.12]],
#     },
#     "2017-Q2": {
#         "2017-Q1": [[13, 14, 0.15], [16, 17, 0.18]],
#         "2017-Q3": [[19, 20, 0.21], [22, 23, 0.24]],
#     },
# }

# post process funcs
def pair_dict(listOflist):
    temp_dict = {}
    for i in listOflist:
        pair_name = f'{str(i[0])}-{str(i[1])}'
        temp_dict[pair_name] = i[2:]
    return temp_dict

def post_process_df(m):
    # add another layer
    for key in m:
        cur_dict = m.get(key)
        for key2 in cur_dict.keys():
            cur_dict[key2] = pair_dict(cur_dict.get(key2))
    # prepare multi-index
    tuple_list = []
    for key in m:
        cur_dict = m.get(key)
        for key2 in cur_dict.keys():
            tuple_list.extend((key, key2, key3) for key3 in cur_dict.get(key2).keys())
    index = pd.MultiIndex.from_tuples(tuple_list, names=["Year1", "Year2", "MaxSim-Question-Pair"])
    # get similarity score
    similarity_score = []
    normalized_by_min = []
    normalized_by_max = []
    doc1_doc2_shape = []
    for key in m:
        cur_dict = m.get(key)
        for key2 in cur_dict.keys():
            for key3 in cur_dict.get(key2).keys():
                similarity_score.append(cur_dict.get(key2).get(key3)[0])
                normalized_by_min.append(cur_dict.get(key2).get(key3)[1])
                normalized_by_max.append(cur_dict.get(key2).get(key3)[2])
                doc1_doc2_shape.append(cur_dict.get(key2).get(key3)[3])
    return pd.DataFrame({'Similarity Score': similarity_score, 'Normalized_by_min': normalized_by_min, 'Normalized_by_max': normalized_by_max, 'Doc1_Doc2_shape': doc1_doc2_shape}, index=index)

# search text
def search_text(the_df, search_df):
    search_df = aapl_raw.copy()
    the_df = aapl_sim.copy()
    question_from_1 = []
    question_from_2 = []
    for index, row in tqdm(the_df.iterrows()):
        # unpack
        year1, year2, question_pair = index
        year1, year1_quater = year1.split("-")
        year2, year2_quater = year2.split("-")
        question_1_index, question_2_index = question_pair.split("-")
        # locate question 1
        question_1 = search_df[
            (search_df["year"] == int(year1))
            & (search_df["quater"] == year1_quater)
            & (search_df["question_index"] == int(question_1_index))
        ]["question"]
        question_from_1.append(question_1.iloc[0])
        # locate question 2
        question_2 = search_df[
            (search_df["year"] == int(year2))
            & (search_df["quater"] == year2_quater)
            & (search_df["question_index"] == int(question_2_index))
        ]["question"]
        question_from_2.append(question_2.iloc[0])
    the_df["Question_Year1"] = question_from_1
    the_df["Question_Year2"] = question_from_2
    
    return the_df


## Format DF

In [3]:
setence_new_parsed = pd.read_csv(os.path.join(os.getcwd(), 'data', 'processed', 'Conf_transcripts_new_parsed_processed.csv'))
setence_new_parsed.head()

  setence_new_parsed = pd.read_csv(os.path.join(os.getcwd(), 'data', 'processed', 'Conf_transcripts_new_parsed_processed.csv'))


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
0,1,"george paleologou, will kalutycz","Yes, generally speaking, a lot of the rebalanc...",sabahat khan,Thanks and good afternoon. Can you maybe talk ...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
1,2,will kalutycz,"Yes, it’s kind of a mixture, the Sandwich plan...",sabahat khan,"Alright, thanks. And then you mentioned that H...",Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
2,3,will kalutycz,"Yes, again, I mean we deal in all commodities ...",sabahat khan,Great. And then just if you could give us upda...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
3,5,george paleologou,Thanks George.,george doumet,"Hey, good afternoon guys and congrats on the q...",Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
4,6,george paleologou,"Yes, actually, it was more -- it was not too m...",george doumet,Just looking at Alberta first year-over-year g...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017


### AAPL

In [4]:
# years
np.sort(setence_new_parsed[setence_new_parsed['equity'] == 'AAPL'].year.unique())

array([2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018], dtype=object)

In [5]:
# subset
filter_equity = setence_new_parsed['equity'] == 'AAPL'
aapl_raw = setence_new_parsed[filter_equity]
filter_year = aapl_raw['year'].apply(lambda x: x in ['2018', 2018])
filter_year_2 = aapl_raw['year'].apply(lambda x: x in ['2017', 2017])
final_filter = np.logical_or(filter_year, filter_year_2)
aapl_raw = aapl_raw[final_filter].copy()

# filter two short sentences
aapl_raw = aapl_raw[aapl_raw['question'].apply(lambda x: len(x.split()) > 5)]

# process index
for cur_query in tqdm(aapl_raw['time_text'].unique()):
    subset = aapl_raw[aapl_raw['time_text'] == cur_query]
    aapl_raw.loc[subset.index.tolist(), 'question_index'] = list(range(len(subset)))

print(aapl_raw['year'].unique())
print(aapl_raw['equity'].unique())
print(aapl_raw['quater'].unique())
aapl_raw.head(5)

100%|██████████| 5/5 [00:00<00:00, 1638.53it/s]


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
1857862,0,"timothy donald cook - apple, inc.","Sure. Shannon, it's Tim. As Luca mentioned ear...",shannon s. cross - cross research llc,Thank you very much. I wanted to ask about you...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857863,1,"timothy donald cook - apple, inc.","Hi, Katy. It's Tim. The services grew 31%. We ...",kathryn lynn huberty - morgan stanley & co. llc,Thank you. Good afternoon. The services growth...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857864,2,"timothy donald cook - apple, inc.","Yeah, I think my own view is that China and th...",kathryn lynn huberty - morgan stanley & co. llc,And it doesn't look like the threat of a trade...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857865,3,"timothy donald cook - apple, inc.","Well, Mike, it's Tim. Again, the great thing a...",michael j. olson - piper jaffray & co.,"Hey. Good afternoon, and thank you for taking ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857866,4,"timothy donald cook - apple, inc.",It's a good question. iPhone X was the most po...,michael j. olson - piper jaffray & co.,Okay. And then any potential tariff issues asi...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018


In [6]:
# format
aapl = pd.DataFrame(
    {
        "fyear": aapl_raw.apply(
            lambda x: str(x["year"]) + "-" + str(x["quater"]), axis=1
        ),
        "pid": aapl_raw["question_index"],
        "text": aapl_raw["question"],
    }
)
aapl.reset_index(drop=True, inplace=True)
aapl

Unnamed: 0,fyear,pid,text
0,2018-Q2,0,Thank you very much. I wanted to ask about you...
1,2018-Q2,1,Thank you. Good afternoon. The services growth...
2,2018-Q2,2,And it doesn't look like the threat of a trade...
3,2018-Q2,3,"Hey. Good afternoon, and thank you for taking ..."
4,2018-Q2,4,Okay. And then any potential tariff issues asi...
...,...,...,...
56,2017-Q4,11,Thank you so much for your details. Thank you.
57,2017-Q4,12,"Thanks a lot for taking my question, guys. I h..."
58,2017-Q4,13,Got it. That's really helpful. And I guess if ...
59,2017-Q4,14,"Yeah, Tim, I'm wondering if we take a look at ..."


### SKX

In [40]:
# years
np.sort(setence_new_parsed[setence_new_parsed['equity'] == 'SKX'].year.unique())

array(['2007', '2008', '2009', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018'], dtype=object)

In [42]:
# subset
filter_equity = setence_new_parsed['equity'] == 'SKX'
skx_raw = setence_new_parsed[filter_equity].copy()
filter_year = skx_raw['year'].apply(lambda x: x in ['2018', 2018])
filter_year_2 = skx_raw['year'].apply(lambda x: x in ['2017', 2017])
final_filter = np.logical_or(filter_year, filter_year_2)
skx_raw = skx_raw[final_filter].copy()

# filter two short sentences
skx_raw = skx_raw[skx_raw['question'].apply(lambda x: len(x.split()) > 5)]

# process index
for cur_query in tqdm(skx_raw['time_text'].unique()):
    subset = skx_raw[skx_raw['time_text'] == cur_query]
    skx_raw.loc[subset.index.tolist(), 'question_index'] = list(range(len(subset)))

print(skx_raw['year'].unique())
print(skx_raw['equity'].unique())
print(skx_raw['quater'].unique())
print(len(skx_raw))
skx_raw.head(5)

100%|██████████| 6/6 [00:00<00:00, 1467.82it/s]


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
2825,0,david weinberg,"Hey Scott, pretty good.",scott krasik,"Hey David, how are you doing?","SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2826,1,david weinberg,"Well, those -- all those items you mentioned, ...",scott krasik,"Good job. So, two questions here. First, just ...","SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2827,2,david weinberg,"Yes, I think it's fair to say the new product ...",scott krasik,And as you look -- I mean are you assuming tha...,"SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2828,3,david weinberg,Thanks.,scott krasik,Okay. Awesome. Thanks so much. Good luck.,"SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2829,4,david weinberg,Thank you.,david buckley,"Hi, this is David Buckley on for John Kernan. ...","SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017


In [43]:
# format
skx = pd.DataFrame(
    {
        "fyear": skx_raw.apply(
            lambda x: str(x["year"]) + "-" + str(x["quater"]), axis=1
        ),
        "pid": skx_raw["question_index"],
        "text": skx_raw["question"],
    }
)
skx.reset_index(drop=True, inplace=True)
skx

Unnamed: 0,fyear,pid,text
0,2017-Q1,0,"Hey David, how are you doing?"
1,2017-Q1,1,"Good job. So, two questions here. First, just ..."
2,2017-Q1,2,And as you look -- I mean are you assuming tha...
3,2017-Q1,3,Okay. Awesome. Thanks so much. Good luck.
4,2017-Q1,4,"Hi, this is David Buckley on for John Kernan. ..."
...,...,...,...
257,2018-Q1,33,And are you seeing a similar mix in your own s...
258,2018-Q1,34,And as far as visibility in the second quarter...
259,2018-Q1,35,"Yes, of course. And then your own retail store..."
260,2018-Q1,36,Right and of course. And just one more. In loo...


## Model

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')

## Compute Similarity

### AAPL

In [8]:
aapl_sim = compute_sim(aapl, model, window=10)
aapl_sim = post_process_df(aapl_sim)
aapl_sim = search_text(aapl_sim, aapl_raw)

aapl_sim

100it [00:00, 974.59it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q3,0-0,0.826229,0.807500,0.646000,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q1,2017-Q3,1-8,0.841549,0.807500,0.646000,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q1,2017-Q3,2-6,0.803858,0.807500,0.646000,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q1,2017-Q3,3-2,0.818691,0.807500,0.646000,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q1,2017-Q3,4-1,0.828638,0.807500,0.646000,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
...,...,...,...,...,...,...,...,...
2018-Q1,2018-Q2,6-9,0.778559,0.745475,0.512514,"(16, 11)",I have a question and follow-up. You commented...,Thank you very much. And I'll ask actually bot...
2018-Q1,2018-Q2,7-1,0.722830,0.745475,0.512514,"(16, 11)","And just a follow-up, maybe I could clarify tw...",Thank you. Good afternoon. The services growth...
2018-Q1,2018-Q2,10-7,0.694836,0.745475,0.512514,"(16, 11)","Yes, I was asking as a percent, the ownership....","Yes, Tim, I think there is China numbers are a..."
2018-Q1,2018-Q2,14-4,0.925555,0.745475,0.512514,"(16, 11)",I have two questions as well. I guess first on...,Okay. And then any potential tariff issues asi...


### SKX

In [44]:
skx_sim = compute_sim(skx, model, window=10)
skx_sim = post_process_df(skx_sim)
skx_sim = search_text(skx_sim, skx_raw)

skx_sim

100it [00:00, 858.59it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q3,0-0,0.826229,0.807500,0.646000,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q1,2017-Q3,1-8,0.841549,0.807500,0.646000,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q1,2017-Q3,2-6,0.803858,0.807500,0.646000,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q1,2017-Q3,3-2,0.818691,0.807500,0.646000,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q1,2017-Q3,4-1,0.828638,0.807500,0.646000,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
...,...,...,...,...,...,...,...,...
2018-Q1,2018-Q2,6-9,0.778559,0.745475,0.512514,"(16, 11)",I have a question and follow-up. You commented...,Thank you very much. And I'll ask actually bot...
2018-Q1,2018-Q2,7-1,0.722830,0.745475,0.512514,"(16, 11)","And just a follow-up, maybe I could clarify tw...",Thank you. Good afternoon. The services growth...
2018-Q1,2018-Q2,10-7,0.694836,0.745475,0.512514,"(16, 11)","Yes, I was asking as a percent, the ownership....","Yes, Tim, I think there is China numbers are a..."
2018-Q1,2018-Q2,14-4,0.925555,0.745475,0.512514,"(16, 11)",I have two questions as well. I guess first on...,Okay. And then any potential tariff issues asi...


In [46]:
skx_sim.loc['2017-Q1']

Unnamed: 0_level_0,Unnamed: 1_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year2,MaxSim-Question-Pair,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-Q3,0-0,0.826229,0.8075,0.646,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q3,1-8,0.841549,0.8075,0.646,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q3,2-6,0.803858,0.8075,0.646,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q3,3-2,0.818691,0.8075,0.646,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q3,4-1,0.828638,0.8075,0.646,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
2017-Q3,5-5,0.899853,0.8075,0.646,"(8, 10)","Yes, thank you. I just wanted to better unders...","Thank you, a question for Tim maybe on the iPh..."
2017-Q3,6-7,0.655626,0.8075,0.646,"(8, 10)","Thank you. First, I wanted to ask about the iP...",Good afternoon. I just have one question for T...
2017-Q3,7-9,0.785555,0.8075,0.646,"(8, 10)","Okay, that's great, thank you. And then, Tim, ...","Yes. Tim, growth in the smartphone market is n..."
2017-Q4,0-0,0.740561,0.789106,0.394553,"(8, 16)","Yes, thank you. First, Luca, what are the fact...","Thank you. Congrats on the quarter. Luca, when..."
2017-Q4,1-13,0.813766,0.789106,0.394553,"(8, 16)","Amit Daryanani - RBC Capital Markets LLC', 'Th...",Got it. That's really helpful. And I guess if ...


In [49]:
print(skx_sim.loc['2017-Q1', '2018-Q2', '4-7']['Question_Year1'])
print(skx_sim.loc['2017-Q1', '2018-Q2', '4-7']['Question_Year2'])

## Save Results

In [51]:
# save aapl
with open(os.path.join(os.getcwd(), 'data', 'result', 'aapl_sim.pkl'), 'wb') as f:
    pickle.dump(aapl_sim, f)
    
# save skx
with open(os.path.join(os.getcwd(), 'data', 'result', 'skx_sim.pkl'), 'wb') as f:
    pickle.dump(skx_sim, f)

In [52]:
# test load
with open(os.path.join(os.getcwd(), 'data', 'result', 'aapl_sim.pkl'), 'rb') as f:
    aapl_test = pickle.load(f)
aapl_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q3,0-0,0.826229,0.807500,0.646000,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q1,2017-Q3,1-8,0.841549,0.807500,0.646000,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q1,2017-Q3,2-6,0.803858,0.807500,0.646000,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q1,2017-Q3,3-2,0.818691,0.807500,0.646000,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q1,2017-Q3,4-1,0.828638,0.807500,0.646000,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
...,...,...,...,...,...,...,...,...
2018-Q1,2018-Q2,6-9,0.778559,0.745475,0.512514,"(16, 11)",I have a question and follow-up. You commented...,Thank you very much. And I'll ask actually bot...
2018-Q1,2018-Q2,7-1,0.722830,0.745475,0.512514,"(16, 11)","And just a follow-up, maybe I could clarify tw...",Thank you. Good afternoon. The services growth...
2018-Q1,2018-Q2,10-7,0.694836,0.745475,0.512514,"(16, 11)","Yes, I was asking as a percent, the ownership....","Yes, Tim, I think there is China numbers are a..."
2018-Q1,2018-Q2,14-4,0.925555,0.745475,0.512514,"(16, 11)",I have two questions as well. I guess first on...,Okay. And then any potential tariff issues asi...


In [53]:
# test load
with open(os.path.join(os.getcwd(), 'data', 'result', 'skx_sim.pkl'), 'rb') as f:
    skx_test = pickle.load(f)
skx_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q3,0-0,0.826229,0.807500,0.646000,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q1,2017-Q3,1-8,0.841549,0.807500,0.646000,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q1,2017-Q3,2-6,0.803858,0.807500,0.646000,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q1,2017-Q3,3-2,0.818691,0.807500,0.646000,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q1,2017-Q3,4-1,0.828638,0.807500,0.646000,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
...,...,...,...,...,...,...,...,...
2018-Q1,2018-Q2,6-9,0.778559,0.745475,0.512514,"(16, 11)",I have a question and follow-up. You commented...,Thank you very much. And I'll ask actually bot...
2018-Q1,2018-Q2,7-1,0.722830,0.745475,0.512514,"(16, 11)","And just a follow-up, maybe I could clarify tw...",Thank you. Good afternoon. The services growth...
2018-Q1,2018-Q2,10-7,0.694836,0.745475,0.512514,"(16, 11)","Yes, I was asking as a percent, the ownership....","Yes, Tim, I think there is China numbers are a..."
2018-Q1,2018-Q2,14-4,0.925555,0.745475,0.512514,"(16, 11)",I have two questions as well. I guess first on...,Okay. And then any potential tariff issues asi...
