In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from scipy.optimize import linear_sum_assignment
from tqdm import tqdm
from rich import print

## Match doc & Similarity functions

In [2]:
def match_two_docs(doc1, doc2):
    """Pair up the rows of two embedding matrices and score the pairing.

    Args:
        doc1: 2-D array, one embedding per row (first document).
        doc2: 2-D array, one embedding per row (second document).

    Returns:
        A 4-tuple ``(score, row_ind, col_ind, sim)`` where
        ``score`` is ``[total / min(sim.shape), total / max(sim.shape)]``
        (``total`` being the summed similarity of the matched pairs),
        ``row_ind`` / ``col_ind`` are the matched row and column indices
        returned by ``linear_sum_assignment``, and ``sim`` is the full
        rescaled similarity matrix.
    """
    # Dot product of every doc1 row with every doc2 row, then shifted and
    # halved; for unit-length rows this maps cosine similarity from [-1, 1]
    # onto [0, 1].
    similarity = (np.dot(doc1, doc2.T) + 1) / 2

    # linear_sum_assignment minimizes total cost, so pass it the distance
    # (1 - similarity) to obtain the pairing with the highest similarity.
    matched_rows, matched_cols = linear_sum_assignment(1 - similarity)

    # Summed similarity over the matched pairs only.
    matched_total = similarity[matched_rows, matched_cols].sum()

    smaller_side = min(similarity.shape)
    larger_side = max(similarity.shape)
    normalized = [matched_total / smaller_side, matched_total / larger_side]
    return normalized, matched_rows, matched_cols, similarity

def format_index_score(index_a, index_b, score, normalized_score, sim_shape):
    """Build one flat record per matched pair.

    Args:
        index_a: matched indices in the first document.
        index_b: matched indices in the second document.
        score: similarity of each matched pair (same order as the indices).
        normalized_score: two-element sequence ``(by_min, by_max)``.
        sim_shape: shape of the similarity matrix the match came from.

    Returns:
        A list of ``[a, b, score, by_min, by_max, sim_shape]`` lists; the last
        three entries are repeated unchanged on every record.
    """
    by_min, by_max = normalized_score
    records = []
    for a, b, pair_score in zip(index_a, index_b, score):
        records.append([a, b, pair_score, by_min, by_max, sim_shape])
    return records

def compute_sim(df, bert_model, window=1):
    """Match the texts of every pair of nearby periods found in ``df``.

    All texts are embedded once with ``bert_model.encode``. Then, for each
    period and each later period at most ``window`` quarters after it, the
    two groups of embeddings are paired with ``match_two_docs``.

    Args:
        df: DataFrame with an ``fyear`` column of period labels shaped like
            ``"<year>-Q<quarter>"`` (e.g. ``"2017-Q4"``) and a ``text`` column.
        bert_model: object with a SentenceTransformer-style ``encode`` method.
        window: maximum distance between the two periods, in quarters
            (``"2017-Q4"`` and ``"2018-Q1"`` are 1 quarter apart).

    Returns:
        ``{earlier_period: {later_period: records}}`` where ``records`` is the
        list produced by ``format_index_score``. Row/column indices in the
        records are positions within each period's group of rows, in ``df``
        order. The last period never appears as an outer key.
    """

    def _quarter_ordinal(period):
        # "2017-Q4" -> 2017 * 4 + 4, so consecutive quarters differ by exactly
        # one even across a year boundary.
        year, quarter = str(period).split('-Q')
        return int(year) * 4 + int(quarter)

    result = {}
    years = np.sort(df.fyear.unique())
    embeddings = bert_model.encode(
        df.text.tolist(),
        convert_to_tensor=False,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )

    # The embeddings are in df row order, so look rows up by position rather
    # than by index label; this stays correct when df's index is not 0..n-1.
    fyear_values = df.fyear.to_numpy()
    positions = {year: np.flatnonzero(fyear_values == year) for year in years}
    ordinals = [_quarter_ordinal(year) for year in years]

    for i in range(len(years) - 1):
        temp_dict = {}
        for j in range(i + 1, len(years)):
            # Compare whole periods (year and quarter), not just quarter digits.
            if ordinals[j] - ordinals[i] > window:
                continue
            doc1 = embeddings[positions[years[i]]]
            doc2 = embeddings[positions[years[j]]]
            score, row_ind, col_ind, sim = match_two_docs(doc1, doc2)
            # similarity of each matched pair
            matched_sim = sim[row_ind, col_ind]
            temp_dict[years[j]] = format_index_score(row_ind, col_ind, matched_sim, score, sim.shape)
        result[years[i]] = temp_dict
    return result

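# structure example of the dict returned by compute_sim (simplified):
# each inner entry is shown here as [row_index, col_index, similarity]; the real
# entries also carry normalized_by_min, normalized_by_max and the similarity-matrix shape.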
# m = {
#     "2017-Q1": {
#         "2017-Q2": [[0, 0, 0.5], [1, 9, 0.6]],
#         "2017-Q3": [[2, 8, 0.9], [10, 11, 0.12]],
#     },
#     "2017-Q2": {
#         "2017-Q1": [[13, 14, 0.15], [16, 17, 0.18]],
#         "2017-Q3": [[19, 20, 0.21], [22, 23, 0.24]],
#     },
# }

# post process funcs
def pair_dict(listOflist):
    """Key each record by its first two fields joined with a dash.

    Args:
        listOflist: iterable of sequences; the first two items of each identify
            the pair, the remaining items are its payload.

    Returns:
        Dict mapping ``"<item0>-<item1>"`` to the rest of the record
        (``record[2:]``). A repeated pair keeps the payload seen last.
    """
    return {
        str(record[0]) + '-' + str(record[1]): record[2:]
        for record in listOflist
    }

def post_process_df(m):
    """Flatten the nested match dict into a MultiIndex DataFrame.

    Args:
        m: ``{period1: {period2: records}}`` where each record is
            ``[row, col, similarity, normalized_by_min, normalized_by_max, shape]``.
            ``m`` is only read; it is not modified, so the function can be
            called again on the same input.

    Returns:
        DataFrame indexed by ``(Year1, Year2, MaxSim-Question-Pair)`` with the
        columns ``Similarity Score``, ``Normalized_by_min``,
        ``Normalized_by_max`` and ``Doc1_Doc2_shape``. The pair label is
        ``"<row>-<col>"``.
    """
    index_tuples = []
    similarity_score = []
    normalized_by_min = []
    normalized_by_max = []
    doc1_doc2_shape = []

    for year1, matches_by_year2 in m.items():
        for year2, records in matches_by_year2.items():
            # Label each record "<row>-<col>" (the same label pair_dict builds).
            # Going through a dict keeps the previous behaviour for a repeated
            # pair: one row, carrying the payload seen last.
            pairs = {
                str(record[0]) + '-' + str(record[1]): record[2:]
                for record in records
            }
            for pair_name, payload in pairs.items():
                index_tuples.append((year1, year2, pair_name))
                similarity_score.append(payload[0])
                normalized_by_min.append(payload[1])
                normalized_by_max.append(payload[2])
                doc1_doc2_shape.append(payload[3])

    index = pd.MultiIndex.from_tuples(
        index_tuples, names=["Year1", "Year2", "MaxSim-Question-Pair"]
    )
    return pd.DataFrame(
        {
            'Similarity Score': similarity_score,
            'Normalized_by_min': normalized_by_min,
            'Normalized_by_max': normalized_by_max,
            'Doc1_Doc2_shape': doc1_doc2_shape,
        },
        index=index,
    )

# search text
def search_text(the_df, search_df):
    """Attach the text of both matched questions to every row of ``the_df``.

    Args:
        the_df: DataFrame whose index entries are
            ``(period1, period2, pair)`` with periods like ``"2017-Q1"`` and
            ``pair`` like ``"3-2"`` (question index in period1, then period2).
        search_df: DataFrame with ``year``, ``quater``, ``question_index`` and
            ``question`` columns. ``year`` may hold ints or strings.

    Returns:
        A copy of ``the_df`` with two added columns, ``Question_Year1`` and
        ``Question_Year2``. Neither input is modified.

    Raises:
        IndexError: if a referenced question is not present in ``search_df``.
    """

    def _year_key(value):
        # Make 2017 and "2017" land on the same key: integral numbers become
        # their decimal string; everything else is kept as it is.
        if isinstance(value, str):
            return value
        try:
            as_int = int(value)
        except (TypeError, ValueError, OverflowError):
            return value
        return str(as_int) if as_int == value else value

    # Build the (year, quarter, question_index) -> question table once instead
    # of re-filtering search_df for every row. setdefault keeps the first
    # occurrence, matching the previous ``.iloc[0]`` choice.
    lookup = {}
    for year, quarter, q_index, question in zip(
        search_df['year'],
        search_df["quater"],
        search_df["question_index"],
        search_df["question"],
    ):
        lookup.setdefault((_year_key(year), quarter, q_index), question)

    def _find(period, q_index_text):
        year, quarter = period.split("-")
        key = (year, quarter, int(q_index_text))
        if key not in lookup:
            raise IndexError(
                f"no question found for year={year!r}, quarter={quarter!r}, "
                f"question_index={key[2]!r}"
            )
        return lookup[key]

    the_df = the_df.copy()
    question_from_1 = []
    question_from_2 = []
    for year1, year2, question_pair in tqdm(the_df.index, total=len(the_df)):
        question_1_index, question_2_index = question_pair.split("-")
        question_from_1.append(_find(year1, question_1_index))
        question_from_2.append(_find(year2, question_2_index))
    the_df["Question_Year1"] = question_from_1
    the_df["Question_Year2"] = question_from_2

    return the_df


## Format DF

In [3]:
# Load the processed conference-call Q&A table from ./data/processed
# (resolved against the current working directory) and preview the first rows.
# NOTE(review): the `year` column appears to hold a mix of ints and strings
# (compare the AAPL and SKX year listings below), which is why later filters
# test both forms; passing an explicit dtype to read_csv would presumably
# remove the need for that — TODO confirm.
setence_new_parsed = pd.read_csv(os.path.join(os.getcwd(), 'data', 'processed', 'Conf_transcripts_new_parsed_processed.csv'))
setence_new_parsed.head()

  setence_new_parsed = pd.read_csv(os.path.join(os.getcwd(), 'data', 'processed', 'Conf_transcripts_new_parsed_processed.csv'))


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
0,1,"george paleologou, will kalutycz","Yes, generally speaking, a lot of the rebalanc...",sabahat khan,Thanks and good afternoon. Can you maybe talk ...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
1,2,will kalutycz,"Yes, it’s kind of a mixture, the Sandwich plan...",sabahat khan,"Alright, thanks. And then you mentioned that H...",Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
2,3,will kalutycz,"Yes, again, I mean we deal in all commodities ...",sabahat khan,Great. And then just if you could give us upda...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
3,5,george paleologou,Thanks George.,george doumet,"Hey, good afternoon guys and congrats on the q...",Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017
4,6,george paleologou,"Yes, actually, it was more -- it was not too m...",george doumet,Just looking at Alberta first year-over-year g...,Premium Brands Holding Corp. (OTC:PRBZF) Q1 20...,PRBZF,Q1,2017


### AAPL

In [4]:
# Distinct `year` values present for AAPL, sorted.
np.sort(setence_new_parsed[setence_new_parsed['equity'] == 'AAPL'].year.unique())

array([2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018], dtype=object)

In [5]:
# Subset: keep AAPL rows from 2017 and 2018. `year` is tested against both the
# string and the int form because the column holds mixed types.
filter_equity = setence_new_parsed['equity'] == 'AAPL'
aapl_raw = setence_new_parsed[filter_equity]
filter_year = aapl_raw['year'].apply(lambda x: x in ['2018', 2018])
filter_year_2 = aapl_raw['year'].apply(lambda x: x in ['2017', 2017])
final_filter = np.logical_or(filter_year, filter_year_2)
aapl_raw = aapl_raw[final_filter].copy()

# Drop questions that are too short: keep only those with more than 5 words.
aapl_raw = aapl_raw[aapl_raw['question'].apply(lambda x: len(x.split()) > 5)]

# Re-number `question_index` as 0..n-1 within each transcript (`time_text`).
# NOTE(review): the SKX cell re-numbers per (year, quater) instead, and
# search_text looks questions up by (year, quater, question_index); grouping by
# `time_text` presumably assumes one transcript per quarter — TODO confirm.
for cur_query in tqdm(aapl_raw['time_text'].unique()):
    subset = aapl_raw[aapl_raw['time_text'] == cur_query]
    aapl_raw.loc[subset.index.tolist(), 'question_index'] = list(range(len(subset)))

# Sanity check of what survived the filters, then preview.
print(aapl_raw['year'].unique())
print(aapl_raw['equity'].unique())
print(aapl_raw['quater'].unique())
aapl_raw.head(5)

100%|██████████| 5/5 [00:00<00:00, 1273.16it/s]


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
1857862,0,"timothy donald cook - apple, inc.","Sure. Shannon, it's Tim. As Luca mentioned ear...",shannon s. cross - cross research llc,Thank you very much. I wanted to ask about you...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857863,1,"timothy donald cook - apple, inc.","Hi, Katy. It's Tim. The services grew 31%. We ...",kathryn lynn huberty - morgan stanley & co. llc,Thank you. Good afternoon. The services growth...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857864,2,"timothy donald cook - apple, inc.","Yeah, I think my own view is that China and th...",kathryn lynn huberty - morgan stanley & co. llc,And it doesn't look like the threat of a trade...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857865,3,"timothy donald cook - apple, inc.","Well, Mike, it's Tim. Again, the great thing a...",michael j. olson - piper jaffray & co.,"Hey. Good afternoon, and thank you for taking ...","Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018
1857866,4,"timothy donald cook - apple, inc.",It's a good question. iPhone X was the most po...,michael j. olson - piper jaffray & co.,Okay. And then any potential tariff issues asi...,"Apple, Inc. (NASDAQ:AAPL) Q2 2018 Earnings Cal...",AAPL,Q2,2018


In [6]:
# Format: reduce to the three columns compute_sim works with.
#   fyear - period label "<year>-<quater>", e.g. "2018-Q2"
#   pid   - question index within the period
#   text  - the question text
# The label is built column-wise (element-wise str + concatenation) instead of
# with a row-wise apply, and the index is reset without mutating in place so the
# cell gives the same result every time it is re-run.
aapl = pd.DataFrame(
    {
        "fyear": aapl_raw["year"].map(str) + "-" + aapl_raw["quater"].map(str),
        "pid": aapl_raw["question_index"],
        "text": aapl_raw["question"],
    }
).reset_index(drop=True)
aapl

Unnamed: 0,fyear,pid,text
0,2018-Q2,0,Thank you very much. I wanted to ask about you...
1,2018-Q2,1,Thank you. Good afternoon. The services growth...
2,2018-Q2,2,And it doesn't look like the threat of a trade...
3,2018-Q2,3,"Hey. Good afternoon, and thank you for taking ..."
4,2018-Q2,4,Okay. And then any potential tariff issues asi...
...,...,...,...
56,2017-Q4,11,Thank you so much for your details. Thank you.
57,2017-Q4,12,"Thanks a lot for taking my question, guys. I h..."
58,2017-Q4,13,Got it. That's really helpful. And I guess if ...
59,2017-Q4,14,"Yeah, Tim, I'm wondering if we take a look at ..."


### SKX

In [7]:
# Distinct `year` values present for SKX, sorted.
np.sort(setence_new_parsed[setence_new_parsed['equity'] == 'SKX'].year.unique())

array(['2007', '2008', '2009', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018'], dtype=object)

In [8]:
# Subset: keep SKX rows from 2017 and 2018. `year` is tested against both the
# string and the int form because the column holds mixed types.
filter_equity = setence_new_parsed['equity'] == 'SKX'
skx_raw = setence_new_parsed[filter_equity].copy()
filter_year = skx_raw['year'].apply(lambda x: x in ['2018', 2018])
filter_year_2 = skx_raw['year'].apply(lambda x: x in ['2017', 2017])
final_filter = np.logical_or(filter_year, filter_year_2)
skx_raw = skx_raw[final_filter].copy()


# Drop questions that are too short: keep only those with more than 5 words.
skx_raw = skx_raw[skx_raw['question'].apply(lambda x: len(x.split()) > 5)]

# Re-number `question_index` as 0..n-1 within each (year, quater) group.
# Earlier per-transcript (`time_text`) variant, kept for reference:
# for cur_query in tqdm(skx_raw['time_text'].unique()):
#     subset = skx_raw[skx_raw['time_text'] == cur_query]
#     skx_raw.loc[subset.index.tolist(), 'question_index'] = list(range(len(subset)))
for cur_query in tqdm(skx_raw['year'].unique()):
    subset1 = skx_raw[skx_raw['year'] == cur_query]
    for cur_query2 in subset1['quater'].unique():
        subset2 = subset1[subset1['quater'] == cur_query2]
        skx_raw.loc[subset2.index.tolist(), 'question_index'] = list(range(len(subset2)))

# Sanity check of what survived the filters, then preview.
print(skx_raw['year'].unique())
print(skx_raw['equity'].unique())
print(skx_raw['quater'].unique())
print(len(skx_raw))
skx_raw.head(5)

100%|██████████| 2/2 [00:00<00:00, 416.83it/s]


Unnamed: 0,question_index,answeree,answer,questioner,question,time_text,equity,quater,year
2825,0,david weinberg,"Hey Scott, pretty good.",scott krasik,"Hey David, how are you doing?","SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2826,1,david weinberg,"Well, those -- all those items you mentioned, ...",scott krasik,"Good job. So, two questions here. First, just ...","SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2827,2,david weinberg,"Yes, I think it's fair to say the new product ...",scott krasik,And as you look -- I mean are you assuming tha...,"SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2828,3,david weinberg,Thanks.,scott krasik,Okay. Awesome. Thanks so much. Good luck.,"SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017
2829,4,david weinberg,Thank you.,david buckley,"Hi, this is David Buckley on for John Kernan. ...","SKECHERS USA, Inc. (NYSE:SKX)Q1 2017 Results E...",SKX,Q1,2017


In [9]:
# Format: reduce to the three columns compute_sim works with.
#   fyear - period label "<year>-<quater>", e.g. "2017-Q1"
#   pid   - question index within the period
#   text  - the question text
# The label is built column-wise (element-wise str + concatenation) instead of
# with a row-wise apply, and the index is reset without mutating in place so the
# cell gives the same result every time it is re-run.
skx = pd.DataFrame(
    {
        "fyear": skx_raw["year"].map(str) + "-" + skx_raw["quater"].map(str),
        "pid": skx_raw["question_index"],
        "text": skx_raw["question"],
    }
).reset_index(drop=True)
skx

Unnamed: 0,fyear,pid,text
0,2017-Q1,0,"Hey David, how are you doing?"
1,2017-Q1,1,"Good job. So, two questions here. First, just ..."
2,2017-Q1,2,And as you look -- I mean are you assuming tha...
3,2017-Q1,3,Okay. Awesome. Thanks so much. Good luck.
4,2017-Q1,4,"Hi, this is David Buckley on for John Kernan. ..."
...,...,...,...
257,2018-Q1,33,And are you seeing a similar mix in your own s...
258,2018-Q1,34,And as far as visibility in the second quarter...
259,2018-Q1,35,"Yes, of course. And then your own retail store..."
260,2018-Q1,36,Right and of course. And just one more. In loo...


## Model

In [10]:
# Sentence-embedding model handed to compute_sim; loaded by name ('all-MiniLM-L6-v2').
model = SentenceTransformer('all-MiniLM-L6-v2')

## Compute Similarity

### AAPL

In [11]:
# Each pipeline stage gets its own name, so no variable changes type between
# steps and any single line can be re-run on its own.
# window=10 is large enough to compare every pair of periods in the 2017-2018 subset.
aapl_matches = compute_sim(aapl, model, window=10)  # nested dict of matched pairs
aapl_pairs = post_process_df(aapl_matches)          # MultiIndex DataFrame of scores
aapl_sim = search_text(aapl_pairs, aapl_raw)        # scores plus both question texts

aapl_sim

100it [00:00, 739.09it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q3,0-0,0.826229,0.807500,0.646000,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q1,2017-Q3,1-8,0.841549,0.807500,0.646000,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q1,2017-Q3,2-6,0.803858,0.807500,0.646000,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q1,2017-Q3,3-2,0.818691,0.807500,0.646000,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q1,2017-Q3,4-1,0.828638,0.807500,0.646000,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
...,...,...,...,...,...,...,...,...
2018-Q1,2018-Q2,6-9,0.778559,0.745475,0.512514,"(16, 11)",I have a question and follow-up. You commented...,Thank you very much. And I'll ask actually bot...
2018-Q1,2018-Q2,7-1,0.722830,0.745475,0.512514,"(16, 11)","And just a follow-up, maybe I could clarify tw...",Thank you. Good afternoon. The services growth...
2018-Q1,2018-Q2,10-7,0.694836,0.745475,0.512514,"(16, 11)","Yes, I was asking as a percent, the ownership....","Yes, Tim, I think there is China numbers are a..."
2018-Q1,2018-Q2,14-4,0.925555,0.745475,0.512514,"(16, 11)",I have two questions as well. I guess first on...,Okay. And then any potential tariff issues asi...


### SKX

In [14]:
# Each pipeline stage gets its own name, so no variable changes type between
# steps and any single line can be re-run on its own.
# window=10 is large enough to compare every pair of periods in the 2017-2018 subset.
skx_matches = compute_sim(skx, model, window=10)  # nested dict of matched pairs
skx_pairs = post_process_df(skx_matches)          # MultiIndex DataFrame of scores
skx_sim = search_text(skx_pairs, skx_raw)         # scores plus both question texts

skx_sim

431it [00:00, 719.95it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q2,0-0,0.834134,0.753525,0.395600,"(42, 80)","Hey David, how are you doing?","Hey, David. How are you doing? Congrats on a b..."
2017-Q1,2017-Q2,1-14,0.792789,0.753525,0.395600,"(42, 80)","Good job. So, two questions here. First, just ...","Good afternoon, David. Thanks for taking my qu..."
2017-Q1,2017-Q2,2-39,0.789668,0.753525,0.395600,"(42, 80)",And as you look -- I mean are you assuming tha...,Okay. And then I think you see more spring 201...
2017-Q1,2017-Q2,3-38,0.639594,0.753525,0.395600,"(42, 80)",Okay. Awesome. Thanks so much. Good luck.,Okay. And then the price increase went through...
2017-Q1,2017-Q2,4-5,0.666190,0.753525,0.395600,"(42, 80)","Hi, this is David Buckley on for John Kernan. ...","Good afternoon, David. Thanks for taking my qu..."
...,...,...,...,...,...,...,...,...
2017-Q4,2018-Q1,42-31,0.980718,0.778037,0.579714,"(51, 38)",All right. Thanks very much. Best of luck.,All right. Thanks and best of luck.
2017-Q4,2018-Q1,43-11,0.926188,0.778037,0.579714,"(51, 38)",Good afternoon. Thanks for taking my question.,Hi. Good afternoon everybody. Thanks for takin...
2017-Q4,2018-Q1,44-12,0.731424,0.778037,0.579714,"(51, 38)",Hi. I just wanted to touch on some of the newe...,I think the SG&A dollar growth is obviously a ...
2017-Q4,2018-Q1,46-36,0.728796,0.778037,0.579714,"(51, 38)","Hey, thanks for fitting me back in. Just a cou...",Right and of course. And just one more. In loo...


In [15]:
# All SKX matches whose first period (Year1) is 2017-Q1.
skx_sim.loc['2017-Q1']

Unnamed: 0_level_0,Unnamed: 1_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year2,MaxSim-Question-Pair,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-Q2,0-0,0.834134,0.753525,0.395600,"(42, 80)","Hey David, how are you doing?","Hey, David. How are you doing? Congrats on a b..."
2017-Q2,1-14,0.792789,0.753525,0.395600,"(42, 80)","Good job. So, two questions here. First, just ...","Good afternoon, David. Thanks for taking my qu..."
2017-Q2,2-39,0.789668,0.753525,0.395600,"(42, 80)",And as you look -- I mean are you assuming tha...,Okay. And then I think you see more spring 201...
2017-Q2,3-38,0.639594,0.753525,0.395600,"(42, 80)",Okay. Awesome. Thanks so much. Good luck.,Okay. And then the price increase went through...
2017-Q2,4-5,0.666190,0.753525,0.395600,"(42, 80)","Hi, this is David Buckley on for John Kernan. ...","Good afternoon, David. Thanks for taking my qu..."
...,...,...,...,...,...,...,...
2018-Q1,35-36,0.769019,0.745954,0.674911,"(42, 38)",Hi David. I wonder if you can just maybe give ...,Right and of course. And just one more. In loo...
2018-Q1,36-18,0.814759,0.745954,0.674911,"(42, 38)",Okay. Understood. And then not to beat a dead ...,Thank you for taking my questions. Good aftern...
2018-Q1,37-26,0.706237,0.745954,0.674911,"(42, 38)","Okay, good. And then one final one. Just on Ca...",Should we think about that number as like in t...
2018-Q1,38-23,0.719074,0.745954,0.674911,"(42, 38)",Hey David. Thanks for squeezing me in. Just wa...,Okay. Great. And then just a follow-up on Tom'...


## Save Results

In [17]:
# Create the output directory if needed; open(..., 'wb') does not create
# missing parent directories.
result_dir = os.path.join(os.getcwd(), 'data', 'result')
os.makedirs(result_dir, exist_ok=True)

# save aapl
with open(os.path.join(result_dir, 'aapl_sim.pkl'), 'wb') as f:
    pickle.dump(aapl_sim, f)

# save skx
with open(os.path.join(result_dir, 'skx_sim.pkl'), 'wb') as f:
    pickle.dump(skx_sim, f)

In [18]:
# Round-trip check: read the AAPL result back from the pickle written above.
# pickle.load can execute code embedded in the file, so only load pickles this
# notebook produced itself.
with open(os.path.join(os.getcwd(), 'data', 'result', 'aapl_sim.pkl'), 'rb') as f:
    aapl_test = pickle.load(f)
aapl_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q3,0-0,0.826229,0.807500,0.646000,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q1,2017-Q3,1-8,0.841549,0.807500,0.646000,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q1,2017-Q3,2-6,0.803858,0.807500,0.646000,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q1,2017-Q3,3-2,0.818691,0.807500,0.646000,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q1,2017-Q3,4-1,0.828638,0.807500,0.646000,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
...,...,...,...,...,...,...,...,...
2018-Q1,2018-Q2,6-9,0.778559,0.745475,0.512514,"(16, 11)",I have a question and follow-up. You commented...,Thank you very much. And I'll ask actually bot...
2018-Q1,2018-Q2,7-1,0.722830,0.745475,0.512514,"(16, 11)","And just a follow-up, maybe I could clarify tw...",Thank you. Good afternoon. The services growth...
2018-Q1,2018-Q2,10-7,0.694836,0.745475,0.512514,"(16, 11)","Yes, I was asking as a percent, the ownership....","Yes, Tim, I think there is China numbers are a..."
2018-Q1,2018-Q2,14-4,0.925555,0.745475,0.512514,"(16, 11)",I have two questions as well. I guess first on...,Okay. And then any potential tariff issues asi...


In [19]:
# All AAPL matches whose first period (Year1) is 2017-Q1.
aapl_test.loc['2017-Q1']

Unnamed: 0_level_0,Unnamed: 1_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year2,MaxSim-Question-Pair,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-Q3,0-0,0.826229,0.8075,0.646,"(8, 10)","Yes, thank you. First, Luca, what are the fact...","Yes, thanks. Good afternoon. Luca, first quest..."
2017-Q3,1-8,0.841549,0.8075,0.646,"(8, 10)","Amit Daryanani - RBC Capital Markets LLC', 'Th...","Thanks a lot. Good afternoon, guys. I guess to..."
2017-Q3,2-6,0.803858,0.8075,0.646,"(8, 10)","Luca, I'm wondering if you could talk a little...","Yes, thank you. I have one for Luca and one fo..."
2017-Q3,3-2,0.818691,0.8075,0.646,"(8, 10)","Hi, thank you. I wanted to dig in a little bit...",Great. Thank you. And then can you talk a bit ...
2017-Q3,4-1,0.828638,0.8075,0.646,"(8, 10)","Thank you, and then just one follow-up on Chin...",Thank you very much for taking my question. Ti...
2017-Q3,5-5,0.899853,0.8075,0.646,"(8, 10)","Yes, thank you. I just wanted to better unders...","Thank you, a question for Tim maybe on the iPh..."
2017-Q3,6-7,0.655626,0.8075,0.646,"(8, 10)","Thank you. First, I wanted to ask about the iP...",Good afternoon. I just have one question for T...
2017-Q3,7-9,0.785555,0.8075,0.646,"(8, 10)","Okay, that's great, thank you. And then, Tim, ...","Yes. Tim, growth in the smartphone market is n..."
2017-Q4,0-0,0.740561,0.789106,0.394553,"(8, 16)","Yes, thank you. First, Luca, what are the fact...","Thank you. Congrats on the quarter. Luca, when..."
2017-Q4,1-13,0.813766,0.789106,0.394553,"(8, 16)","Amit Daryanani - RBC Capital Markets LLC', 'Th...",Got it. That's really helpful. And I guess if ...


In [20]:
# Full text of the 2017-Q1 question in matched pair 3-2 (2017-Q1 vs 2017-Q3).
aapl_test.loc['2017-Q1', '2017-Q3', '3-2']['Question_Year1']

'Hi, thank you. I wanted to dig in a little bit more into the iPhone upside in the quarter with record revenues in every region except for Greater China. I think you touched on the percent of switchers in China. But can you give a little more color on the split of upgrades and switchers in some of the other regions and overall as well?'

In [21]:
# Full text of the 2017-Q3 question in matched pair 3-2 (2017-Q1 vs 2017-Q3).
aapl_test.loc['2017-Q1', '2017-Q3', '3-2']['Question_Year2']

"Great. Thank you. And then can you talk a bit about the composition of the installed base of iPhones at this point, as obviously we're getting close to a refresh? Just you brought in the iPhone SE. You've obviously had strength at the high end. I'm just trying to think about what percent do you think have upgraded in the prior generation, any color you can give us on that?"

In [22]:
# Full text of the 2017-Q1 question in matched pair 4-7 (2017-Q1 vs 2018-Q2).
aapl_test.loc['2017-Q1', '2018-Q2', '4-7']['Question_Year1']

"Thank you, and then just one follow-up on China specifically. As your comps get easier this year, I was curious if you think you're going to be returning to growth in that region. And then just to give us the context, I know you talked about the 6% constant currency increase in revenue in Mainland China. But curious if you strip out the double-digit increase in iPad and MacBook and potentially the mix shift to the iPhone 7 Plus, I'm curious what underlying iPhone units did in Mainland China."

In [23]:
# Full text of the 2018-Q2 question in matched pair 4-7 (2017-Q1 vs 2018-Q2).
aapl_test.loc['2017-Q1', '2018-Q2', '4-7']['Question_Year2']

"Yes, Tim, I think there is China numbers are actually phenomenal in the quarter and third consecutive quarter of growth. I think there's been a lot of concerns just Apple in China and maybe misinformation out there. But what do you see as the drivers for Apple in both Mainland China and Greater China over the next few years? And also if you could just give us an update on what you're seeing in India."

In [24]:
# Round-trip check: read the SKX result back from the pickle written above.
# pickle.load can execute code embedded in the file, so only load pickles this
# notebook produced itself.
with open(os.path.join(os.getcwd(), 'data', 'result', 'skx_sim.pkl'), 'rb') as f:
    skx_test = pickle.load(f)
skx_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year1,Year2,MaxSim-Question-Pair,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-Q1,2017-Q2,0-0,0.834134,0.753525,0.395600,"(42, 80)","Hey David, how are you doing?","Hey, David. How are you doing? Congrats on a b..."
2017-Q1,2017-Q2,1-14,0.792789,0.753525,0.395600,"(42, 80)","Good job. So, two questions here. First, just ...","Good afternoon, David. Thanks for taking my qu..."
2017-Q1,2017-Q2,2-39,0.789668,0.753525,0.395600,"(42, 80)",And as you look -- I mean are you assuming tha...,Okay. And then I think you see more spring 201...
2017-Q1,2017-Q2,3-38,0.639594,0.753525,0.395600,"(42, 80)",Okay. Awesome. Thanks so much. Good luck.,Okay. And then the price increase went through...
2017-Q1,2017-Q2,4-5,0.666190,0.753525,0.395600,"(42, 80)","Hi, this is David Buckley on for John Kernan. ...","Good afternoon, David. Thanks for taking my qu..."
...,...,...,...,...,...,...,...,...
2017-Q4,2018-Q1,42-31,0.980718,0.778037,0.579714,"(51, 38)",All right. Thanks very much. Best of luck.,All right. Thanks and best of luck.
2017-Q4,2018-Q1,43-11,0.926188,0.778037,0.579714,"(51, 38)",Good afternoon. Thanks for taking my question.,Hi. Good afternoon everybody. Thanks for takin...
2017-Q4,2018-Q1,44-12,0.731424,0.778037,0.579714,"(51, 38)",Hi. I just wanted to touch on some of the newe...,I think the SG&A dollar growth is obviously a ...
2017-Q4,2018-Q1,46-36,0.728796,0.778037,0.579714,"(51, 38)","Hey, thanks for fitting me back in. Just a cou...",Right and of course. And just one more. In loo...


In [35]:
# All SKX matches whose first period (Year1) is 2017-Q1.
skx_test.loc['2017-Q1']

Unnamed: 0_level_0,Unnamed: 1_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year2,MaxSim-Question-Pair,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-Q2,0-0,0.834134,0.753525,0.395600,"(42, 80)","Hey David, how are you doing?","Hey, David. How are you doing? Congrats on a b..."
2017-Q2,1-14,0.792789,0.753525,0.395600,"(42, 80)","Good job. So, two questions here. First, just ...","Good afternoon, David. Thanks for taking my qu..."
2017-Q2,2-39,0.789668,0.753525,0.395600,"(42, 80)",And as you look -- I mean are you assuming tha...,Okay. And then I think you see more spring 201...
2017-Q2,3-38,0.639594,0.753525,0.395600,"(42, 80)",Okay. Awesome. Thanks so much. Good luck.,Okay. And then the price increase went through...
2017-Q2,4-5,0.666190,0.753525,0.395600,"(42, 80)","Hi, this is David Buckley on for John Kernan. ...","Good afternoon, David. Thanks for taking my qu..."
...,...,...,...,...,...,...,...
2018-Q1,35-36,0.769019,0.745954,0.674911,"(42, 38)",Hi David. I wonder if you can just maybe give ...,Right and of course. And just one more. In loo...
2018-Q1,36-18,0.814759,0.745954,0.674911,"(42, 38)",Okay. Understood. And then not to beat a dead ...,Thank you for taking my questions. Good aftern...
2018-Q1,37-26,0.706237,0.745954,0.674911,"(42, 38)","Okay, good. And then one final one. Just on Ca...",Should we think about that number as like in t...
2018-Q1,38-23,0.719074,0.745954,0.674911,"(42, 38)",Hey David. Thanks for squeezing me in. Just wa...,Okay. Great. And then just a follow-up on Tom'...


In [41]:
# Full text of the 2017-Q1 question in matched pair 1-14 (2017-Q1 vs 2017-Q2).
skx_test.loc['2017-Q1', '2017-Q2', '1-14']['Question_Year1']

"Good job. So, two questions here. First, just trying to understand the backlog, maybe how do we think about mid-single-digits, I guess domestically, I think it was a mid to high single at the end of the year and clearly you're not guiding to that level of growth. So, what's assumed in your flat to up low single-digit growth for the second quarter? Is it much more weighted to 3Q? Does it depend on how back-to-school ships? And then how are the new products sort of built in to that?"

In [42]:
# Full text of the 2017-Q2 question in matched pair 1-14 (2017-Q1 vs 2017-Q2).
skx_test.loc['2017-Q1', '2017-Q2', '1-14']['Question_Year2']

'Good afternoon, David. Thanks for taking my question. I want to follow-up on the domestic wholesale guide for the third quarter. Can you potentially qualify in dollar terms how much was shifted from 3Q into 2Q? And is there any potential pull-forward from 4Q into 3Q that we should consider for this year?'

In [45]:
# All SKX matches whose first period (Year1) is 2017-Q4.
skx_test.loc['2017-Q4']

Unnamed: 0_level_0,Unnamed: 1_level_0,Similarity Score,Normalized_by_min,Normalized_by_max,Doc1_Doc2_shape,Question_Year1,Question_Year2
Year2,MaxSim-Question-Pair,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-Q1,1-21,0.786189,0.778037,0.579714,"(51, 38)",It looks like you guys had a very productive f...,Good afternoon. Thanks for taking my questions...
2018-Q1,2-34,0.674866,0.778037,0.579714,"(51, 38)",Where along that kind of 12% to 13% spectrum t...,And as far as visibility in the second quarter...
2018-Q1,3-14,0.808947,0.778037,0.579714,"(51, 38)","Okay, that's fair. And then, I just had a quic...",Okay. And then just few quick follow-up questi...
2018-Q1,4-8,0.80514,0.778037,0.579714,"(51, 38)","Okay. Great. I'm sorry, I just, if I could sne...","Thank you. And just lastly, could you just may..."
2018-Q1,6-19,0.70272,0.778037,0.579714,"(51, 38)","Just a couple of questions, so the sales shift...",I understand that. Okay. So let's move on from...
2018-Q1,7-26,0.739748,0.778037,0.579714,"(51, 38)","Okay. And then, thanks for sharing that $6 bil...",Should we think about that number as like in t...
2018-Q1,8-15,0.872132,0.778037,0.579714,"(51, 38)","Great. All right, well, good luck. Thanks.",All right. Thanks guys. Best of luck.
2018-Q1,9-9,0.771075,0.778037,0.579714,"(51, 38)","Hi, good afternoon, guys. Thanks for taking my...",Have the comps held up as you got into Q2? Wha...
2018-Q1,10-32,0.80374,0.778037,0.579714,"(51, 38)","Impressive. John, did you give any detail betw...","Thank you very much. This is Westcott, on for ..."
2018-Q1,11-37,0.933767,0.778037,0.579714,"(51, 38)","Okay, great. Thanks. Best of luck.",Okay. Thank you very much. Good luck.


In [46]:
# Full text of the 2017-Q4 question in matched pair 29-24 (2017-Q4 vs 2018-Q1).
skx_test.loc['2017-Q4', '2018-Q1', '29-24']['Question_Year1']

"That's great. And then lastly and last question here is on tax. I think you guys gave a nice range here of 12% to 17% for the year. Obviously, there's still a lack of visibility on what's going to happen further going forward. But at least for the first quarter, any kind of directional guidance on where we could think the tax rate can be for the quarter?"

In [47]:
# Full text of the 2018-Q1 question in matched pair 29-24 (2017-Q4 vs 2018-Q1).
skx_test.loc['2017-Q4', '2018-Q1', '29-24']['Question_Year2']

"Okay. That's very helpful. And then the tax rate cam in shy of 10%. Can you help us think about the puts and takes in terms of by quarter, should we think of the second quarter just to meaningful step up function to get to your full year tax rate range?"

In [43]:
# Full text of the 2017-Q4 question in matched pair 44-12 (2017-Q4 vs 2018-Q1).
skx_test.loc['2017-Q4', '2018-Q1', '44-12']['Question_Year1']

"Hi. I just wanted to touch on some of the newer international markets you've taken over in the last couple of years, Latin America and Korea. I know Korea is called out as one of the best growers. Overall, how are those doing? Did you start to see any margin improvement in fourth quarter? And how do you expect the margins in those businesses to trend next year?"

In [44]:
# Full text of the 2018-Q1 question in matched pair 44-12 (2017-Q4 vs 2018-Q1).
skx_test.loc['2017-Q4', '2018-Q1', '44-12']['Question_Year2']

"I think the SG&A dollar growth is obviously a point on the income statement that the sell-side has had trouble modeling in the past year or so. And it looks like the guidance implied for Q2 is pretty significant in terms of overall growth. You are now guiding to leverage, as I understand, in Q3. Can you just help us understand how you see SG&A dollar growth and SG&A rates progressing for the rest of the year? Because I think it's just creating a lot of volatility in terms of expectations. Thank you."