### Data Path

The following cell sets the data paths; replace them with your own directories. Sample data can be found in the corresponding sample folders.

* Put the original earnings conference call transcript HTML files from Seeking Alpha in `origin_path`.
* Download the stock price file from WRDS and put it in the current folder (see `price_sample.csv`).  
* Download `glove.840B.300d.zip` from <https://nlp.stanford.edu/projects/glove/> and unzip it in the current folder.

In [1]:
# Folder holding the transcript HTML files exactly as they were downloaded.
origin_path = "origin_sample"

# Folder that receives the renamed copies of the HTML files.
html_path = "html_sample"
# Folder that receives one parsed JSON file per transcript.
json_path = "json_sample"
# Stock price CSV file described in the instructions above.
price_file = "price_sample.csv"

### Import libs

If you don't have the libraries listed below, install them first.

In [2]:
import os 
import re
import json
import shutil
import pickle

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from dateutil.parser import parse
from nltk.corpus import stopwords

In [3]:
def parse_time(s):
    """Find a date inside free-form text and return it as a compact string.

    The text is handed to ``dateutil``'s parser in fuzzy mode with time zones
    ignored; the result is the year followed by the zero-padded month and day
    (for example ``"20190704"``).
    """
    moment = parse(s, fuzzy=True, ignoretz=True)
    return "%d%02d%02d" % (moment.year, moment.month, moment.day)

### Rename Downloaded HTMLs

In [4]:
# Copy every downloaded transcript into html_path as "<YYYYMMDD>_<TICKER>.html".
# The date is read from the article's <time> tag; the ticker is the part after
# the last ":" of the first parenthesised group found in the opening
# paragraphs of the article body. Pages missing either piece are skipped.
files = os.listdir(origin_path)

# copyfile does not create missing folders, so make sure the target exists.
os.makedirs(html_path, exist_ok=True)

with tqdm(files) as tq:
    for file in tq:
        src = os.path.join(origin_path, file)
        # Test the path inside origin_path; a bare file name would be
        # resolved against the current working directory instead.
        if os.path.isdir(src):
            continue

        with open(src) as f:
            soup = BeautifulSoup(f.read(), "lxml")

        ta = soup.select("div.a-info.clearfix time")
        na2 = soup.select("#a-body > p:nth-child(-n+3)")
        if not ta or not na2:
            continue

        t = parse_time(ta[0].string)

        # Stop at the first paragraph that contains a "(...)" group, even if
        # the ticker extracted from it turns out to be empty.
        n2 = None
        for n in na2:
            nt = n.text.strip()
            if not nt:
                continue

            ns = re.findall(r'[(](.*?)[)]', nt)
            if not ns:
                continue

            n2 = ns[0].split(":")[-1].strip()
            break

        if not n2:
            continue

        key = t + "_" + n2
        shutil.copyfile(src, os.path.join(html_path, key + ".html"))

  0%|          | 0/4 [00:00<?, ?it/s]

### Process HTMLs

In [5]:
# Parse every renamed transcript ("<date>_<ticker>.html") into a JSON file with
# the call date, the ticker, the participant list and the speeches in order.
files = os.listdir(html_path)
not_found_rdq = []

# open(..., "w") does not create missing folders, so make sure the target exists.
os.makedirs(json_path, exist_ok=True)

for file in tqdm(files):
    src = os.path.join(html_path, file)
    # Test the path inside html_path; a bare file name would be resolved
    # against the current working directory instead.
    if os.path.isdir(src):
        continue

    rdq_key = file.split(".")[0]
    date, ticker = rdq_key.split("_")
    ret = {
        "date": date,
        "ticker": ticker,
        "participant": [],
        "transcript": []
    }
    # key    -> text of the most recent heading paragraph (one containing <strong>)
    # values -> plain paragraphs collected since that heading
    # tp     -> "pre" until the Q&A marker is reached, "qa" afterwards
    key, values, tp = None, [], "pre"

    with open(src) as f:
        soup = BeautifulSoup(f.read(), "lxml")

    ps = soup.select("div#a-body p")
    for p in ps:
        if p.get_text().strip() == "Question-and-Answer Session":
            # Flush what was collected so far and switch to the Q&A part.
            ret["transcript"].append({"name": key, "speech": values, "type": tp})
            tp = "qa"
            key = None
            continue

        if p.find("strong"):
            # A new heading with nothing pending: just start collecting under it.
            if not key or not values:
                values = []
                key = p.get_text().strip()
                continue

            if key == "Executives" or key == "Analysts":
                # Participant lines are split on " - " into name and description
                # (en dashes are normalised to hyphens first).
                for v in values:
                    items = v.replace("–", "-").split(" - ")
                    n, d = items[0], "-".join(items[1: ])
                    ret["participant"].append({"name": n, "description": d, "position": key})
            elif key == "Question-and-Answer Session":
                tp = "qa"
            else:
                ret["transcript"].append({"name": key, "speech": values, "type": tp})
            values = []
            key = p.get_text().strip()
        else:
            v = p.get_text()
            v = v.replace("[", "").replace("]", "").strip()
            if v:
                values.append(v)

    # Flush the last speaker, which no following heading closes.
    if key and values:
        ret["transcript"].append({"name": key, "speech": values, "type": tp})

    fname = date + "_" + ticker + ".json"
    with open(os.path.join(json_path, fname), "w", encoding="utf-8") as f:
        json.dump(ret, f)

  0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Show the contents of not_found_rdq, which the processing cell above initialises as an empty list.
print(not_found_rdq)

[]


### Encode Text

In [23]:
def load_glove(file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each line is expected to hold a word followed by its space-separated
    float components.

    Args:
        file: Path of the GloVe ``.txt`` file.

    Returns:
        dict mapping every word to a 1-D ``numpy`` array of floats.
    """
    print("Loading Glove Model")
    glove = {}
    # The context manager closes the handle (the file is several GB), and the
    # explicit encoding keeps decoding independent of the platform default.
    with open(file, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            # Split on the single-space delimiter only; a bare split() would
            # also break on any other whitespace character inside a token.
            split_lines = line.split(" ")
            word = split_lines[0]
            word_embedding = np.array([float(value) for value in split_lines[1:]])
            glove[word] = word_embedding

    print(len(glove), " words loaded!")
    return glove

In [14]:
def clean_text(s):
    """Lower-case a sentence and return its tokens without English stop words.

    Every run of non-word characters and every underscore is treated as a
    separator. Empty tokens are never produced, so text made only of
    punctuation and/or stop words yields an empty list.

    Args:
        s: Raw sentence.

    Returns:
        list of lower-cased tokens.
    """
    s = s.lower()
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    s = re.sub(r"\W+", " ", s).replace("_", " ")
    # A set makes each membership test O(1) instead of a scan of the word list.
    stop_words = set(stopwords.words("english"))
    # split() without arguments collapses repeated spaces and drops the empty
    # strings that split(" ") leaves at the edges of the text.
    return [w for w in s.split() if w not in stop_words]

In [15]:
def get_embedding(data, vectors):
    """Collect the vocabulary covered by ``vectors`` and its embedding rows.

    Only tokens that have an entry in ``vectors`` are counted.

    Args:
        data: dict whose values hold a ``"transcript"`` list; each entry has a
            ``"speech"`` list of token sequences.
        vectors: mapping from token to its embedding vector.

    Returns:
        Tuple ``(vocab, word2idx, W, max_sen_len, max_p_len)``:
        ``vocab`` maps each known token to its number of occurrences;
        ``word2idx`` maps each token to its row in ``W``, starting at 0 (there
        is no padding row); ``W`` is the list of vectors in that order;
        ``max_sen_len`` is the largest number of known tokens in one sentence;
        ``max_p_len`` is the largest number of known tokens in one transcript
        entry.
    """
    vocab, max_sen_len, max_p_len = {}, 0, 0
    for key in data:
        for t in data[key]["transcript"]:
            # Seeded with 0 so max() also works for an entry with no sentences.
            sen_len = [0]
            for s in t["speech"]:
                known = [w for w in s if w in vectors]
                for w in known:
                    vocab[w] = vocab.get(w, 0) + 1
                sen_len.append(len(known))
            max_sen_len = max(max_sen_len, max(sen_len))
            max_p_len = max(max_p_len, sum(sen_len))

    # The index and the embedding list are built once; the earlier padded
    # variant was immediately overwritten and never reached the caller.
    idx2words = list(vocab)
    word2idx = {w: i for i, w in enumerate(idx2words)}
    W = [vectors[w] for w in idx2words]
    return vocab, word2idx, W, max_sen_len, max_p_len

In [16]:
def read_files(path):
    """Load every JSON file in ``path`` into a dict keyed by file name.

    Args:
        path: Folder containing the ``<date>_<ticker>.json`` files.

    Returns:
        dict mapping the file name without ``.json`` to the decoded content.
        Sub-directories of ``path`` are ignored.
    """
    data = {}

    for file in os.listdir(path):
        full_path = os.path.join(path, file)
        # Test the entry inside `path`; a bare name would be resolved against
        # the current working directory, letting real sub-directories through.
        if os.path.isdir(full_path):
            continue

        key = file.replace(".json", "")
        # These two transcripts are excluded explicitly (reason not recorded here).
        if key == "20180504_SM" or key == "20150729_BEN":
            continue

        with open(full_path, "r") as f:
            data[key] = json.load(f)

    return data

In [31]:
def get_label(price_file, data):
    """Attach log-standard-deviation labels of ``RETX`` to every transcript.

    For each ``"<YYYYMMDD>_<TICKER>"`` key, the price row of that security on
    that date is located. ``label[n]`` (n in 3, 7, 15, 30) is the log of the
    sample standard deviation (``ddof=1``) of ``RETX`` over the matching row
    and the ``n`` rows after it; ``label[-n]`` is the same over the ``n + 1``
    rows before it.

    NOTE(review): the windows are taken by row position around the matching
    row, which presumes the CSV is sorted by security and date - confirm.

    Args:
        price_file: CSV with ``TICKER``, ``PERMNO``, ``date`` (``MM/DD/YYYY``)
            and ``RETX`` columns.
        data: dict keyed by ``"<YYYYMMDD>_<TICKER>"``; updated in place.

    Returns:
        The same ``data`` dict. Keys whose ticker or call date is missing from
        the price file are reported with ``print`` and left without a
        ``"label"`` entry.
    """
    df = pd.read_csv(price_file)
    df = df.fillna(0)
    df["date"] = df["date"].apply(str)

    tic2id = {}
    for index, row in tqdm(df[["TICKER", "PERMNO"]].drop_duplicates().iterrows()):
        tic2id[row["TICKER"]] = row["PERMNO"]

    windows = (3, 7, 15, 30)

    for key in tqdm(data):
        date, tic = key.split("_")
        ndate = "{}/{}/{}".format(date[4:6], date[6:], date[:4])
        # Transcripts filed under WELL are looked up as HCN in the price file.
        if tic == "WELL":
            tic = "HCN"
        if tic not in tic2id:
            print(tic)
            continue

        matches = df[
            ((df["PERMNO"] == tic2id[tic]) | (df["TICKER"] == tic))
            & (df["date"] == ndate)
        ].index
        # No price row on the call date: report the key and keep going instead
        # of raising IndexError and losing the labels of all remaining keys.
        if len(matches) == 0:
            print(key)
            continue
        idx = matches[0]

        # .loc slices are inclusive on both ends: 31 rows before, 31 from idx on.
        pprice = list(reversed(df.loc[idx - 31 : idx - 1, "RETX"].tolist()))
        price = df.loc[idx : idx + 30, "RETX"].tolist()

        pprice = [float(p) for p in pprice]
        price = [float(p) for p in price]

        # Past windows first, then future ones, each using n + 1 observations.
        label = {-n: np.log(np.std(pprice[: n + 1], ddof=1)) for n in windows}
        label.update({n: np.log(np.std(price[: n + 1], ddof=1)) for n in windows})
        data[key]["label"] = label
    return data

In [33]:
def preprocess(data, glove):
    """Tokenise every speech in ``data`` and build the embedding vocabulary.

    Each sentence of each transcript entry is passed through ``clean_text``;
    sentences whose result is empty are dropped, and the entry's ``"speech"``
    list is replaced in place. The cleaned data is then handed to
    ``get_embedding`` together with ``glove``.

    Returns:
        ``(vocab, word2idx, W, max_sen_len, max_p_len, data)`` - the five
        values produced by ``get_embedding`` followed by the updated ``data``.
    """
    for record in data.values():
        for turn in record["transcript"]:
            tokenised = (clean_text(sentence) for sentence in turn["speech"])
            turn["speech"] = [tokens for tokens in tokenised if tokens]

    embedding_info = get_embedding(data, glove)
    vocab, word2idx, W, max_sen_len, max_p_len = embedding_info
    return vocab, word2idx, W, max_sen_len, max_p_len, data

In [24]:
# Load the GloVe vectors into a word -> vector dict; the unzipped .txt file is expected in the current folder.
glove = load_glove("glove.840B.300d.txt")

Loading Glove Model


0it [00:00, ?it/s]

2196016  words loaded!


In [34]:
# Build the dataset: load the parsed transcripts, tokenise them and collect
# the vocabulary, attach the price-based labels, then pickle everything.
data = read_files(json_path)
vocab, word2idx, W, max_sen_len, max_p_len, data = preprocess(data, glove)
# Use the configured price_file instead of repeating the literal file name,
# so changing the path in the configuration cell takes effect here as well.
data = get_label(price_file, data)
# NOTE: max_p_len is stored before max_sen_len, the reverse of preprocess's return order.
with open("data_sample.pkl", "wb") as f:
    pickle.dump([data, vocab, word2idx, W, max_p_len, max_sen_len], f)

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]