# Preparations:

In [6]:
# Utils: 
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
# Torch: 
from torch.utils.data import Dataset, DataLoader
import torch
# NLP: 
from transformers import BertTokenizer, BertModel
from collections import defaultdict

In [5]:
# Choose the compute device up front: CUDA when torch reports one, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained uncased BERT: the tokenizer, and the encoder placed on `device`
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to(device)

In [1]:
# Raw call statements for both splits, read through the project's `ncs` module
import ncs

train_call_statements, test_call_statements = (
    ncs.load_call_statements(split) for split in ('train', 'test')
)

# Data loading & Embedding

In [11]:
class TextDataset(Dataset):
    '''
    Map-style `torch.utils.data.Dataset` over a sequence of `(doc_id, text)` pairs.
    '''
    def __init__(self, texts):
        # Stored by reference (no copy); every entry must unpack into exactly two values
        self.texts = texts

    def __len__(self) -> int:
        # Number of (doc_id, text) pairs held
        return len(self.texts)

    def __getitem__(self, idx):
        # Unpack and re-pack so a malformed entry fails here rather than downstream
        pair_id, pair_text = self.texts[idx]
        return (pair_id, pair_text)

def sliding_window(text, stride, max_len, doc_id):
    '''
    Segment `text` into overlapping chunks of at most `max_len` words.

    The text is lower-cased and split on whitespace, so `max_len` and `stride`
    count whitespace-separated words (not tokenizer sub-word tokens).
    Consecutive chunks share `stride` words, i.e. each chunk starts
    `max_len - stride` words after the previous one.

    Args:
        text: the document text to segment.
        stride: number of words shared by two consecutive chunks; must be smaller than `max_len`.
        max_len: maximum number of words per chunk.
        doc_id: identifier attached to every chunk produced from `text`.

    Returns:
        A list of `(doc_id, chunk_text)` tuples in document order; empty when
        `text` contains no words.

    Raises:
        ValueError: if `stride` is not smaller than `max_len` (the window could never advance).
    '''
    stride_len = max_len - stride
    if stride_len <= 0:
        raise ValueError(
            f"stride ({stride}) must be smaller than max_len ({max_len})"
        )

    tokens = text.lower().split()
    chunks = []
    for start in range(0, len(tokens), stride_len):
        chunks.append((doc_id, " ".join(tokens[start: start + max_len])))
        # Once a window reaches the end of the text, any later window would be a
        # pure suffix of it; emitting those would over-weight the tail when the
        # chunk embeddings are averaged per document.
        if start + max_len >= len(tokens):
            break
    return chunks

def create_dataset(texts:list, doc_ids:list, stride:int=50, max_len:int=510) -> Dataset:
    '''
    Build a `TextDataset` of overlapping text chunks from parallel lists.

    Each text is segmented with `sliding_window` and every resulting chunk is
    tagged with the document id found at the same position in `doc_ids`.

    Args:
        texts: document texts.
        doc_ids: document identifiers, one per entry of `texts`.
        stride: number of words shared by consecutive chunks (forwarded to `sliding_window`).
        max_len: maximum number of words per chunk (forwarded to `sliding_window`).

    Returns:
        A `TextDataset` whose items are `(doc_id, chunk_text)` pairs.

    Raises:
        ValueError: if `texts` and `doc_ids` differ in length.
    '''
    # zip() would silently drop the unmatched tail and leave documents without features
    if len(texts) != len(doc_ids):
        raise ValueError(
            f"texts and doc_ids must have the same length, got {len(texts)} and {len(doc_ids)}"
        )

    chunked_texts = []
    for text, doc_id in zip(texts, doc_ids):
        chunked_texts.extend(sliding_window(text, stride=stride, max_len=max_len, doc_id=doc_id))
    return TextDataset(chunked_texts)

def generate_embeddings(tokenizer, model, dataset, device, *, batch_size=32, num_workers=4) -> dict:
    '''
    Embed every chunk in `dataset` and average the chunk embeddings per document.

    Each batch is tokenized, run through `model` without gradient tracking, and
    the hidden state at sequence position 0 (the CLS token) is taken as the
    chunk embedding. All chunk embeddings sharing a doc id are then averaged.

    Args:
        tokenizer: callable returning a mapping with 'input_ids' and 'attention_mask' tensors.
        model: encoder whose output exposes `last_hidden_state`.
        dataset: dataset yielding `(doc_id, text)` pairs.
        device: torch device the inputs are moved to; `model` is expected to already be on it.
        batch_size: number of chunks per forward pass.
        num_workers: number of DataLoader worker processes.

    Returns:
        Dict mapping each doc id to the mean of its chunk embeddings (numpy array).
        Documents that contributed no chunk do not appear in the result.
    '''
    # Mapping from doc_id to a list of chunk embeddings
    embeddings = defaultdict(list)

    # Set model to eval mode
    model.eval()

    # Create the DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    # Process the data in batches
    for doc_ids, texts in tqdm(dataloader):
        tokenized_texts = tokenizer(texts, truncation=True, padding='longest', return_tensors='pt')

        input_ids = tokenized_texts['input_ids'].to(device)
        attention_mask = tokenized_texts['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # CLS token embeddings, moved to host memory once per batch rather than once per row
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        # Add the embeddings to our mapping
        for doc_id, embedding in zip(doc_ids, cls_embeddings):
            # The DataLoader's default collation can hand numeric ids back as tensor
            # elements; tensors hash by identity, so unwrap them to plain Python
            # values or chunks of the same document would never be grouped.
            key = doc_id.item() if torch.is_tensor(doc_id) else doc_id
            embeddings[key].append(embedding)

    # Finally, compute the mean embedding for each document
    return {doc_id: np.mean(chunks, axis=0) for doc_id, chunks in embeddings.items()}

In [None]:

def _bert_feature_frame(texts, doc_ids):
    '''
    Embed `texts` and return one row of BERT features per entry of `doc_ids`.

    Columns are named bert0, bert1, ...; ids for which no embedding was
    produced (e.g. a text with no words yields no chunk) get an all-zero row.
    '''
    doc_vectors = generate_embeddings(tokenizer, model, create_dataset(texts, doc_ids), device)
    # The dict maps doc id -> vector, so transpose to get documents as rows
    frame = pd.DataFrame(doc_vectors).T
    frame.columns = [f'bert{i}' for i in range(len(frame.columns))]
    return frame.reindex(doc_ids, fill_value=0)


# Plain Python lists of texts and statement ids for each split
train_texts = train_call_statements.text.to_list()
train_doc_ids = train_call_statements.statement_uid.to_list()
test_texts = test_call_statements.text.to_list()
test_doc_ids = test_call_statements.statement_uid.to_list()

# Output folders for the per-split feature files
for split_dir in ('features/train', 'features/test'):
    os.makedirs(split_dir, exist_ok=True)

# Encode the training data
train_bert_features = _bert_feature_frame(train_texts, train_doc_ids)
train_bert_features.to_parquet('features/train/bert.parquet')

# Encode the testing data
test_bert_features = _bert_feature_frame(test_texts, test_doc_ids)
test_bert_features.to_parquet('features/test/bert.parquet')


# Model training: 

1. **Role Weights**: These are the weights we assign to the role of the person speaking the statement. The roles can be `CEO`, `CFO`, `Senior Manager`, and `Other`. For instance, if we think that statements made by the CEO carry more weight or are more influential to the call, we can assign a higher weight to the `CEO` role.

2. **Section Weights**: These weights are assigned to the different sections of the earnings call. The sections can be `qa` for the Question and Answer section, and `pres` for the Presentation section. If we think that statements made during the Question and Answer section are more revealing and influential, we can assign a higher weight to the `qa` section.

3. **Statement Type Weights**: These are the weights we assign to different types of statements. The types can be `A` for answers, `Q` for questions, `O` for operator statements, `P` for presenter remarks, and `U` for unknown statements. For example, if we feel that answers (`A`) and questions (`Q`) carry more information than the other types of statements, we can assign a higher weight to them.

4. By manipulating these weights, we can configure our model to focus on the parts of the earnings call that we believe are the most important. Feel free to try out different weight setups and observe how they influence the model's performance!

In [None]:
#@title Model Configuration { display-mode: "form" }
# Run configuration: classifier choice, feature sets, holding period and output locations.
# The `#@param` annotations bind the assignments below to Colab form widgets.

# Short alias -> classifier name. Not referenced anywhere else in the visible notebook.
model_mapping = dict(
    lr='logistic_regression', rf='random_forest', nn='neural_network'
)
# NOTE(review): this rebinds `model`, which the embedding cells use for the BERT encoder.
# Re-running an embedding cell after this one would pass this string to `generate_embeddings`.
model = 'random_forest' #@param ["logistic_regression", "random_forest", "neural_network"] {type:"string"}
# Feature-set names; each is later expanded to `features/<split>/<name>.parquet`.
features = ['bert']
# The form widget yields a string, so it is converted to an int right after.
holding_period = "10" #@param [1, 5, 10] {type:"string"}
holding_period = int(holding_period)

# Results sub-folder name, e.g. 'bert_10days' for the values above.
out_folder = '_'.join(features)+f'_{holding_period}days'
# Base directory that the feature and result paths are built from.
root_folder = '.'

In [None]:
# Weight per speaker role
role_weights = {'CEO': 1.0, 'CFO': 1.0, 'Senior Manager': 0.7, 'Other': 0.5}

# Weight per call section: `qa` = Question and Answer, `pres` = Presentation
section_weights = dict(qa=1.0, pres=0.5)

# Weight per statement type: A = answer, Q = question, O = operator,
# P = presenter remark, U = unknown
statement_type_weights = dict(A=1.0, Q=1.0, O=0.0, P=1.0, U=0.0)

In [None]:
# One parquet file per configured feature set, for each split
feature_train_files = [f'{root_folder}/features/train/{feature}.parquet' for feature in features]
feature_test_files = [f'{root_folder}/features/test/{feature}.parquet' for feature in features]

# Artifacts of this run are named after the chosen classifier
action_file = f'{root_folder}/results/{out_folder}/{model}_actions.csv'
model_file = f'{root_folder}/results/{out_folder}/{model}.pkl'

# The results folder must exist before anything is saved into it
os.makedirs(f'{root_folder}/results/{out_folder}', exist_ok=True)

# Train on the training-split features with the configured weights
train_kwargs = dict(
    feature_files=feature_train_files,
    classifier=model,
    role_weights=role_weights,
    section_weights=section_weights,
    statement_type_weights=statement_type_weights,
    holding_period=holding_period,
    save_model=model_file,
)
ncs.model_train(**train_kwargs)

# Make predictions:

In [None]:
# Run the saved model on the test-split feature files.
# Presumably the resulting actions are written to `action_file` - confirm against `ncs`.
ncs.model_inference(
    feature_files=feature_test_files, model_file=model_file, action_file=action_file
)

# Simulate strategies: 

In [None]:
# The portfolio file sits next to the other artifacts of this run
portfolio_file = f'{root_folder}/results/{out_folder}/{model}_portfolio.parquet'

# Simulate the strategy from the action file over the configured holding period
strategy_kwargs = dict(
    action_file=action_file,
    holding_period=holding_period,
    save_portfolio_path=portfolio_file,
)
ncs.run_strategy(**strategy_kwargs)

In [None]:
# Benchmark run using the 'random' strategy over the same holding period
ncs.demo_benchmark(
    strategy='random',
    holding_period=holding_period,
)

In [None]:
# Report on this run's action and portfolio files, labelled with the classifier name
report_kwargs = dict(
    actions=action_file,
    holding_period=holding_period,
    portfolio=portfolio_file,
    model_name=f'BERT-Embedding {model}',
)
ncs.report_strategy_analysis(**report_kwargs)