In [3]:
from src.vectorDatabase import get_earnings_all_quarters_data
from langchain.schema import Document
from src.secData import sec_main

In [4]:
def get_data(ticker: str, year: int, quarters: tuple = ("Q1", "Q2", "Q3")) -> list:
    """Collect earnings-call and SEC-filing documents for one ticker and year.

    Args:
        ticker: Ticker symbol forwarded to both the earnings-call loader and
            the SEC loader.
        year: Year forwarded to both loaders.
        quarters: Quarter labels to load earnings calls for, processed in the
            given order. Defaults to ``("Q1", "Q2", "Q3")``, i.e. the three
            quarters this function has always loaded. Any iterable of labels
            accepted by ``get_earnings_all_quarters_data`` may be supplied.

    Returns:
        The document list obtained by threading an initially empty list
        through ``get_earnings_all_quarters_data`` once per quarter, followed
        by one ``Document`` per section of every filing returned by
        ``sec_main``.
    """
    docs = []
    for quarter in quarters:
        print(f"Earnings Call {quarter}")
        # The loader also returns a speakers list; it is not used here.
        docs, _ = get_earnings_all_quarters_data(docs, quarter, ticker, year)

    print("SEC")
    section_texts = sec_main(ticker, year)

    for filings in section_texts:
        # The last element of each filing entry maps section name -> text;
        # elements 0..3 are read below as the filing's metadata fields.
        texts_dict = filings[-1]

        for section_name, text in texts_dict.items():
            docs.append(
                Document(
                    page_content=text,
                    metadata={
                        "accessionNumber": filings[0],
                        "filing_type": filings[1],
                        "filingDate": filings[2],
                        "reportDate": filings[3],
                        "sectionName": section_name,
                    },
                )
            )
    return docs

In [10]:
# Ticker symbol of the company whose documents are loaded and indexed below.
ticker = "AAPL"

In [5]:
# Load the earnings-call and SEC documents for the selected ticker and year.
docs = get_data(ticker, year=2023)

Earnings Call Q1
Earnings Call Q2
Earnings Call Q3
SEC
Started Scraping
Scraped
Started Extracting
Extracted


In [7]:
import colbert
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

In [8]:
# Plain passage texts, in the same order as `docs`; this list is what gets indexed.
collection = list(map(lambda doc: doc.page_content, docs))

In [11]:
# Index-build settings.
doc_maxlen = 300  # passages are truncated at 300 tokens
nbits = 2         # each embedding dimension is encoded with 2 bits
max_id = 10000    # NOTE(review): not referenced in the visible cells — confirm it is still needed

# Name under which the index is stored; it embeds the ticker and the bit width.
index_name = f'SEC.Earningcalls.{ticker}.{nbits}bits'

In [13]:
checkpoint = 'colbert-ir/colbertv2.0'

# nranks specifies the number of GPUs to use.
with Run().context(
    RunConfig(nranks=1, experiment='notebook')
):
    # kmeans_niters specifies the number of iterations of k-means clustering;
    # 4 is a good and fast default. Consider larger numbers for small datasets.
    config = ColBERTConfig(
        doc_maxlen=doc_maxlen,
        nbits=nbits,
        kmeans_niters=4,
    )

    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(
        name=index_name,
        collection=collection,
        overwrite=True,
    )

Downloading artifact.metadata:   0%|          | 0.00/1.63k [00:00<?, ?B/s]



[Feb 04, 16:42:21] #> Creating directory /home/athekunal/Finance Project/experiments/notebook/indexes/SEC.Earningcalls.AAPL.2bits 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 400000,
    "save_every": null,
    "warmup": 20000,
    "warmup_bert": null,
    "relu": false,
    "nway": 64,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300,
    "mask_punctuatio

Downloading config.json: 100%|██████████| 743/743 [00:00<00:00, 5.46MB/s]
Downloading model.safetensors: 100%|██████████| 438M/438M [00:27<00:00, 15.9MB/s] 
Downloading tokenizer_config.json: 100%|██████████| 405/405 [00:00<00:00, 2.88MB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 4.65MB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 10.4MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 941kB/s]


[Feb 04, 16:42:55] [0] 		 # of sampled PIDs = 226 	 sampled_pids[:3] = [106, 187, 2]
[Feb 04, 16:42:55] [0] 		 #> Encoding 226 passages..
[Feb 04, 16:42:59] [0] 		 avg_doclen_est = 103.1460189819336 	 len(local_sample) = 226
[Feb 04, 16:42:59] [0] 		 Creating 2,048 partitions.
[Feb 04, 16:42:59] [0] 		 *Estimated* 23,311 embeddings.
[Feb 04, 16:42:59] [0] 		 #> Saving the indexing plan to /home/athekunal/Finance Project/experiments/notebook/indexes/SEC.Earningcalls.AAPL.2bits/plan.json ..




Clustering 22146 points in 128D to 2048 clusters, redo 1 times, 4 iterations
  Preprocessing in 0.00 s
  Iteration 3 (0.07 s, search 0.06 s): objective=4118.64 imbalance=1.503 nsplit=0       
[Feb 04, 16:43:01] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


In [None]:
# Optional: as the cell's last expression, this displays the absolute path of the index.
indexer.get_index()

In [None]:
# The index is referenced here by its relative name (i.e., not a full path),
# so RunConfig must be given the same experiment value that was used for indexing.
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(
        index=index_name,
        collection=collection,
    )


# If you want to customize the search latency--quality tradeoff, you can also supply a
# config=ColBERTConfig(ncells=.., centroid_score_threshold=.., ndocs=..) argument.
# The default settings with k <= 10 (1, 0.5, 256) give the fastest search,
# but you can get a more extensive search by setting larger values of k or
# manually specifying more conservative ColBERTConfig settings (e.g. (4, 0.4, 4096)).

In [None]:
# Example question; try an in-range query or supply your own.
query = "What did Apple talk about Generative AI?"
print(f"#> {query}")

# Retrieve the top-3 passages for this query.
results = searcher.search(query, k=3)

# Each zipped entry is unpacked as (passage id, rank, score); print one line per passage.
for hit in zip(*results):
    passage_id, passage_rank, passage_score = hit
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")