### Task #2 - GenAI

Dataset
* Year: 2018 - 2020
* Filing type: 10K
* Sections: All
* Company: choose 1
* Choose 5 data attributes to extract from a single year.
  
Steps
* Convert the documents to chunks,
* Convert the chunks into embeddings,
* Create a query
* Create a prompt to extract data from chunks from a specific year
* Create a validation dataset (5 true values from chunks)
* Demonstrate that your LLM can retrieve the correct chunks from your embedding object for the correct year


In [3]:
# Import Libraries

from datasets import load_dataset
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import random
import pickle
import json
import pprint
import os

from transformers import AutoTokenizer

random.seed(42)
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set working Directory
# import os
# current_path = os.getcwd()
# current_path

In [4]:
# Set RooT Dir
# ROOT_DIR ="/Users/Vader/Documents/Notebook_GenAI_Projects/AIG_Assessment"

Datafile : https://huggingface.co/datasets/eloukas/edgar-corpus

In [4]:
# Due to resource constraints, files are manually downloaded and filtered

# Define a function to load a JSONL file
def load_jsonl(filepath):
    """
    Read a JSON Lines file into a list of parsed records.

    Blank lines are ignored. Lines that are not valid JSON are skipped
    (best-effort load), and a single warning reports how many were dropped so
    that data loss does not go unnoticed.

    :param filepath: Path to the .jsonl file (one JSON document per line).
    :return: List of parsed objects, in file order.
    :raises OSError: If the file cannot be opened (e.g. it does not exist).
    """
    import warnings  # local import keeps this cell runnable on its own

    data = []
    skipped = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank line: nothing to parse, not an error
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1  # skip the malformed line, but remember it
    if skipped:
        warnings.warn(f"{filepath}: skipped {skipped} malformed line(s).")
    return data

# List of file names for the years 2018, 2019, 2020
file_list = ['train_2018.jsonl', 'train_2019.jsonl', 'train_2020.jsonl']

# Load data from all files into a single list.
# Missing files only produce a warning, so all_data may end up empty.
all_data = []
for file in file_list:
    if os.path.exists(file):
        data = load_jsonl(file)
        all_data.extend(data)
    else:
        print(f"Warning: {file} not found.")

# View the first record to inspect the structure.
# Guarded: indexing an empty list would raise IndexError when no file was found.
if all_data:
    pprint.pprint(all_data[0])
else:
    print("No records were loaded; check that the JSONL files are in the working directory.")

# scrolling feature: inject a script that strips the 'output_scroll' class from
# the page's elements.
# NOTE(review): the snippet uses `$`, so it presumably needs jQuery on the page
# (classic Notebook front end) — confirm it has any effect in your environment.
from IPython.display import display, HTML
js = "<script>$('.output_scroll').removeClass('output_scroll')</script>"
display(HTML(js))

{'cik': '1566373',
 'filename': '1566373_2018.htm',
 'section_1': 'Item 1. Business\n'
              'Overview\n'
              'We are a clinical-stage biopharmaceutical company engaged in '
              'the discovery and development of a novel class of therapeutics '
              'for the treatment of viral infections, inflammatory diseases '
              'and certain cancers using our proprietary small molecule '
              'nucleotide platform. We design our compounds to selectively '
              'target and modulate the activity of specific proteins '
              'implicated in various disease states. We are developing our '
              'lead product candidate, inarigivir soproxil, or inarigivir, for '
              'the treatment of chronic hepatitis B virus, or HBV. We have '
              'designed our antiviral product candidates, including '
              'inarigivir, to selectively activate within infected hepatic '
              'cells the cellular protein, ret

In [44]:
# Load 1 year (2020) and filter by 1 company

# define global params
YEAR = "2020"
data_file = f"train_{YEAR}.jsonl"   # derived from YEAR so the two cannot drift apart
COMPANY_CIK = "789019"   # for Microsoft
ATTRIBUTES = ["filename", "cik", "year", "section_1", "section_7"]    # features selected


# Load the data.
# Reuse load_jsonl so this cell reads the file the same way as the earlier
# multi-year load: explicit UTF-8 decoding (the filings contain non-ASCII
# characters such as curly apostrophes) and malformed lines skipped.
data_records = load_jsonl(data_file)

print(f"Loaded {len(data_records)} records from {data_file}.")

Loaded 5480 records from train_2020.jsonl.


In [41]:
# Keep only the filings whose 'cik' equals COMPANY_CIK (Microsoft).
# Records without a 'cik' key compare as "" and are therefore dropped.
filtered_records = list(
    filter(lambda rec: rec.get("cik", "") == COMPANY_CIK, data_records)
)

print(f"Filtered records for CIK '{COMPANY_CIK}': {len(filtered_records)} records found.")

Filtered records for CIK '789019': 1 records found.


In [42]:
# Extract the Selected Attributes and Create Text Chunks

# Define a helper function to extract our five chosen attributes from a record.
def extract_attributes(record, attributes=None):
    """
    Project a filing record onto a chosen set of attributes.

    :param record: Mapping holding one filing (must support ``.get``).
    :param attributes: Optional iterable of keys to keep. When omitted, the
        notebook-level ``ATTRIBUTES`` list is used.
    :return: Dict with one entry per requested attribute; attributes that are
        missing from the record map to None.
    """
    if attributes is None:
        attributes = ATTRIBUTES  # fall back to the notebook-wide selection
    return {attr: record.get(attr) for attr in attributes}

extracted_data = list(map(extract_attributes, filtered_records))

# For demonstration, we work with the first record from the filtered list;
# bail out early when the company has no filing for the chosen year.
if not extracted_data:
    raise ValueError(f"No records found for CIK {COMPANY_CIK} in year {YEAR}")
sample_record = extracted_data[0]


# Define a function to split a long text into overlapping chunks.
def chunk_text(text, chunk_size=200, overlap=20):
    """
    Splits the input text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing chunk
    made up solely of words already present in the previous chunk is emitted.

    :param text: The string to split.
    :param chunk_size: Maximum number of words per chunk (must be > 0).
    :param overlap: Number of words to overlap between consecutive chunks
        (must satisfy 0 <= overlap < chunk_size).
    :return: List of chunk strings; empty when the text contains no words.
    :raises ValueError: If chunk_size or overlap is out of range. Without this
        check a non-positive step would make the loop never terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail of the text; another window
            # would only repeat the overlapping words.
            break
    return chunks

# For demonstration, only one section ('section_1') is chunked.
# A missing or empty section is treated as a hard error.
section_text = sample_record.get("section_1", "")
if not section_text:
    raise ValueError("No text found in 'section_1' of the sample record.")

text_chunks = chunk_text(section_text, chunk_size=200, overlap=20)
print(f"Section 'section_1' was split into {len(text_chunks)} chunks.")


Section 'section_1' was split into 35 chunks.


In [45]:
# Convert Text Chunks to Embeddings

# Embedding model: a pre-trained SentenceTransformer checkpoint.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all chunks in one call; row i of the result belongs to text_chunks[i].
chunk_embeddings = model.encode(text_chunks, show_progress_bar=True)

# Pair every chunk with its embedding and the metadata used later on.
chunks_data = [
    {
        "chunk_id": idx,
        "text": piece,
        "embedding": vector,
        "year": YEAR,
        "cik": COMPANY_CIK,
        "source_section": "section_1"
    }
    for idx, (piece, vector) in enumerate(zip(text_chunks, chunk_embeddings))
]

print(f"Converted {len(chunks_data)} text chunks into embeddings.")

Batches: 100%|████████████████████████████████████| 2/2 [00:00<00:00, 20.86it/s]

Converted 35 text chunks into embeddings.





In [46]:
# Create a Query and an Extraction Prompt

# Define a sample query string that might represent a user or system query.
# It is embedded later with the same model as the chunks and used to rank
# them by cosine similarity.
query = "Retrieve details from the filing regarding business operations and key company identifiers."

# Build a prompt to simulate sending to an LLM.
def create_extraction_prompt(chunk_text, metadata=None):
    """
    Build the extraction prompt for one filing chunk.

    The prompt asks for filename, CIK and year, which are record-level fields
    and are not part of the chunk text itself. Pass them via ``metadata`` so
    the model has something to extract them from.

    Note: inside this function the parameter name ``chunk_text`` shadows the
    notebook-level ``chunk_text`` function; the name is kept for
    compatibility with existing callers.

    :param chunk_text: Text of the chunk to embed in the prompt.
    :param metadata: Optional mapping (e.g. filename / cik / year) rendered as
        a "Filing Metadata" block ahead of the chunk. When omitted or empty the
        prompt is identical to the metadata-free version.
    :return: The prompt string.
    """
    instructions = (
        "Extract the following details from the filing chunk:\n"
        "1. Filename\n"
        "2. CIK\n"
        "3. Year\n"
        "4. Key content from section_1\n"
        "5. A brief summary of section_7 if available (if not, skip it)\n\n"
    )

    context = ""
    if metadata:
        context = (
            "Filing Metadata:\n"
            + "".join(f"{key}: {value}\n" for key, value in metadata.items())
            + "\n"
        )

    prompt = (
        instructions
        + context
        + "Document Chunk:\n" + chunk_text + "\n\n"
        "Provide your answer in JSON format with keys: filename, cik, year, section_1_summary, section_7_summary."
    )
    return prompt

# Build a sample prompt from the text of the first chunk and show it.
demo_prompt = create_extraction_prompt(chunks_data[0]["text"])
print("Example extraction prompt:", demo_prompt, sep="\n")


Example extraction prompt:
Extract the following details from the filing chunk:
1. Filename
2. CIK
3. Year
4. Key content from section_1
5. A brief summary of section_7 if available (if not, skip it)

Document Chunk:
Item 1 Investing in Digital Skills With a continued focus on digital transformation, Microsoft is making efforts to help ensure that no one is left behind, particularly as economies start to recover from the COVID-19 pandemic. We are expanding access to the digital skills that have become increasingly vital to many of the world’s jobs, and especially to individuals hardest hit by recent job losses, including those with lower incomes, women, and underrepresented minorities. Our skills initiative brings together learning resources, certification opportunities, and job-seeker tools from LinkedIn, GitHub, and Microsoft Learn, and is built on data insights drawn from LinkedIn’s Economic Graph. This is combined with $20 million we are investing in key non-profit partnerships thr

In [48]:
# Create a Validation Dataset from 5 Random Chunks

# For validation, randomly select 5 chunks from our embedded chunks.
# A dedicated, freshly seeded generator makes this cell idempotent: sampling
# from the global `random` state would give a different sample on every re-run
# of the cell.
N_VALIDATION = 5
VALIDATION_SEED = 42
validation_rng = random.Random(VALIDATION_SEED)

if len(chunks_data) < N_VALIDATION:
    # Fewer chunks than requested: use all of them, in their original order.
    validation_chunks = chunks_data
else:
    validation_chunks = validation_rng.sample(chunks_data, N_VALIDATION)

# Create a DataFrame to display a preview of these validation chunks.
validation_data = pd.DataFrame([
    {
        "chunk_id": chunk["chunk_id"],
        "text_preview": chunk["text"][:100] + "...",
        "year": chunk["year"],
        "cik": chunk["cik"],
        "source_section": chunk["source_section"]
    } for chunk in validation_chunks
])

print("Validation dataset (5 randomly selected chunks):")
display(validation_data)


Validation dataset (5 randomly selected chunks):


Unnamed: 0,chunk_id,text_preview,year,cik,source_section
0,7,including Premier Support Services and Microso...,2020,789019,section_1
1,1,and rational allocation of resources within bu...,2020,789019,section_1
2,17,"services, and continued strong exclusive conte...",2020,789019,section_1
3,15,other devices. Xbox Live is designed to benefi...,2020,789019,section_1
4,14,"Game Studios, a collection of first-party stud...",2020,789019,section_1


In [49]:
# Retrieve Chunks Based on a Query Using Cosine Similarity

# Compute the embedding for the query.
# encode() is given a one-element list, so [0] picks out the single result.
query_embedding = model.encode([query])[0]

# Function to retrieve top N chunks similar to the query embedding.
def retrieve_similar_chunks(query_emb, chunks, top_n=3, year=None):
    """
    Return the chunks most similar to a query embedding, best match first.

    :param query_emb: Embedding vector of the query.
    :param chunks: List of chunk dicts, each holding an "embedding" entry.
    :param top_n: Maximum number of chunks to return. Values <= 0 yield an
        empty list (a plain ``[-top_n:]`` slice would return every chunk for
        top_n == 0, because ``[-0:]`` is the same as ``[0:]``).
    :param year: Optional filing year. When given, only chunks whose "year"
        entry matches (compared as strings) are considered.
    :return: List of ``(chunk, similarity)`` tuples sorted by descending cosine
        similarity; empty when there is nothing to rank.
    """
    if year is not None:
        chunks = [chunk for chunk in chunks if str(chunk.get("year")) == str(year)]

    # Nothing requested or nothing to score: avoid passing an empty matrix on.
    if top_n <= 0 or len(chunks) == 0:
        return []

    embeddings = np.array([chunk["embedding"] for chunk in chunks])
    similarities = cosine_similarity([query_emb], embeddings)[0]
    # argsort is ascending: take the last top_n indices and reverse them.
    top_indices = similarities.argsort()[-top_n:][::-1]
    return [(chunks[i], similarities[i]) for i in top_indices]

# Retrieve the top 3 chunks matching the query.
# Each result is a (chunk dict, similarity score) pair, best match first.
retrieved_chunks = retrieve_similar_chunks(query_embedding, chunks_data, top_n=3)

print("Retrieved chunks based on query similarity:")
for chunk, score in retrieved_chunks:
    print(f"\nChunk ID: {chunk['chunk_id']}, Similarity Score: {score:.4f}")
    # Only the first 200 characters are shown to keep the output short.
    print("Text preview:", chunk['text'][:200], "...")


Retrieved chunks based on query similarity:

Chunk ID: 33, Similarity Score: 0.5139
Text preview: INFORMATION Our Internet address is www.microsoft.com. At our Investor Relations website, www.microsoft.com/investor, we make available free of charge a variety of information for investors. Our goal  ...

Chunk ID: 32, Similarity Score: 0.3632
Text preview: corporate finance organization. Ms. Hood also serves on the Board of Directors of 3M Corporation. Mr. Smith was appointed President and Chief Legal Officer in September 2015. He served as Executive Vi ...

Chunk ID: 1, Similarity Score: 0.3181
Text preview: and rational allocation of resources within businesses. Additional information on our operating segments and geographic and product information is contained in Note 19 - Segment Information and Geogra ...
