In [None]:
%pip install transformers torch einops
%pip install numpy==1.24.1
%pip install rank_bm25

In [1]:
import re
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
docs =["""
The History and Impact of Artificial Neural Networks

Artificial Neural Networks (ANNs) represent a fundamental shift in how we approach computation and artificial intelligence. Inspired by biological neural networks, these systems have evolved from simple perceptrons in the 1950s to today's sophisticated deep learning architectures.

Early Development (1940s-1950s):
The first artificial neuron was proposed by Warren McCulloch and Walter Pitts in 1943. Their mathematical model showed how neurons might work, demonstrating that simple neural networks could compute basic logical functions. In 1957, Frank Rosenblatt developed the perceptron, the first algorithm that could learn specific patterns through iterative training.

The AI Winter (1970s):
Despite early promise, neural network research faced significant setbacks in the 1970s. Marvin Minsky and Seymour Papert's 1969 book "Perceptrons" highlighted fundamental limitations of single-layer networks, particularly their inability to solve the XOR problem. This led to reduced funding and interest in neural network research, a period known as the "AI Winter."

Renaissance (1980s-1990s):
The field experienced a revival with several breakthrough developments:
1. The backpropagation algorithm became widely recognized as a solution for training multi-layer networks
2. Improvements in computer processing power made larger networks feasible
3. New architectures like Convolutional Neural Networks (CNNs) emerged
4. Successful applications in pattern recognition and speech processing demonstrated practical value

Modern Era (2000s-Present):
The explosion of big data and computational power has led to remarkable achievements:
- Deep learning models have surpassed human performance in various tasks
- Applications range from computer vision to natural language processing
- Transfer learning has enabled more efficient model training
- Architectures like transformers have revolutionized language models

Technical Foundations:

Neural networks consist of interconnected layers of nodes, each performing weighted calculations:
1. Input Layer: Receives raw data
2. Hidden Layers: Process information through weighted connections
3. Output Layer: Produces final results

Key concepts include:
- Activation functions (ReLU, sigmoid, tanh)
- Weight initialization and adjustment
- Loss functions and optimization algorithms
- Regularization techniques

Practical Applications:

Modern neural networks have found applications across numerous fields:
* Healthcare: Disease diagnosis, drug discovery, medical image analysis
* Finance: Risk assessment, fraud detection, algorithmic trading
* Transportation: Autonomous vehicles, traffic prediction, route optimization
* Entertainment: Content recommendations, game AI, art generation

Challenges and Future Directions:

Despite their success, neural networks face several ongoing challenges:
1. Interpretability and explainability of decisions
2. Energy consumption and computational requirements
3. Data privacy and ethical considerations
4. Robustness against adversarial attacks

Research continues in areas such as:
- More efficient architectures
- Unsupervised learning approaches
- Neuromorphic computing
- Integration with symbolic AI systems

The field of neural networks continues to evolve rapidly, with new architectures and applications emerging regularly. As our understanding of both biological and artificial neural networks deepens, we can expect further innovations in this transformative technology.  
    """,
    """The Crystal Songkeeper: A Tale of the Echoing Peaks
In the shadow of the Echoing Peaks, where ancient crystals hummed with forgotten melodies, lived a young apprentice named Lyra. Her small cottage, perched precariously on the mountainside, glowed with an ethereal blue light that pulsed in rhythm with the crystals scattered throughout the valley below. Every morning, she would wake to the harmonic resonance of the mountain's song, a symphony that had guided her people for countless generations.
"The crystals are growing restless," Lyra whispered to her mentor, Master Theron, one particularly bright morning. The old Songkeeper's eyes, as deep and blue as the crystals themselves, narrowed with concern. "Yes, young one. The Discord approaches." His weathered hands traced the intricate patterns carved into his staff, each line telling the story of songs past.
The Discord was no mere legend. Every thousand years, the crystal songs would begin to falter, their harmonies becoming discordant and chaotic. Without intervention, the dissonance would grow until it shattered every crystal in the valley, unleashing catastrophic energy that could reshape the very mountains themselves. According to the ancient texts, only a true Songkeeper could prevent this disaster by performing the Grand Harmony - a complex melody that would restore balance to the crystal network.
Lyra spent her days practicing the traditional songs, her voice carrying through the valley as she learned to match the precise frequencies of each crystal type. The basic crystals responded to simple melodies: the blue ones resonated with gentle lullabies, while the green crystals preferred more lively folk tunes. But the rare purple crystals, those were different. They required complex harmonies that few could master.
"Your technique has improved," Master Theron observed, watching as Lyra successfully activated a cluster of purple crystals. The crystals pulsed with a deep violet light, their energy synchronizing with her voice. "But the Grand Harmony requires more than just technical skill. It requires understanding the very essence of the crystal songs."
One evening, as the setting sun painted the crystals in brilliant hues of orange and pink, Lyra made a discovery. She noticed that when she sang to multiple crystals simultaneously, they didn't just resonate individually - they created entirely new harmonies between themselves. "Master Theron!" she called excitedly. "Listen to this!" She demonstrated her finding, causing three different crystal types to create an intricate counterpoint that none of them could produce alone.
The old Songkeeper's eyes widened in amazement. "In all my years..." he muttered, stroking his silver beard. "Child, you may have uncovered something that was lost to time. The ancient texts speak of the Crystal Chorus - the ability to weave multiple crystal songs into a single, greater harmony."
Over the next few weeks, Lyra devoted herself to exploring this new technique. She mapped the relationships between different crystal types, creating complex musical patterns that hadn't been heard in centuries. The valley seemed to come alive with her experiments, the crystals glowing brighter and more vibrantly than anyone could remember.
But time was running short. The signs of the approaching Discord grew more obvious each day. Crystal formations that had sung the same melodies for generations began to waver, their songs becoming erratic and unpredictable. Small fissures appeared in some of the larger crystals, sending discordant notes echoing through the valley.
When the day of the Discord finally arrived, the sky darkened with ominous clouds that swirled above the highest peaks. The crystals throughout the valley pulsed with irregular rhythms, their usual harmonious songs degrading into chaos. Master Theron looked at Lyra with a mix of pride and concern. "It's time," he said simply.
Lyra took her position in the Crystal Amphitheater, an ancient structure formed from crystalline formations that spiraled up the mountainside. As she began to sing, she didn't follow the traditional Grand Harmony that had been passed down through generations. Instead, she used her understanding of the Crystal Chorus to weave together the songs of every crystal type in the valley.
The effect was immediate and extraordinary. Waves of colored light rippled through the crystal networks, each formation adding its voice to her song. The discordant energies that had been building for months began to shift and change, finding new patterns within her complex harmony. Even the oldest and largest crystals, which had remained dormant for centuries, awakened to join her symphony.
Master Theron watched in awe as his apprentice conducted the largest Crystal Chorus ever attempted. "She's not just preventing the Discord," he realized. "She's transforming it into something new." The chaotic energies that threatened to shatter the crystals were being reformed into stable, beautiful harmonies that strengthened the very formations they had meant to destroy.
Hours passed as Lyra maintained the incredible performance, her voice never wavering as she guided the crystal energies into their new configuration. When the last notes finally faded away, the valley had been transformed. The crystals glowed with a steady, brilliant light, their songs clearer and stronger than ever before.
"You've done more than save the valley," Master Theron said, tears in his eyes. "You've elevated our art to heights we never imagined possible." Lyra smiled, exhausted but proud. She could feel the crystals humming contentedly, their songs now interwoven in ways that would prevent the Discord from ever threatening the valley again.
In the years that followed, Lyra's discovery of the Crystal Chorus revolutionized the art of crystal singing. She established a new school of Songkeeping that taught not just the individual songs of each crystal type, but the complex harmonies that could be created between them. The Echoing Peaks became known not just as a place of power, but as the birthplace of a new age of crystal harmony.
And on quiet evenings, when the sun's last rays caught the crystal formations just right, visitors to the valley would sometimes see a figure standing in the Crystal Amphitheater, her voice rising and falling with the eternal songs of the mountains. For Lyra had become more than just a Songkeeper - she had become part of the very melody of the peaks themselves, her legacy resonating through the crystals for generations to come.
"""]

# Utils

In [3]:
def mean_pooling(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Average the token embeddings of a segment, weighting each token by its mask value.

    Unbatched inputs are promoted to a batch of one. Only the pooled vector of the
    first batch row is returned.

    Args:
        token_embeddings: Token embeddings of shape [num_tokens, hidden_dim]
            (or [batch, num_tokens, hidden_dim])
        attention_mask: Attention mask of shape [num_tokens] (or [batch, num_tokens])

    Returns:
        Pooled embedding of shape [hidden_dim]
    """
    # Promote unbatched inputs to a batch of size one.
    embeddings = token_embeddings[None] if token_embeddings.dim() == 2 else token_embeddings
    mask = attention_mask[None] if attention_mask.dim() == 1 else attention_mask

    # Broadcast the mask over the hidden dimension so masked tokens contribute zero.
    weights = mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = (embeddings * weights).sum(dim=1)
    # Clamp the token count so an all-zero mask does not divide by zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)

    pooled = weighted_total / token_count
    return pooled[0]

In [4]:
def split_with_delimiter(text: str, delimiters: List[str]) -> Tuple[List[str], List[Tuple[int, int]]]:
    """
    Split text after every delimiter character and return the segments with their offsets.

    The delimiters are placed inside a regex character class, so splitting happens on
    individual characters: a multi-character delimiter contributes each of its characters.
    A segment keeps its trailing delimiter, and any text after the last delimiter becomes
    a final segment.

    Args:
        text: Input text to split
        delimiters: List of delimiter strings. If it contributes no characters
            (empty list or only empty strings), the whole text is one segment.

    Returns:
        Tuple containing:
        - List of segments, stripped of surrounding whitespace; pieces that are
          empty after stripping are dropped
        - List of (start, end) offsets into `text` for each kept segment. The offsets
          span the raw (unstripped) regex match, so they can include surrounding
          whitespace that the segment string itself no longer contains.
    """
    char_class = "".join(re.escape(d) for d in delimiters)

    if not char_class:
        # "[^]" is not a valid character class; without delimiters nothing can be split.
        stripped = text.strip()
        return ([stripped], [(0, len(text))]) if stripped else ([], [])

    # Either "anything up to and including one delimiter" or "the delimiter-free tail".
    pattern = re.compile(f"[^{char_class}]*[{char_class}]|[^{char_class}]+$")

    segments, offsets = [], []
    for match in pattern.finditer(text):
        segment = match.group().strip()
        if segment:
            segments.append(segment)
            offsets.append((match.start(), match.end()))

    return segments, offsets

In [5]:
def find_overlapping_tuples(
    list1: List[Tuple[int, int]],
    list2: List[Tuple[int, int]],
    inclusive: bool = True,
) -> List[List[int]]:
    """
    For every range in list1, collect the indices of the ranges in list2 that overlap it.

    The scan over list2 stops at the first range that starts after the current list1
    range ends, so list2 is expected to be ordered by start position.

    Args:
        list1: First list of (start, end) tuples
        list2: Second list of (start, end) tuples, ordered by start
        inclusive: When True (default, the historical behaviour) ranges that merely
            touch at an endpoint count as overlapping. Pass False for half-open
            [start, end) spans, where touching ranges share no position.

    Returns:
        One list per entry of list1 (same order), holding the indices into list2
        of the overlapping ranges. The inner list is empty when nothing overlaps.
    """
    overlapping_indices_list = []

    for (start1, end1) in list1:
        overlapping_indices = []
        for idx, (start2, end2) in enumerate(list2):
            if start2 > end1:
                # list2 is ordered by start: nothing further can overlap.
                break
            if inclusive:
                overlaps = end2 >= start1
            else:
                overlaps = start2 < end1 and end2 > start1
            if overlaps:
                overlapping_indices.append(idx)
        overlapping_indices_list.append(overlapping_indices)

    return overlapping_indices_list


In [6]:
def optimal_segmentation(
    values, 
    min_chunk_size, 
    max_chunk_size
    ):
    """
    Segments the input values into contiguous chunks that maximize the similarity within each chunk.

    Similarity is the dot product between rows, centred by the mean off-diagonal
    similarity so that grouping dissimilar rows is penalised. A dynamic program
    over the end position picks the best chunk ending at each row.

    Args:
        values (numpy.ndarray): A 2D array where each row represents a data point.
            Tensor-like objects exposing ``detach().cpu().numpy()`` (e.g. torch
            tensors, including ones on an accelerator) are converted first.
        min_chunk_size (int): The minimum size of each chunk (values below 1 are treated as 1).
        max_chunk_size (int): The maximum size of each chunk.
        
    Returns:
        list of tuples: A list of (start, end) row indices, both inclusive, in order.
        An empty input gives an empty list. An input with fewer rows than
        ``min_chunk_size`` is returned as one chunk covering everything. If the
        rows cannot be tiled exactly with sizes in [min, max], positions without
        a feasible chunk fall back to single-row chunks.
    """
    # np.dot cannot consume accelerator tensors; bring tensor-likes to host NumPy.
    if hasattr(values, "detach"):
        host_tensor = values.detach().cpu()
        try:
            values = host_tensor.numpy()
        except TypeError:
            # dtypes NumPy cannot represent (e.g. bfloat16) are upcast first
            values = host_tensor.float().numpy()
    values = np.asarray(values)

    # A chunk size of 0 would make the backtracking loop below spin forever.
    min_chunk_size = max(1, min_chunk_size)

    n = len(values)
    if n == 0:
        return []
    if n < max(min_chunk_size, 2):
        # Too short to honour the minimum (or a single row, whose off-diagonal
        # mean is undefined): keep everything together.
        return [(0, n - 1)]

    similarity_matrix = np.dot(values, values.T)
    mean_similarity = np.mean(similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)])
    similarity_matrix = similarity_matrix - mean_similarity
    np.fill_diagonal(similarity_matrix, 0)

    # dp[i]: best total reward for rows 0..i; segmentation[i]: start of the chunk ending at i.
    dp = np.zeros(n)
    segmentation = np.zeros(n, dtype=int)

    for i in range(n):
        max_reward = float('-inf')
        best_start = i

        for size in range(min_chunk_size, min(max_chunk_size + 1, i + 2)):
            start = i - size + 1
            reward = np.sum(similarity_matrix[start:i + 1, start:i + 1])
            if start > 0:
                reward += dp[start - 1]
            if reward > max_reward:
                max_reward = reward
                best_start = start

        dp[i] = max_reward
        segmentation[i] = best_start

    # Walk back from the last row, hopping from chunk start to previous chunk end.
    boundaries = []
    i = n - 1
    while i >= 0:
        start = int(segmentation[i])
        boundaries.append((start, i))
        i = start - 1

    boundaries.reverse()
    return boundaries

# Models

In [7]:
# Run everything on the GPU when one is available.
device: str = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding model

task: str = 'retrieval.passage'
max_tokens: int = 8192  # window size used when embedding long documents
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3", use_fast=True)
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to(device)
model.eval()

# multilingual NER model

ner_tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
ner_model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
# Put the NER model (not the embedding model a second time) in inference mode.
ner_model.eval()
# aggregation_strategy="simple" is the non-deprecated spelling of grouped_entities=True.
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")



# Processing documents

In [8]:
# Step 1: Get all tokens, offsets, and attention masks for the documents

documents = []
for doc in docs:
    # No truncation here: Step 2 splits the token sequence into windows of
    # `max_tokens`, so the full document has to be kept. Padding is not needed
    # because each document is tokenized on its own.
    doc_tokens = tokenizer(
        doc,
        return_offsets_mapping=True,
        return_attention_mask=True,
        add_special_tokens=False,
        truncation=False,
        return_tensors="pt"
    ).to(device)
    # Keep the raw text next to the tensors; later steps map character offsets back to it.
    doc_tokens['text'] = doc
    documents.append(doc_tokens)

In [9]:
# Step 2: get embedding at the token level for all documents
# `max_tokens` comes from the models cell.
# NOTE(review): `task` is defined in the models cell but never passed to the
# model — confirm whether a task-specific adapter was intended here.

with torch.no_grad():
    for document in documents:
        # Embed the document window by window, then stitch the windows back
        # together along the token axis.
        id_windows = torch.split(document['input_ids'], max_tokens, dim=1)
        mask_windows = torch.split(document['attention_mask'], max_tokens, dim=1)
        window_embeddings = [
            model(input_ids, attention_mask=attention_mask).last_hidden_state
            for input_ids, attention_mask in zip(id_windows, mask_windows)
        ]
        document['tokens_embeddings'] = torch.cat(window_embeddings, dim=1)
documents[0]['tokens_embeddings'].shape

torch.Size([1, 740, 1024])

In [10]:
# Step 3: Compute a mean-pooled embedding for every low-level segment (sentence / line) of each document

split_delimiters: List[str] = [".", "!", "?", ":", "\n"]

for document in documents:
    segments_text, segment_offset = split_with_delimiter(document['text'], split_delimiters)
    # Plain Python ints make the overlap search much cheaper than iterating tensor rows.
    token_offsets = document['offset_mapping'][0].tolist()
    segments_tokens_idx = find_overlapping_tuples(segment_offset, token_offsets)

    kept_text, kept_offset, kept_tokens_offset, segments = [], [], [], []
    for seg_text, seg_offset, segment_token_idx in zip(segments_text, segment_offset, segments_tokens_idx):
        if not segment_token_idx:
            # No token covers this segment: min()/max() below would raise,
            # so drop it and keep all per-segment lists aligned.
            continue
        segment_tokens_embeddings = document['tokens_embeddings'][0][segment_token_idx]
        segment_attention_mask = document['attention_mask'][0][segment_token_idx]
        segments.append(mean_pooling(segment_tokens_embeddings, segment_attention_mask))
        kept_text.append(seg_text)
        kept_offset.append(seg_offset)
        # (first, last) token index of the segment, both inclusive
        kept_tokens_offset.append((min(segment_token_idx), max(segment_token_idx)))

    document['segments_embeddings'] = torch.stack(segments)
    document['segments_text'] = kept_text
    document['segments_offset'] = kept_offset
    document['segments_tokens_offset'] = kept_tokens_offset
      

In [11]:
# Step 4 : build optimal chunks for each document

min_chunk_size = 5   # minimum number of segments per chunk
max_chunk_size = 20  # maximum number of segments per chunk
for document in documents:
    # optimal_segmentation is NumPy-based: hand it a host float array instead of
    # a (possibly GPU-resident) torch tensor.
    segments_matrix = document['segments_embeddings'].detach().float().cpu().numpy()
    boundaries = optimal_segmentation(segments_matrix, min_chunk_size, max_chunk_size)
    chunk_text, chunk_embedding = [],[]
    for (first_segment, last_segment) in boundaries:
        # Token span of the chunk: first token of its first segment through
        # last token of its last segment (+1 because slicing is end-exclusive).
        start_token = document['segments_tokens_offset'][first_segment][0]
        end_token = document['segments_tokens_offset'][last_segment][1]+1
        chunk_tokens_embeddings = document['tokens_embeddings'][0, start_token:end_token]
        chunk_attention_mask = document['attention_mask'][0, start_token:end_token]
        chunk_text.append(" ".join(document['segments_text'][first_segment:last_segment+1]))
        chunk_embedding.append(mean_pooling(chunk_tokens_embeddings, chunk_attention_mask))   
    document['chunks_text'] = chunk_text
    document['chunks_embedding'] = torch.stack(chunk_embedding) 

  similarity_matrix = np.dot(values, values.T)


In [12]:
# Step 5: Extracting keywords

for document in documents:
    # One keyword list per chunk: the surface form of every entity the NER pipeline finds.
    # TODO: filter entities based on confidence score and type (entity['entity_group'], entity['score'])
    document['chunks_keywords'] = [
        [entity['word'] for entity in ner_pipeline(chunk)]
        for chunk in document['chunks_text']
    ]

In [None]:
# Step 6: Extracting keyphrases with bm25
# TODO: one separate BM25 per chunk

# The BM25 index is built over the whitespace-tokenized chunks of all documents.
chunks_text = [doc['chunks_text'] for doc in documents]
flattened_chunks_text = [chunk for sublist in chunks_text for chunk in sublist]
tokenized_chunks = [chunk.split() for chunk in flattened_chunks_text]

bm25 = BM25Okapi(tokenized_chunks)

for document in documents:
    # The query must be tokenized the same way as the corpus; passing the raw
    # string would score each individual character as a query term.
    document['chunks_bm25'] = [
        bm25.get_scores(chunk.split()) for chunk in document['chunks_text']
    ]

In [14]:
# Cleaning step: drop the intermediate tensors and per-segment data, keeping only the chunk-level results
for doc in documents:
    for k in ['input_ids', 'attention_mask', 'offset_mapping', 'tokens_embeddings', 'segments_embeddings', 'segments_text', 'segments_offset', 'segments_tokens_offset']:
        # A default makes the cell safe to re-run: keys already removed are skipped
        # instead of raising KeyError.
        doc.pop(k, None)
documents[0].keys()

dict_keys(['text', 'chunks_text', 'chunks_embedding', 'chunks_keywords', 'chunks_bm25'])

# Testing

In [None]:
query = "What are the practical applications of neural networks?"
query_tokens = tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(device)
# Inference only: do not build an autograd graph for the query embedding.
with torch.no_grad():
    query_tokens_embeddings = model(**query_tokens)
query_embeddings = mean_pooling(query_tokens_embeddings.last_hidden_state, query_tokens['attention_mask'])
# Same whitespace tokenization as the BM25 corpus; one score per indexed chunk.
query_bm25 = bm25.get_scores(query.split())
query_keywords = [entity['word'] for entity in ner_pipeline(query)]

In [None]:
# Show the three query-side representations computed in the previous cell:
# NER keywords, BM25 scores against the chunk index, and the pooled embedding.
print(f"keywords: {query_keywords}")
print(f"bm25: {query_bm25}")
print(f"query embedding: {query_embeddings}")

keywords: ['neu']
bm25: [3.46378043 4.85887784 1.01981112 0.         3.42006948 0.87089473
 0.         4.89224418 3.14475968 1.82215622 1.89617406 1.9507797
 1.92552558 1.0941129  0.98224641 2.25799814]
query embedding: tensor([ 1.0069, -2.5083, -0.2263,  ..., -0.6470,  0.8299,  0.3015])


In [None]:
# Dump every chunk with its keywords, BM25 vector and embedding for inspection.
for document in documents:
    for chunk_text, chunk_bm25, chunk_embedding, chunk_keywords in zip(document['chunks_text'], document['chunks_bm25'], document['chunks_embedding'], document['chunks_keywords']):
        print(f"chunk: {chunk_text}")
        print(f"keywords: {chunk_keywords}")
        print(f"bm25: {chunk_bm25}")
        print(f"embedding: {chunk_embedding}")


chunk: The History and Impact of Artificial Neural Networks Artificial Neural Networks (ANNs) represent a fundamental shift in how we approach computation and artificial intelligence. Inspired by biological neural networks, these systems have evolved from simple perceptrons in the 1950s to today's sophisticated deep learning architectures. Early Development (1940s-1950s): The first artificial neuron was proposed by Warren McCulloch and Walter Pitts in 1943. Their mathematical model showed how neurons might work, demonstrating that simple neural networks could compute basic logical functions. In 1957, Frank Rosenblatt developed the perceptron, the first algorithm that could learn specific patterns through iterative training. The AI Winter (1970s): Despite early promise, neural network research faced significant setbacks in the 1970s. Marvin Minsky and Seymour Papert's 1969 book "Perceptrons" highlighted fundamental limitations of single-layer networks, particularly their inability to so

# Other

In [15]:
# Cluster token embeddings into 64 "concepts" and score every token against each concept.
# NOTE(review): `token_embeddings` is not defined in any visible cell — presumably a
# [num_tokens, hidden_dim] tensor left over from an earlier session; confirm its source.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError; scikit-learn
# is not among the packages installed by the %pip cell at the top of the notebook.
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=64, random_state=411)
kmeans.fit(token_embeddings.cpu().numpy())

# Normalize the token embeddings and cluster centers
normalized_token_embeddings = F.normalize(token_embeddings, p=2, dim=1)
# torch.tensor(...) creates the centers on the CPU — TODO confirm this matches the
# device (and dtype) of `token_embeddings` before the matrix product below.
concepts = F.normalize(torch.tensor(kmeans.cluster_centers_), p=2, dim=1)

# Compute the cosine similarity
# (both operands are L2-normalized, so the product is a token-by-concept cosine matrix)
concepts_target = torch.mm(normalized_token_embeddings, concepts.T)

concepts.shape, concepts_target.shape

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Sample questions about the neural-network document (docs[0]) used in the cells below.
queries  = [
    "What were the major developments in neural networks during the 1980s and 1990s?",
    "Explain the basic components of a neural network's architecture.",
    "What are the current applications of neural networks in healthcare?",
    "What caused the AI Winter in the 1970s?",
    "What are the main challenges facing neural networks today?",
]

In [None]:
# Embed the second sample query and mean-pool its token embeddings.
# NOTE(review): `get_embedding` is not defined in any visible cell; from this call it
# appears to return (token embeddings, <unused>, attention mask) — confirm.
query_token_embeddings, _, query_attention_mask = get_embedding(queries[1:2])
# NOTE(review): mean_pooling returns only the first batch row, so the extra
# unsqueeze(0) assumes unbatched inputs here — verify the resulting shape.
query_embedding = mean_pooling(query_token_embeddings.unsqueeze(0), query_attention_mask.unsqueeze(0))
query_embedding.shape

In [None]:
# Rank chunks by cosine similarity to the query embedding.
# NOTE(review): `chunks` is not defined in any visible cell — presumably a list of
# dicts with an 'embedding' tensor per chunk; confirm against the cell that built it.
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between query embedding and chunk embeddings
similarities = cosine_similarity(query_embedding.cpu().numpy(), np.stack([chunk['embedding'].cpu().numpy() for chunk in chunks]))

# Only row 0 is inspected, i.e. a single query is assumed.
for i, similarity in enumerate(similarities[0]):
    print(f"chunk: {i}")
    print(f"Similarity: {np.max(similarity)}")
    print()
    
# Get the most similar chunks
print("most similar chunk:")
# argmax runs over the flattened matrix; presumably this equals the chunk index
# because there is one query row — TODO confirm.
most_similar_idx = np.argmax(similarities)
most_similar_chunk = chunks[most_similar_idx]

most_similar_chunk

In [None]:
# Score every token by how strongly its concepts match the query.
# Relies on `query_embedding`, `concepts` and `concepts_target` from the cells above.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Similarity between the query and each concept (cluster center).
concept_similarities = cosine_similarity(query_embedding.cpu().numpy(), concepts.cpu().numpy())

concepts_target_np = np.array(concepts_target)
concept_similarities_np = np.array(concept_similarities.T)
# Weight each token's concept affinities by the query's concept similarities;
# [:, 0] selects the column for the first (presumably only) query.
token_importances = np.dot(concepts_target_np, concept_similarities_np)[:,0]
# Min-max rescale to [0, 1].
# NOTE(review): divides by zero when all importances are equal.
token_importances = (token_importances - np.min(token_importances)) / (np.max(token_importances) - np.min(token_importances))

In [None]:
def find_dense_subsequences(
    values: np.ndarray,
    min_size: int = 1,
    max_size: Optional[int] = None,
    num_sequences: int = 3,
    min_density: Optional[float] = None,
    min_gap: int = 0
) -> List[Tuple[int, int, float]]:
    """
    Greedily pick non-overlapping windows of `values` with the highest mean value.

    Each round scans every window whose length is in [min_size, max_size], keeps the
    one with the highest density (mean of its values; the first one found wins ties),
    marks its positions as used and repeats. A window is only eligible if no used
    position lies inside it or within `min_gap` positions on either side.

    Args:
        values: 1-D sequence of scores.
        min_size: Minimum window length (values below 1 are treated as 1).
        max_size: Maximum window length; defaults to len(values).
        num_sequences: Maximum number of windows to return.
        min_density: If given, stop as soon as the best remaining window is below it.
        min_gap: Minimum number of unused positions between two selected windows.

    Returns:
        List of (start, end, density) tuples with `end` exclusive, sorted by
        density in descending order. May hold fewer than `num_sequences` entries.
    """
    n = len(values)
    if max_size is None:
        max_size = n
    # A zero-length window has no defined density.
    min_size = max(1, min_size)

    results = []
    used_positions = np.zeros(n, dtype=bool)

    # Prefix sums give every window sum in O(1).
    cumsum = np.concatenate(([0], np.cumsum(values)))

    while len(results) < num_sequences:
        # Prefix counts of used positions give the "is this region free?" check in
        # O(1) instead of rescanning the region for every candidate window.
        used_prefix = np.concatenate(([0], np.cumsum(used_positions)))

        max_density = float('-inf')
        best_start = None
        best_end = None

        for length in range(min_size, min(n + 1, max_size + 1)):
            for start in range(n - length + 1):
                end = start + length

                lo = max(0, start - min_gap)
                hi = min(n, end + min_gap)
                if hi > lo and used_prefix[hi] != used_prefix[lo]:
                    continue

                density = (cumsum[end] - cumsum[start]) / length

                if density > max_density:
                    max_density = density
                    best_start = start
                    best_end = end

        if best_start is None or (min_density is not None and max_density < min_density):
            break

        used_positions[best_start:best_end] = True
        results.append((best_start, best_end, max_density))

    results.sort(key=lambda x: x[2], reverse=True)
    return results

In [None]:
concepts_results = find_dense_subsequences(token_importances, min_size=20, max_size=100, num_sequences=3, min_density=0.2, min_gap=10)
print(f"Found {len(concepts_results)} dense passages:")
# NOTE(review): `offsets_mapping` is not defined in any visible cell — presumably the
# per-token (char_start, char_end) tensor for docs[0]; confirm.
for start, end, density in concepts_results:
    text_start = offsets_mapping[start][0].item()
    # `end` is exclusive, so the last token of the passage is end - 1; indexing
    # with `end` would take the next token and overflow when end == len(tokens).
    text_end = offsets_mapping[end - 1][1].item()
    dense_passage = docs[0][text_start:text_end]
    print(f"Dense Passage: {dense_passage}")
    print(f"Density: {density}")
    print()