In [12]:
import os, json

def read_input_texts_from_folder(raw_text_corpus_path, return_as_string=False):
    """
    Collect the 'text' field of every JSON-lines record found under a folder.

    The folder is walked recursively and every file is read as JSON Lines (one JSON
    document per line). For each line that parses to a JSON object, the 'text' value is
    stripped of surrounding whitespace and kept if it is non-empty. Used to extract all
    Wikipedia articles from a raw text Wikipedia dump, where every record is one article.

    Lines are skipped (never raise) when they are not valid JSON, when they parse to
    something other than a JSON object, or when 'text' is missing, empty, or not a string.

    Args:
        raw_text_corpus_path (str): The path of the raw text corpus to read.
        return_as_string (bool): If true, all text is returned concatenated into a single
            string using line breaks. Defaults to False.

    Returns:
        all_text (list | str): All text from the folder, either as a list with one entry
            per record or as a single newline-joined string.

    Raises:
        UnicodeDecodeError: If a file under the folder is not valid UTF-8.
    """
    text_list = []

    for root, dirs, files in os.walk(raw_text_corpus_path):
        # os.walk gives no ordering guarantee. Sorting the directory list in place and the
        # file names makes the article order (e.g. which article is at index 0) reproducible.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # Skip lines that are not valid JSON (including blank lines)

                    # Valid JSON is not necessarily an object: lists, numbers, strings and
                    # null have no .get(), so they are skipped instead of crashing the scan.
                    if not isinstance(data, dict):
                        continue

                    text_content = data.get('text', '')
                    # 'text' may be null or some non-string value; only strings can be stripped.
                    if not isinstance(text_content, str):
                        continue

                    text_content = text_content.strip()
                    if text_content:  # Ensure only non-empty content is added
                        text_list.append(text_content)

    if return_as_string:
        return "\n".join(text_list)
    return text_list


In [13]:
# wikipedia_text_entirety = read_input_texts_from_folder("D://raw_text//AA", False)

In [14]:
# Load the local Wikipedia dump as a list (one string per record, not a joined string).
wikipedia_text = read_input_texts_from_folder(
    raw_text_corpus_path="../context/wikipedia",
    return_as_string=False,
)

In [3]:
import sys; sys.path.append("../query")
from embedding_model import EmbeddingModel
embed = EmbeddingModel()

In [16]:
first_article = wikipedia_text[0]
first_article

In [18]:
import sys; sys.path.append("../processor")
from chunked_pooling import chunk_by_sentences

In [24]:
# Split article into chunks
first_article_chunks, span_annotations = chunk_by_sentences(first_article, tokenizer=embed.tokenizer)

chunks:['Anarchism\n\nAnarchism is a political philosophy and movement that is against all forms of authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including the state and capitalism.', ' Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations.', ' As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).', '\n\nAlthough traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment.', " During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in workers' struggles for emancipation.", ' Various anarchist schools of thought formed during this period.', ' Anarchists

In [None]:
# Embed the first article chunk by chunk: one `embed.get_embedding` call per chunk,
# results kept in the same order as `first_article_chunks`.
chunk_embeddings = [
    embed.get_embedding(chunk, False)
    for chunk in first_article_chunks
]

# Other

In [13]:
from transformers import AutoModel
from transformers import AutoTokenizer

import sys; sys.path.append("../processor")
from chunked_pooling import chunked_pooling, chunk_by_sentences

# Load the tokenizer and the model from one and the same Hugging Face checkpoint.
# NOTE(review): trust_remote_code=True allows Python code shipped with the checkpoint to
# run locally — confirm the repository is trusted (and consider pinning a revision).
JINA_CHECKPOINT = 'jinaai/jina-embeddings-v2-base-en'
hub_options = {'trust_remote_code': True}

tokenizer = AutoTokenizer.from_pretrained(JINA_CHECKPOINT, **hub_options)
model = AutoModel.from_pretrained(JINA_CHECKPOINT, **hub_options)

In [17]:
def late_chunking(input_text):
    """
    Embed a text with late chunking: run the model once over the whole text, then pool
    the model output per span.

    The spans come from `chunk_by_sentences`, and the pooling is delegated to
    `chunked_pooling`; both use the module-level `tokenizer` / `model`.

    Args:
        input_text (str): The full text to embed.

    Returns:
        The first (and only) batch entry returned by `chunked_pooling`.
        NOTE(review): presumably one embedding per sentence span — confirm against
        `chunked_pooling` in ../processor.
    """
    # Function-local import: the cell that defines this function does not import torch.
    import torch

    # Only the span annotations are needed here; the chunk strings are discarded.
    _, span_annotations = chunk_by_sentences(input_text, tokenizer)

    inputs = tokenizer(input_text, return_tensors='pt')

    # Pure inference: disable autograd so no computation graph is built and held in
    # memory for the (potentially very long) full-document forward pass.
    with torch.no_grad():
        model_output = model(**inputs)

    # A single document is passed, so the span list is wrapped in a one-element batch
    # and the matching single result is unwrapped again.
    return chunked_pooling(model_output, [span_annotations])[0]

In [26]:
# Re-chunk the first article using the module-level `tokenizer`; note that this rebinds
# `span_annotations`, which an earlier cell also assigns.
chunks, span_annotations = chunk_by_sentences(first_article, tokenizer)

# Embed the chunk strings directly through `model.encode` (i.e. not via `late_chunking`).
# NOTE(review): presumably each chunk is embedded independently here — confirm against
# the model's `encode` implementation.
embeddings = model.encode(chunks)

chunk_position from 1: [(44, 237), (63, 348), (104, 563), (124, 682), (162, 888), (172, 952), (212, 1147), (255, 1362), (280, 1526), (313, 1719), (333, 1817), (400, 2018), (416, 2090), (452, 2225), (479, 2380), (534, 2642), (574, 2812), (611, 2982), (647, 3143), (717, 3533), (746, 3680), (766, 3775), (783, 3858), (811, 3998), (834, 4124), (849, 4211), (900, 4469), (959, 4746), (978, 4835), (1010, 4978), (1033, 5073), (1049, 5157), (1078, 5291), (1094, 5406), (1123, 5523), (1147, 5654), (1165, 5750), (1181, 5826), (1214, 5967), (1226, 6021), (1241, 6105), (1270, 6293), (1305, 6497), (1338, 6653), (1400, 6950), (1430, 7105), (1457, 7230), (1501, 7478), (1524, 7609), (1561, 7780), (1584, 7900), (1600, 7998), (1625, 8116), (1641, 8203), (1645, 8224), (1672, 8349), (1704, 8509), (1722, 8584), (1736, 8652), (1758, 8753), (1781, 8865), (1810, 8990), (1833, 9119), (1875, 9353), (1924, 9607), (1938, 9689), (1988, 9970), (2011, 10096), (2038, 10226), (2064, 10380), (2104, 10608), (2142, 10829), 

In [27]:
embeddings

array([[-0.18438207, -0.9284356 ,  0.53803533, ...,  0.9604816 ,
        -0.5733379 , -0.23253946],
       [ 0.1779887 , -0.25393423,  0.6923518 , ...,  0.5507926 ,
        -0.5229826 , -0.19712846],
       [-0.14048952, -0.9132976 ,  0.61803347, ...,  0.8029017 ,
        -0.3384023 , -0.2360551 ],
       ...,
       [-0.05098847, -0.2967887 ,  0.62198156, ...,  0.79796517,
        -0.33379242, -0.49701294],
       [ 0.02915758, -0.3697654 ,  0.69791025, ...,  0.47631672,
        -0.49336445, -0.2564552 ],
       [-0.4571627 , -0.2647516 ,  0.7009412 , ...,  0.46886438,
        -0.3131449 , -0.77323276]], dtype=float32)

In [24]:
# NOTE(review): `enumerat` is not defined anywhere in the visible notebook and looks like
# an unfinished `enumerate(...)` expression, so this cell presumably raises NameError on a
# fresh run. The string output recorded for this cell cannot have come from this
# expression — complete the expression or delete the cell.
enumerat

' In Europe, various religious sects developed anti-state and libertarian tendencies.'

In [4]:
from typing import Union
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("avsolatorio/NoInstruct-small-Embedding-v0")
tokenizer = AutoTokenizer.from_pretrained("avsolatorio/NoInstruct-small-Embedding-v0")


# Source: https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0
def get_embedding(text: Union[str, list[str]], mode: str = "sentence"):
    """
    Embed one text or a batch of texts with the module-level `model` / `tokenizer`.

    Args:
        text: A single string or a list of strings. A single string is treated as a
            batch of one.
        mode: "query" for attention-masked mean pooling over the tokens, or "sentence"
            (default) for the hidden state of the first token of each sequence.

    Returns:
        A 2-D tensor with one row per input text.

    Raises:
        AssertionError: If `mode` is neither "query" nor "sentence".
    """
    model.eval()

    assert mode in ("query", "sentence"), f"mode={mode} was passed but only `query` and `sentence` are the supported modes."

    batch = [text] if isinstance(text, str) else text
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # The model is optimized to use the mean pooling for queries,
    # while the sentence / document embedding uses the [CLS] representation.
    if mode != "query":
        return hidden_states[:, 0, :]

    # Zero out padded positions, then divide the per-sequence sum by the number of real tokens.
    attention_mask = encoded["attention_mask"]
    masked_sum = (hidden_states * attention_mask.unsqueeze(2)).sum(dim=1)
    return masked_sum / attention_mask.sum(dim=-1).view(-1, 1)

In [10]:
# Smoke test: embed only the first article (if there is one) and print the resulting
# tensor together with its length.
for article in wikipedia_text[:1]:
    article_embedding = get_embedding(article)

    print(article_embedding)
    print(len(article_embedding))

tensor([[-1.3930e-01,  4.0428e-02,  4.8796e-02, -9.9161e-02,  4.0330e-02,
          3.2657e-01,  2.0233e-01,  1.7060e-01, -1.3455e-01, -2.3104e-01,
          1.1235e-01, -2.0545e-01, -1.1739e-01,  1.9158e-01,  2.2008e-01,
         -1.6492e-02, -1.8121e-01,  2.6459e-01, -2.1677e-01,  6.5115e-01,
         -3.3521e-02, -1.8031e-01, -2.3771e-01, -1.9862e-01,  1.5056e-02,
          2.0328e-01,  1.0520e-01,  1.2156e-01, -2.9869e-01, -1.5655e+00,
          1.0728e-01, -4.0765e-01,  3.3372e-01, -9.0552e-02,  6.6287e-02,
          3.1983e-01, -2.3777e-01, -3.1746e-02, -3.4505e-01,  3.7471e-01,
          1.4835e-01,  1.8229e-01,  1.9874e-01, -3.4285e-01, -1.2271e-01,
          1.1697e-01, -1.4483e-02, -1.1616e-01, -4.6207e-01, -5.5760e-01,
          5.2549e-01,  5.4421e-02, -1.6177e-01,  2.5220e-01,  1.8216e-01,
          3.5448e-01,  2.0906e-01, -1.1783e-02,  3.2788e-01, -4.1459e-01,
          5.9622e-01,  4.5835e-01, -2.1890e+00,  6.4322e-01,  4.5631e-01,
          4.8763e-01,  3.1803e-01, -2.

In [7]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# # class Summariser():
# #     def __init__(self):
# #         self.tokenizer = T5Tokenizer.from_pretrained("google/long-t5-tglobal-base")
# #         self.model = T5ForConditionalGeneration.from_pretrained("google/long-t5-tglobal-base")

# #     def summarise(self, txt, min_length=100, max_length=512):
# #         """
# #         """
        
# #         input_text = "Summarize: " + txt
# #         input_ids = self.tokenizer.encode(input_text, return_tensors="pt")
# #         outputs = self.model.generate(input_ids, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=5)
# #         summary = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
# #         return summary

In [1]:
import sys; sys.path.append("../")
from language_model import LanguageModel
import transformers, torch
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from abc import ABC

class Summariser(LanguageModel):
    """
    Chat-style summariser that wraps a Hugging Face "text-generation" pipeline.

    `self.model` and `self.tokenizer` are not assigned in this class; they are read right
    after `super().__init__`, so they are presumably set up by `LanguageModel` — confirm
    against ../language_model.
    """

    def __init__(self, model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct", causal = True, quantized = True):
        """
        Initialise the base language model and build the generation pipeline around it.

        Args:
            model_name (str): Forwarded to `LanguageModel.__init__`.
            causal (bool): Forwarded to `LanguageModel.__init__`.
            quantized (bool): Forwarded to `LanguageModel.__init__`.
        """
        super().__init__(model_name, causal, quantized)

        pipeline_options = dict(
            model=self.model,
            tokenizer=self.tokenizer,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.pipeline = transformers.pipeline("text-generation", **pipeline_options)

    def __summarise__(self, input_text, limit):
        """
        Summarise `input_text` using a system prompt that asks for at most `limit` words.

        Args:
            input_text (str): The document to summarise.
            limit (int): Word limit stated in the prompt. The same number is also passed
                as `max_new_tokens`, i.e. it caps the generated tokens as well.
                NOTE(review): tokens are not words, so this cap may cut a summary short
                of `limit` words — confirm whether the hard cap is intended.

        Returns:
            tuple: `(chat_history, summary)` — the raw pipeline output and the 'content'
            of the last message of the first generated sequence.
        """
        system_prompt = f"You are an expert in summarising large documents into concise summaries which contain as much meaningful information from the original document as possible. Ensure your summaries only contain content from the original document. Summarise all text into strictly {limit} words or less."
        user_prompt = f"Summarise the following text:\n{input_text}"

        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_prompt},
        ]

        generation_options = {
            'temperature': 0.01,
            'do_sample': True,
            'top_k': 10,
            'num_return_sequences': 1,
            'eos_token_id': self.tokenizer.eos_token_id,
            'truncation': True,
            'max_new_tokens': limit,
        }
        chat_history = self.pipeline(messages, **generation_options)

        # 'generated_text' is indexed as a list of messages; its last entry is taken as
        # the assistant's reply.
        summary = chat_history[0]['generated_text'][-1]['content']

        return chat_history, summary

: 

In [11]:
# Build the summariser with its default constructor arguments.
sum_model = Summariser()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

You shouldn't move a model that is dispatched using accelerate hooks.


In [None]:
# Summarise the first article. `limit=32` is the word limit stated in the prompt and is
# also used as the cap on newly generated tokens.
chat_history, summary = sum_model.__summarise__(first_article, limit=32)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
