<a href="https://colab.research.google.com/github/DivijJaswal/LLM-Research/blob/main/Test_Run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/DivijJaswal/LLM-Research.git


Cloning into 'LLM-Research'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 24 (delta 12), reused 14 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (24/24), 9.64 KiB | 9.64 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [2]:
import torch
from transformers import pipeline
from huggingface_hub import login
from transformers import AutoTokenizer
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import spacy
from sklearn.decomposition import TruncatedSVD


In [9]:
def shorten_text(text, chunk_size, method, summarizer = None):
    """
    Shortens the text using the specified method.
    :param text: The original text to shorten.
    :param chunk_size: The size of each chunk if needed.
    :param method: Method to shorten the text.
    :return: Shortened text.
    """
    if method == "clipping":
        return text[:chunk_size]

    elif method == "iterative":
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
        summarized_chunks = [summarizer(chunk, min_length=50, max_length=100, do_sample=False)[0]['summary_text'] for chunk in chunks]
        return " ".join(summarized_chunks)

    elif method == "random_removal":
        words = text.split()
        while len(words) > chunk_size:
            index_to_remove = random.randint(0, len(words) - 1)
            del words[index_to_remove]
        return " ".join(words)

    elif method == "sentence_ranking":
        return tfidf_sentence_ranking(text, chunk_size)

    elif method == "sliding_window":
        return sliding_window(text, chunk_size)

    elif method == "entity_filtering":
        return entity_filtering(text, chunk_size)

    elif method == "summarize_summary":
        return summarize_summary(text, chunk_size)

    elif method == "lsa":
        return lsa_text_summarization(text, chunk_size)

def tfidf_sentence_ranking(text, chunk_size):
    sentences = text.split('. ')
    vectorizer = TfidfVectorizer().fit_transform(sentences)
    vectors = vectorizer.toarray()
    sentence_scores = np.sum(vectors, axis=1)

    ranked_sentences = sorted(((score, i, s) for i, (score, s) in enumerate(zip(sentence_scores, sentences))), reverse=True)
    selected_sentences = [s for _, _, s in ranked_sentences[:int(chunk_size / 20)]]
    return '. '.join(selected_sentences)

def sliding_window(text, chunk_size, overlap=100, summarizer = None):
    words = text.split()
    windows = []
    for i in range(0, len(words), chunk_size - overlap):
        window = words[i:i+chunk_size]
        windows.append(" ".join(window))

    summarized_windows = [summarizer(w, min_length=50, max_length=100, do_sample=False)[0]['summary_text'] for w in windows]
    return " ".join(summarized_windows)

def entity_filtering(text, chunk_size):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    important_sentences = []
    for sent in doc.sents:
        if any(ent.label_ in ["PERSON", "ORG", "GPE", "DATE"] for ent in sent.ents):
            important_sentences.append(sent.text)
        if len(important_sentences) >= chunk_size / 20:
            break
    return " ".join(important_sentences)


def summarize_summary(text, chunk_size, min_length=200, summarizer = None):
    """
    Recursively summarize the text until it is under a desired length.
    """
    summarized_text = text
    iteration = 0
    while len(summarized_text) > chunk_size:
        print(f"Iteration {iteration + 1}: Text too long. Summarizing again.")
        summarized_text = summarizer(summarized_text, num_beams=5, min_length=min_length, max_length=chunk_size, do_sample=False)[0]['summary_text']
        iteration += 1
    return summarized_text

def lsa_text_summarization(text, chunk_size):
    """
    Use Latent Semantic Analysis (LSA) to extract the most important concepts from the text and return the most relevant sentences.
    """
    sentences = text.split('. ')

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    lsa_model = TruncatedSVD(n_components=1, n_iter=100)
    lsa_model.fit(X)
    lsa_scores = lsa_model.transform(X)

    # Rank sentences by their relevance to the main topics
    ranked_sentences = sorted(((lsa_scores[i, 0], s) for i, s in enumerate(sentences)), reverse=True)

    # Select the top N sentences based on LSA scores
    selected_sentences = [s for _, s in ranked_sentences[:int(chunk_size / 20)]]
    return '. '.join(selected_sentences)

In [21]:
import torch
from transformers import pipeline
from huggingface_hub import login
from transformers import AutoTokenizer

def summarize_text(text,num_beams = 5):

    login(token = "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")

    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
    summarizer = pipeline("summarization", tokenizer=tokenizer,model="google/flan-t5-base")

    short_summary = summarizer(text ,num_beams, min_length = 50, max_length =100,do_sample=False)
    medium_summary = summarizer(text ,num_beams, min_length = 100, max_length =150,do_sample=False)
    large_summary = summarizer(text ,num_beams, min_length = 150, max_length =200,do_sample=False)

    print(short_summary)
    print(medium_summary)
    print(large_summary)

In [5]:
input_file="/content/LLM-Research/text1.txt"
print(input_file)

/content/LLM-Research/text1.txt


In [15]:
with open(input_file, 'r', encoding='utf-8') as file:
    text = file.read()

In [17]:
text = shorten_text(text, chunk_size = 480, method = 'random_removal')

In [19]:
len(text.split())

480

In [22]:
summarize_text(text, num_beams = 5)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Ignoring args : (5,)
Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors
Ignoring args : (5,)
Ignoring args : (5,)


[{'summary_text': 'ICC is the international federation responsible the governance the sport of cricket and Code of Conduct for Players and Player “Code of Conduct” is adopted and part of ICC’s continuing efforts maintain of cricket by (a) effective any participant from conducting themselves improperly on off field-of-play or manner that is contrary to the Spirit Cricket; and (b) robust proportionate procedure pursuant all of conduct can be dealt fairly, with certainty and an expeditious manner.'}]
[{'summary_text': 'ICC CODE OF FOR PLAYERS AND PLAYER INTRODUCTION ICC is the international federation responsible the governance the sport of cricket and Code of Conduct for Players and Player is adopted and part of ICC’s continuing efforts maintain of cricket by (a) effective any participant from conducting themselves improperly on off field-of-play or manner that is contrary to the Spirit Cricket; and (b) robust proportionate procedure pursuant all of conduct can be dealt fairly, with cert