<a href="https://colab.research.google.com/github/AbelAbeb/NLP-Assignment2/blob/main/NLP_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the input
# document read later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import torch
import nltk
import re
import os

# Fetch the NLTK resources the preprocessing step relies on: the stopword
# list, the Punkt tokenizer data and the WordNet corpus.
for _resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_resource)


def read_text_from_file(file_path):
    """Return the full contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


def preprocess_text(text):
    """Normalize raw (possibly HTML) text into a space-joined string of lemmas.

    Steps, in order: strip HTML markup, lowercase, replace every run of
    characters outside ``a-z`` with a single space, tokenize, drop English
    stopwords, and lemmatize each remaining token with WordNet.

    Args:
        text: Raw input string; may contain HTML tags.

    Returns:
        The surviving lemmatized tokens joined by single spaces. The result
        is an empty string when no token survives the filtering.
    """
    # HTML tag removal
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Lowercasing, then noise removal (digits, punctuation, non-ASCII, ...)
    text = re.sub(r'[^a-z]+', ' ', text.lower())

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword removal. Build the stopword set once: evaluating
    # stopwords.words('english') inside the filter repeats the call for every
    # token and tests membership against a list instead of a set.
    english_stopwords = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in english_stopwords]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

def compare_slices(sentence1, sentence2, threshold=0.8):
    """Tell whether two texts are similar under TF-IDF cosine similarity.

    Both inputs are normalized with ``preprocess_text`` and vectorized
    together with a ``TfidfVectorizer`` fitted on just this pair.

    Args:
        sentence1: First text to compare.
        sentence2: Second text to compare.
        threshold: Similarity that must be strictly exceeded for the pair to
            count as similar.

    Returns:
        ``True`` when the cosine similarity is greater than ``threshold``,
        otherwise ``False``. Also ``False`` when the vectorizer cannot build
        a vocabulary from the pair (nothing usable left after preprocessing).
    """
    # Preprocess the sentences
    sentence1 = preprocess_text(sentence1)
    sentence2 = preprocess_text(sentence2)

    # Vectorization. fit_transform raises ValueError when it ends up with an
    # empty vocabulary (e.g. both texts contained only stopwords); without a
    # vocabulary there is no evidence of similarity, so report "not similar"
    # instead of crashing the caller's loop.
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([sentence1, sentence2])
    except ValueError:
        return False

    # Calculate cosine similarity; entry [0, 1] is sentence1 vs sentence2.
    # cosine_similarity accepts the sparse matrix directly.
    cos_sim = cosine_similarity(tfidf_matrix)[0, 1]

    # Compare with threshold (plain bool rather than a NumPy scalar)
    return bool(cos_sim > threshold)

def remove_similar_slices(sliced_texts, similarity_threshold=0.2):
    """Drop every slice that is similar to some earlier slice.

    Each slice is compared, via ``compare_slices``, with all slices that come
    after it. Any later slice reported as similar is marked for removal, and
    a line listing the matches (1-based) is printed for each slice that has
    at least one.

    Args:
        sliced_texts: List of text slices. It is modified in place.
        similarity_threshold: Threshold forwarded to ``compare_slices``.

    Returns:
        The same ``sliced_texts`` list object, with the marked slices deleted.
    """
    total = len(sliced_texts)
    duplicates = set()

    for current in range(total):
        # Only look forward: earlier pairs were already checked.
        matches = [
            later
            for later in range(current + 1, total)
            if compare_slices(sliced_texts[current], sliced_texts[later], similarity_threshold)
        ]
        if not matches:
            continue

        labels = ', '.join(str(position + 1) for position in matches)
        print(f"Slice {current + 1} is similar to the following slices: {labels}")
        duplicates.update(matches)

    # Delete from the back so the remaining indices stay valid.
    for position in sorted(duplicates, reverse=True):
        del sliced_texts[position]

    return sliced_texts

def slice_text_with_max_tokens_limit(text, max_tokens, similarity_threshold=0.2):
    """Split text into slices of at most ``max_tokens`` GPT-2 tokens.

    The text is stripped of HTML markup and lowercased, then encoded with the
    pre-trained GPT-2 tokenizer. If it fits within ``max_tokens`` it is
    returned as a single slice. Otherwise a warning is printed, the token
    sequence is cut into consecutive windows of ``max_tokens`` tokens, each
    window is decoded back to text, and near-duplicate slices are dropped
    with ``remove_similar_slices``.

    Args:
        text: Raw input text; may contain HTML tags.
        max_tokens: Maximum number of tokens per slice; must be positive.
        similarity_threshold: Threshold forwarded to ``remove_similar_slices``.

    Returns:
        A list of decoded text slices.

    Raises:
        ValueError: If ``max_tokens`` is not positive. Such a value would
            otherwise never advance the slicing window.
    """
    # Validate before loading anything: a non-positive window cannot make
    # progress through the token sequence.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    # Only the tokenizer is needed for slicing; the language model itself is
    # not used here, so it is not loaded.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # HTML tag removal and lowercasing
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ").lower()

    # Tokenize the input text; the result is indexed as [0, position] below.
    tokens = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
    total_tokens = tokens.size(1)

    # Text that fits in one window (including an exact fit) is one slice.
    if total_tokens <= max_tokens:
        return [tokenizer.decode(tokens[0], skip_special_tokens=True)]

    print(f"Warning: Input text exceeds the maximum token limit of {max_tokens}. Slicing into smaller parts.")

    # Consecutive, non-overlapping windows; the last one may be shorter.
    sliced_texts = [
        tokenizer.decode(tokens[0, start:start + max_tokens], skip_special_tokens=True)
        for start in range(0, total_tokens, max_tokens)
    ]

    # Remove similar slices
    return remove_similar_slices(sliced_texts, similarity_threshold)

def generate_responses(sliced_texts, user_question="What is AI?", max_slices=1):
    """Print a GPT-2 continuation for a prompt built from each text slice.

    For every processed slice the prompt is the slice followed by
    ``user_question``. The decoded output of ``model.generate`` is printed
    with a ``Model: `` prefix.

    Args:
        sliced_texts: Text slices to use as context, in order.
        user_question: Question appended to each prompt. Defaults to the
            question that was previously hard-coded.
        max_slices: Maximum number of slices to process, counted from the
            start. Defaults to 1, matching the earlier behaviour of stopping
            after the first slice. Pass ``None`` to process every slice.

    Returns:
        None. Results are printed, not returned.
    """
    # Nothing to generate: skip loading the tokenizer and model entirely.
    if not sliced_texts or (max_slices is not None and max_slices <= 0):
        return

    # Load pre-trained GPT-2 tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # The EOS token id is passed to generate() as the padding id.
    pad_token_id = tokenizer.eos_token_id

    # Generate responses for each sliced text
    for i, sliced_text in enumerate(sliced_texts):
        if max_slices is not None and i >= max_slices:
            break

        # Tokenize the prompt built from the slice and the question
        prompt = f"Initial Input:\n\n{sliced_text}\n\nUser Question: {user_question}"
        input_ids = tokenizer.encode(prompt, return_tensors="pt")

        # Single unpadded sequence, so every position is attended to.
        attention_mask = torch.ones(input_ids.shape, device=model.device)

        # Generate response from GPT-2
        output = model.generate(
            input_ids,
            max_length=1000,
            num_beams=5,
            do_sample=True,
            no_repeat_ngram_size=2,
            top_k=100,
            top_p=0.95,
            temperature=0.9,
            attention_mask=attention_mask,
            pad_token_id=pad_token_id
        )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

        # Display the response
        print(f"Model: {generated_text}")

# Location of the input document on the mounted Google Drive.
file_path = '/content/drive/MyDrive/AI Class/NLP/NLP-Assignment2/doc_with_conclusion.txt'
# Example usage
input_text = read_text_from_file(file_path)
# Maximum number of GPT-2 tokens allowed in a single slice.
max_tokens_limit = 300

# Disabled experiment; preprocess_text1 is not defined in this notebook cell.
#preprocessed_text = preprocess_text1(input_text)
# Split the document into slices (similar slices are removed inside the call).
sliced_texts = slice_text_with_max_tokens_limit(input_text, max_tokens_limit)

# Display the updated sliced_texts
# (debug output, currently disabled)
#for i, sliced_text in enumerate(sliced_texts):
  #print("LENGTH sliced_texts", len(sliced_texts))
  #print(f"SCLICED TEXT {i + 1}: {sliced_text}")

# Prints the model output for the slices; nothing is returned.
generate_responses(sliced_texts)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Slice 1 is similar to the following slices: 2, 3
Slice 2 is similar to the following slices: 3
Model: Initial Input:

introduction:

artificial intelligence (ai) has emerged as a transformative force, reshaping various facets of our lives. this document explores the evolution, current state, and future prospects of ai, delving into its technological foundations, applications, ethical considerations, and potential societal impact.

i. historical perspective:

ai traces its roots back to ancient history, where myths and stories depicted automatons with human-like attributes. the formal beginnings of ai as a field of study, however, can be attributed to the mid-20th century. pioneering figures such as alan turing and john mccarthy laid the groundwork for the development of intelligent machines.

ii. technological foundations:

a. machine learning:

machine learning, a subset of ai, empowers systems to learn and improve from experience without explicit programming. this paradigm shift has 

In [None]:
# NOTE(review): run from the notebook's working directory, `git add -A` also
# tries to stage the mounted Drive (see the errors in this cell's output).
# Consider staging only the project files instead - confirm the intended path.
!git add -A
# The message must be wrapped in plain ASCII double quotes. Typographic
# quotes are not shell quoting characters, so the message was split into two
# arguments and git treated the second word as a pathspec.
!git commit -m "first commit"



[33mhint: You've added another git repository inside your current repository.[m
[33mhint: Clones of the outer repository will not contain the contents of[m
[33mhint: the embedded repository and will not know how to obtain it.[m
[33mhint: If you meant to add a submodule, use:[m
[33mhint: [m
[33mhint: 	git submodule add <url> drive/MyDrive/Colab Notebooks/amharic-qa[m
[33mhint: [m
[33mhint: If you added this path by mistake, you can remove it from the[m
[33mhint: index with:[m
[33mhint: [m
[33mhint: 	git rm --cached drive/MyDrive/Colab Notebooks/amharic-qa[m
[33mhint: [m
[33mhint: See "git help submodule" for more information.[m
error: open("drive/MyDrive/MLM/amh10.gdoc"): Operation not supported
error: unable to index file 'drive/MyDrive/MLM/amh10.gdoc'
fatal: adding files failed
error: pathspec 'commit”' did not match any file(s) known to git


In [None]:
# Plain repository URL: no angle-bracket placeholders (the shell parses < and >
# as redirections) and no credentials embedded in the URL. Authenticate when
# pushing, e.g. with a personal access token supplied via a credential helper.
!git remote add origin https://github.com/AbelAbeb/NLP-Assignment2.git