In [4]:
!pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.9 MB/s eta 0:00:01
Installing collected packages: nltk
Successfully installed nltk-3.9.1
You should consider upgrading via the '/Users/alan/miniforge3/envs/machine_learning_pytorch/bin/python -m pip install --upgrade pip' command.[0m


In [6]:
import os
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# Make sure NLTK's sentence tokenizer is downloaded
nltk.download('punkt_tab')
nltk.download('punkt')

# Read the content of a file
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Split text into overlapping sentence chunks
def shard_sentences(text, chunk_size=50, overlap_prev=5, overlap_next=5):
    sentences = sent_tokenize(text)  # Tokenize text into sentences
    step_size = chunk_size - overlap_prev  # Step size for sliding window
    for i in range(0, len(sentences), step_size):
        start = max(0, i - overlap_prev)  # Ensure we include the previous overlap
        end = min(len(sentences), i + chunk_size + overlap_next)
        yield sentences[start:end]

# Save each shard into a separate file
def save_shard(shard, output_dir, file_name, shard_index):
    output_file = os.path.join(output_dir, f"{file_name}-{shard_index}.txt")
    
    # Save the shard into the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(' '.join(shard))

# Process each text file in the directory
def process_directory(input_dir, output_dir, chunk_size=50, overlap_prev=5, overlap_next=5):
    # Ensure output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Iterate over all text files in the input directory
    for file_name in tqdm(os.listdir(input_dir)):
        if file_name.endswith(".txt"):  # Only process .txt files
            file_path = os.path.join(input_dir, file_name)
            file_base_name = os.path.splitext(file_name)[0]  # Remove file extension

            # Read the file content and shard it
            text = read_file(file_path)
            shards = list(shard_sentences(text, chunk_size, overlap_prev, overlap_next))
            
            # Save each shard
            for index, shard in enumerate(shards):
                save_shard(shard, output_dir, file_base_name, index)

    print("All files processed.")

input_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data'  # The directory containing your text files
output_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_test'  # The directory to save the output files

process_directory(input_directory, output_directory)

[nltk_data] Downloading package punkt_tab to /Users/alan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 151/151 [00:00<00:00, 802.84it/s]

All files processed.





In [7]:
import os
import nltk

# Ensure NLTK word tokenizer is downloaded
nltk.download('punkt')

# Function to count the number of words in a text
def count_words(text):
    words = nltk.word_tokenize(text)
    return len(words)

# Function to read the content of a file
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Function to process all text files in the directory and compute word counts
def compute_word_counts(directory):
    word_counts = []
    
    # Iterate over all files in the directory
    for file_name in os.listdir(directory):
        if file_name.endswith(".txt"):  # Only process .txt files
            file_path = os.path.join(directory, file_name)
            text = read_file(file_path)
            word_count = count_words(text)
            word_counts.append(word_count)
    
    # Calculate average and maximum word count
    if word_counts:
        avg_word_count = sum(word_counts) / len(word_counts)
        max_word_count = max(word_counts)
        return avg_word_count, max_word_count
    else:
        return 0, 0  # If no text files are found, return zero

# Example: Process all text files in the directory
directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_test'  # Specify the path to your text file directory
avg_word_count, max_word_count = compute_word_counts(directory)

print(f"Average Word Count: {avg_word_count}")
print(f"Maximum Word Count: {max_word_count}")

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Average Word Count: 1123.3785310734463
Maximum Word Count: 8155


In [2]:
import os
from transformers import pipeline
from tqdm import tqdm

# Load the pre-trained summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Read the content of a file
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Summarize a text chunk
def summarize_text(text, max_length=50):
    return summarizer(text, max_length=max_length, min_length=25, do_sample=False)[0]['summary_text']

# Process each shard file in the directory and create summaries
def process_directory(input_dir, output_dir, summary_output_dir, max_summary_length=50):
    # Ensure the output directories exist
    if not os.path.exists(summary_output_dir):
        os.makedirs(summary_output_dir)

    # Iterate over all text files (shards) in the input directory
    for file_name in tqdm(os.listdir(input_dir)):
        if file_name.endswith(".txt"):  # Only process .txt files
            file_path = os.path.join(input_dir, file_name)
            text = read_file(file_path)
            
            # Generate the summary for the current shard
            summary = summarize_text(text, max_length=max_summary_length)
            
            # Save the summary to a new file
            summary_file_path = os.path.join(summary_output_dir, f"{file_name}_summary.txt")
            with open(summary_file_path, 'w', encoding='utf-8') as summary_file:
                summary_file.write(summary)

            print(f"Summary for {file_name} saved to {summary_file_path}")

# Example: Process all shards in the input directory and generate summaries
input_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_test'  # Directory with text shards
summary_output_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/output_summary'  # Directory to save summaries

process_directory(input_directory, summary_output_directory, summary_output_directory)

  0%|          | 1/265 [00:05<24:33,  5.58s/it]

Summary for 12-7.txt saved to /Users/alan/11711/nlp-from-scratch-assignment/data/crawled/output_summary/12-7.txt_summary.txt


  1%|          | 2/265 [00:13<31:27,  7.18s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (2713 > 1024). Running this sequence through the model will result in indexing errors
  1%|          | 2/265 [00:13<30:30,  6.96s/it]

Summary for 0-15.txt saved to /Users/alan/11711/nlp-from-scratch-assignment/data/crawled/output_summary/0-15.txt_summary.txt





IndexError: index out of range in self

In [14]:
!huggingface-cli login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/Users/alan/miniforge3/envs/machine_learning_pytorch/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/Users/alan/miniforge3/envs/machine_learning_pytorch/lib/python3.8/site-packages/huggingface_hub/commands/hug

In [8]:
!pip install -qU langchain-openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Users/alan/miniforge3/envs/machine_learning_pytorch/bin/python -m pip install --upgrade pip' command.[0m


In [18]:
import os
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# Ensure NLTK's sentence tokenizer is downloaded
nltk.download('punkt')

# Step 1: Read all `.txt` files from a directory
def load_text_files(directory):
    docs = []
    for file_name in os.listdir(directory):
        if file_name.endswith(".txt"):
            file_path = os.path.join(directory, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                docs.append(file.read())
    return docs

# Step 2: Chunk the documents with overlap
def chunk_text(text, chunk_size=50, overlap_size=5):
    sentences = nltk.sent_tokenize(text)  # Tokenize text into sentences
    chunks = []
    
    # Split sentences into chunks with overlap
    for i in range(0, len(sentences), chunk_size - overlap_size):
        chunk = sentences[i:i + chunk_size]
        chunks.append(' '.join(chunk))  # Join sentences back into a chunk of text
        if i + chunk_size >= len(sentences):
            break  # Avoid index overflow
    return chunks

# Step 3: Load documents from a directory and chunk them with overlap
def process_directory(directory, chunk_size=50, overlap_size=5):
    all_chunks = []
    docs = load_text_files(directory)
    for doc in docs:
        all_chunks.extend(chunk_text(doc, chunk_size, overlap_size))  # Chunk each document with overlap
    return all_chunks

# Step 4: Embed the chunks and build the FAISS index
def build_faiss_index(chunks):
    embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    chunk_embeddings = embedder.encode(chunks)
    dimension = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(chunk_embeddings))
    return index, embedder

# Step 5: Retrieve top-k relevant chunks based on query
def retrieve_top_k_chunks(query, index, chunks, embedder, k=5):
    query_embedding = embedder.encode([query])
    distances, indices = index.search(np.array(query_embedding), k)
    return [(chunks[i], distances[0][j]) for j, i in enumerate(indices[0])]

# Step 6: Generate answer based on the top-k chunks
def generate_answer(query, top_chunks, model):
    context = "\n".join([chunk for chunk, _ in top_chunks])
    prompt = f"Question: {query}\n\nContext:\n{context}\n\nAnswer:"
    return model(prompt, max_length=10000, num_return_sequences=1)[0]['generated_text']

# Step 7: Combine everything in a RAG pipeline
def rag_pipeline(query, directory, k=5, chunk_size=50, overlap_size=5):
    # Process the directory and chunk the documents with overlap
    chunks = process_directory(directory, chunk_size, overlap_size)
    
    # Build the FAISS index
    index, embedder = build_faiss_index(chunks)
    
    # Retrieve top-k relevant chunks
    top_k_chunks = retrieve_top_k_chunks(query, index, chunks, embedder, k)
    
    # Load a pre-trained text generation model

    model = pipeline("text-generation", model="openai-community/gpt2")
    
    # Generate the answer based on the top-k chunks
    answer = generate_answer(query, top_k_chunks, model)
    
    return answer

# Example usage
if __name__ == "__main__":
    # Specify the directory containing .txt files
    directory = "/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_test"
    
    # Define the query
    query = "How many super bowls did the Steelers win?"
    
    # Run the RAG pipeline with overlapping chunks
    result = rag_pipeline(query, directory, k=5, chunk_size=50, overlap_size=5)
    
    # Print the result
    print(result)

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index out of range in self