In [4]:
!pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.9 MB/s eta 0:00:01
Installing collected packages: nltk
Successfully installed nltk-3.9.1
You should consider upgrading via the '/Users/alan/miniforge3/envs/machine_learning_pytorch/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
import os
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# Fetch the Punkt sentence-tokenizer data (both the 'punkt_tab' and 'punkt' packages)
for punkt_resource in ('punkt_tab', 'punkt'):
    nltk.download(punkt_resource)

# Defaults for the sharding functions defined in this cell
CHUNK_SIZE, OVERLAP = 50, 5
def read_file(file_path):
    """Return the entire content of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# Split text into overlapping sentence chunks
def shard_sentences(text, chunk_size=CHUNK_SIZE, overlap_prev=OVERLAP, overlap_next=OVERLAP):
    """Yield overlapping windows of sentences taken from ``text``.

    The text is split with ``sent_tokenize``. Window anchors advance by
    ``chunk_size - overlap_prev`` sentences; the window anchored at ``i`` spans
    sentences ``[i - overlap_prev, i + chunk_size + overlap_next)``, clipped to
    the bounds of the text, so a window holds at most
    ``chunk_size + overlap_prev + overlap_next`` sentences.

    Args:
        text: The raw text to split.
        chunk_size: Base number of sentences per window.
        overlap_prev: Sentences to reach back before each anchor; must be
            non-negative and smaller than ``chunk_size``.
        overlap_next: Extra sentences appended after each window; must be
            non-negative.

    Yields:
        Lists of sentence strings. Nothing is yielded for text without sentences.

    Raises:
        ValueError: If the parameters would give a non-positive step or a
            negative overlap. Because this is a generator, the check runs when
            iteration starts.
    """
    if overlap_prev < 0 or overlap_next < 0:
        raise ValueError("overlap_prev and overlap_next must be non-negative")
    step_size = chunk_size - overlap_prev  # Step size for sliding window
    if step_size <= 0:
        # range() rejects a zero step with an unrelated message and silently
        # produces nothing for a negative one, so fail with a clear error instead.
        raise ValueError("chunk_size must be greater than overlap_prev")

    sentences = sent_tokenize(text)  # Tokenize text into sentences
    total = len(sentences)
    for i in range(0, total, step_size):
        start = max(0, i - overlap_prev)  # Ensure we include the previous overlap
        end = min(total, i + chunk_size + overlap_next)
        yield sentences[start:end]
        if end == total:
            # This window already reaches the last sentence; any later window
            # would only repeat a suffix of it.
            break

def save_shard(shard, output_dir, file_name, shard_index):
    """Write one shard to ``<output_dir>/<file_name>-<shard_index>.txt``.

    The items of ``shard`` are joined with single spaces and written as UTF-8,
    replacing any existing file of the same name.
    """
    target_name = f"{file_name}-{shard_index}.txt"
    target_path = os.path.join(output_dir, target_name)
    with open(target_path, mode='w', encoding='utf-8') as sink:
        sink.write(' '.join(shard))

# Process each text file in the directory
def process_directory(input_dir, output_dir, chunk_size=CHUNK_SIZE, overlap_prev=OVERLAP, overlap_next=OVERLAP):
    """Shard every ``.txt`` file in ``input_dir`` into sentence chunks under ``output_dir``.

    Each input file ``<name>.txt`` produces ``<name>-0.txt``, ``<name>-1.txt``, ...
    in ``output_dir``, one file per chunk yielded by ``shard_sentences``.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.txt`` files.
        output_dir: Directory receiving the shard files; created if missing.
        chunk_size, overlap_prev, overlap_next: Forwarded to ``shard_sentences``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file system
    for file_name in tqdm(sorted(os.listdir(input_dir))):
        file_path = os.path.join(input_dir, file_name)
        # Only regular .txt files; a directory that happens to end in ".txt" cannot be read
        if not (file_name.endswith(".txt") and os.path.isfile(file_path)):
            continue
        file_base_name = os.path.splitext(file_name)[0]  # Remove file extension

        # Read the file content, shard it, and save each shard as it is produced
        text = read_file(file_path)
        for index, shard in enumerate(shard_sentences(text, chunk_size, overlap_prev, overlap_next)):
            save_shard(shard, output_dir, file_base_name, index)

    print("All files processed.")

# Source corpus: the directory containing your text files
input_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data'
# Destination for the output files; the folder name records the chunk size and overlap in use
output_directory = f'/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_sentence_{CHUNK_SIZE}_{OVERLAP}'

os.makedirs(name=output_directory, exist_ok=True)
process_directory(input_dir=input_directory, output_dir=output_directory)

[nltk_data] Downloading package punkt_tab to /Users/alan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 151/151 [00:00<00:00, 348.94it/s]

All files processed.





Word sharding

In [3]:
import os
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm

CHUNK_SIZE = 3000  # Number of words per shard
OVERLAP = int(CHUNK_SIZE * 0.05)  # Default overlap in words, used for both the previous-side and next-side overlap
# Make sure the Punkt data needed by word_tokenize is downloaded.
# NLTK 3.9+ loads it from the 'punkt_tab' package, older releases from 'punkt'.
nltk.download('punkt_tab')
nltk.download('punkt')

def read_file(file_path):
    """Read ``file_path`` as UTF-8 and return its full text."""
    with open(file_path, mode='r', encoding='utf-8') as text_file:
        whole_text = text_file.read()
    return whole_text

# Split text into overlapping word chunks
def shard_words(text, chunk_size=CHUNK_SIZE, overlap_prev=OVERLAP, overlap_next=OVERLAP):
    """Yield overlapping windows of word tokens taken from ``text``.

    The text is split with ``word_tokenize``. Window anchors advance by
    ``chunk_size - overlap_prev`` tokens; the window anchored at ``i`` spans
    tokens ``[i - overlap_prev, i + chunk_size + overlap_next)``, clipped to
    the bounds of the text, so a window holds at most
    ``chunk_size + overlap_prev + overlap_next`` tokens.

    Args:
        text: The raw text to split.
        chunk_size: Base number of tokens per window.
        overlap_prev: Tokens to reach back before each anchor; must be
            non-negative and smaller than ``chunk_size``.
        overlap_next: Extra tokens appended after each window; must be
            non-negative.

    Yields:
        Lists of token strings. Nothing is yielded for text without tokens.

    Raises:
        ValueError: If the parameters would give a non-positive step or a
            negative overlap. Because this is a generator, the check runs when
            iteration starts.
    """
    # NOTE(review): word_tokenize presumably emits punctuation as separate
    # tokens, so chunk_size would count those as words too — confirm.
    if overlap_prev < 0 or overlap_next < 0:
        raise ValueError("overlap_prev and overlap_next must be non-negative")
    step_size = chunk_size - overlap_prev  # Step size for sliding window
    if step_size <= 0:
        # range() rejects a zero step with an unrelated message and silently
        # produces nothing for a negative one, so fail with a clear error instead.
        raise ValueError("chunk_size must be greater than overlap_prev")

    words = word_tokenize(text)  # Tokenize text into words
    total = len(words)
    for i in range(0, total, step_size):
        start = max(0, i - overlap_prev)  # Include previous overlap
        end = min(total, i + chunk_size + overlap_next)  # Include next overlap
        yield words[start:end]
        if end == total:
            # This window already reaches the last token; any later window
            # would only repeat a suffix of it.
            break

def save_shard(shard, output_dir, file_name, shard_index):
    """Write one shard to ``<output_dir>/<file_name>-<shard_index>.txt``.

    The tokens in ``shard`` are joined with single spaces into one string and
    written as UTF-8, replacing any existing file of the same name.
    """
    shard_path = os.path.join(output_dir, f"{file_name}-{shard_index}.txt")
    with open(shard_path, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(' '.join(shard))

# Process each text file in the directory
def process_directory(input_dir, output_dir, chunk_size=CHUNK_SIZE, overlap_prev=OVERLAP, overlap_next=OVERLAP):
    """Shard every ``.txt`` file in ``input_dir`` into word chunks under ``output_dir``.

    Each input file ``<name>.txt`` produces ``<name>-0.txt``, ``<name>-1.txt``, ...
    in ``output_dir``, one file per chunk yielded by ``shard_words``.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.txt`` files.
        output_dir: Directory receiving the shard files; created if missing.
        chunk_size, overlap_prev, overlap_next: Forwarded to ``shard_words``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file system
    for file_name in tqdm(sorted(os.listdir(input_dir))):
        file_path = os.path.join(input_dir, file_name)
        # Only regular .txt files; a directory that happens to end in ".txt" cannot be read
        if not (file_name.endswith(".txt") and os.path.isfile(file_path)):
            continue
        file_base_name = os.path.splitext(file_name)[0]  # Remove file extension

        # Read the file content, shard it, and save each shard as it is produced
        text = read_file(file_path)
        for index, shard in enumerate(shard_words(text, chunk_size, overlap_prev, overlap_next)):
            save_shard(shard, output_dir, file_base_name, index)

    print("All files processed.")

# Source corpus: the directory containing your text files
input_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data'
# Destination for the word shards; the folder name records the chunk size and overlap in use
output_directory = f'/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_word_{CHUNK_SIZE}_{OVERLAP}'

# Make sure the destination exists, then shard every file
os.makedirs(name=output_directory, exist_ok=True)
process_directory(input_dir=input_directory, output_dir=output_directory)

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 151/151 [00:01<00:00, 94.41it/s]

All files processed.





In [7]:
import os
import nltk

# Ensure the Punkt data needed by nltk.word_tokenize is downloaded.
# NLTK 3.9+ loads it from the 'punkt_tab' package, older releases from 'punkt'.
nltk.download('punkt_tab')
nltk.download('punkt')

def count_words(text):
    """Return the number of tokens ``nltk.word_tokenize`` produces for ``text``."""
    return len(nltk.word_tokenize(text))

def read_file(file_path):
    """Open ``file_path`` as UTF-8 text and return everything in it."""
    with open(file_path, mode='r', encoding='utf-8') as reader:
        body = reader.read()
    return body

def compute_word_counts(directory):
    """Return ``(average, maximum)`` word counts over the ``.txt`` files in ``directory``.

    Every entry of ``directory`` whose name ends in ``.txt`` is read with
    ``read_file`` and measured with ``count_words``. When no such file exists
    the result is ``(0, 0)``.
    """
    per_file_counts = [
        count_words(read_file(os.path.join(directory, entry)))
        for entry in os.listdir(directory)
        if entry.endswith(".txt")
    ]

    # No text files found: report zeros rather than dividing by zero
    if not per_file_counts:
        return 0, 0

    return sum(per_file_counts) / len(per_file_counts), max(per_file_counts)

# Directory of text files to measure
directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data_test'
avg_word_count, max_word_count = compute_word_counts(directory)

# Report both statistics, one per line
print(
    f"Average Word Count: {avg_word_count}",
    f"Maximum Word Count: {max_word_count}",
    sep="\n",
)

[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Average Word Count: 1123.3785310734463
Maximum Word Count: 8155


In [2]:
import pandas as pd
# Load the events CSV into a DataFrame
events_csv_path = '/Users/alan/11711/nlp-from-scratch-assignment/data/raw/raw_csv_data/events_after_10_27.csv'
index = pd.read_csv(events_csv_path)

In [3]:
# Preview the first five rows of the events index
index.head(n=5)

Unnamed: 0.1,Unnamed: 0,data_source_index,file_name_index,crawled_dir_index,status,Name,Tags,Source URL,Location,Select,Person
0,0,10,0,4,good,Pittsburgh Special Events,Events After 10.27,https://pittsburghpa.gov/events/index.html,Pittsburgh,Webpage,Chenglin Zhang
1,1,40,1,31,no good,Carnegie Mellon Opera Upcoming events,"Events After 10.27, Music",https://www.cmu.edu/cfa/music/concerts-events/...,CMU,Webpage,Qingyang Liu
2,2,41,2,32,no good,CMU School of Music upcoming events,"Events After 10.27, Music",https://www.cmu.edu/cfa/music/concerts-events/...,CMU,Webpage,Qingyang Liu
3,3,42,3,33,no good,CMU all upcoming events,Events After 10.27,https://events.cmu.edu/all,CMU,Webpage,Qingyang Liu
4,4,43,4,34,good,Pittsburgh Musical season,"Events After 10.27, Music",https://pittsburghmusicals.com/season/,Pittsburgh,Webpage,Qingyang Liu


In [2]:
import os
import pandas as pd

# Load the CSV that maps each file's current name to its target name
df = pd.read_csv('/Users/alan/11711/nlp-from-scratch-assignment/data/raw/raw_csv_data/events_after_10_27.csv')

# Directory holding the files under their current names, and the directory that receives them under their new names
old_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/events_test'
new_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/events_test_copy'

# os.rename does not create the destination directory, so make sure it exists
os.makedirs(new_directory, exist_ok=True)

missing_files = []

# The row label is named `row_label` (not `index`) so this loop does not
# overwrite the `index` DataFrame loaded in an earlier cell.
for row_label, row in df.iterrows():
    old_file_name = f"{row['file_name_index']}.txt"  # Current name, from the 'file_name_index' column
    new_file_name = f"{row['crawled_dir_index']}.txt"  # New name, from the 'crawled_dir_index' column

    old_file_path = os.path.join(old_directory, old_file_name)
    new_file_path = os.path.join(new_directory, new_file_name)

    # A source file may be absent (for example, already moved by an earlier run);
    # record it and carry on instead of stopping half-way through the batch.
    if not os.path.exists(old_file_path):
        missing_files.append(old_file_name)
        continue

    os.rename(old_file_path, new_file_path)

if missing_files:
    print(f"Skipped {len(missing_files)} missing source file(s): {missing_files}")
print("Renaming completed.")

Renaming completed.


In [8]:
!pip install -qU langchain-openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/Users/alan/miniforge3/envs/machine_learning_pytorch/bin/python -m pip install --upgrade pip' command.[0m


[nltk_data] Downloading package punkt to /Users/alan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index out of range in self

In [4]:
import shutil
import os

# Define the source and destination directories
source_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/events_test_copy'
destination_directory = '/Users/alan/11711/nlp-from-scratch-assignment/data/crawled/crawled_text_data'

# Create the destination directory if needed; exist_ok avoids a separate check-then-create step
os.makedirs(destination_directory, exist_ok=True)

# Iterate over all entries in the source directory
for file_name in os.listdir(source_directory):
    source_file_path = os.path.join(source_directory, file_name)

    # Copy regular .txt files only; shutil.copy cannot copy a directory,
    # even one whose name ends in ".txt"
    if not (file_name.endswith(".txt") and os.path.isfile(source_file_path)):
        continue

    # A file with the same name already in the destination is overwritten
    destination_file_path = os.path.join(destination_directory, file_name)
    shutil.copy(source_file_path, destination_file_path)

print("All text files have been copied successfully!")

All text files have been copied successfully!
