In [1]:
import pandas as pd
from collections import Counter
import re
import csv
from transformers import AutoTokenizer
import os

# Ensure transformers package is installed
try:
    from transformers import AutoTokenizer
except ImportError as e:
    print("Transformers package is not installed. Please run: pip install transformers")
    raise e

# Define the path to the CSV files and their respective text columns
# Define the path to the CSV files and their respective text columns
csv_files = {
    "/Users/nishchaltamang/Desktop/CSV1.csv": "SHORT-TEXT",  
    "/Users/nishchaltamang/Desktop/CSV2.csv": "entites",  
    "/Users/nishchaltamang/Desktop/CSV3.csv": "TEXT",  
    "/Users/nishchaltamang/Desktop/CSV4.csv": "TEXT", 
}

# Set output directory
output_dir = "/Users/nishchaltamang/Desktop/"
os.makedirs(output_dir, exist_ok=True)  # Create directory if it doesn't exist

# Function to consolidate text from each CSV file separately
def consolidate_text(csv_file, text_column):
    try:
        df = pd.read_csv(csv_file)
        if text_column in df.columns:
            return "\n".join(df[text_column].dropna())  # Drop NaN values and join the text
        else:
            print(f"Column '{text_column}' not found in {csv_file}. Available columns: {df.columns}")
            return ""
    except Exception as e:
        print(f"Error reading {csv_file}: {e}")
        return ""

# Consolidate text from each file and store separately
consolidated_texts = {}
for csv_file, text_column in csv_files.items():
    consolidated_texts[csv_file] = consolidate_text(csv_file, text_column)

# Merge all the separate texts into a single string
final_consolidated_text = "\n".join(consolidated_texts.values())

# Task 1: Save the merged consolidated text to a .txt file
output_text_file = os.path.join(output_dir, 'consolidated_text.txt')
try:
    with open(output_text_file, 'w') as f:
        f.write(final_consolidated_text)
    print(f"Text data from all CSV files has been successfully merged into '{output_text_file}'.")
except OSError as e:
    print(f"Error writing to file: {e}")

# Task 2: Load the consolidated text from the .txt file
try:
    with open(output_text_file, 'r') as f:
        text = f.read()
except OSError as e:
    print(f"Error reading the file: {e}")
    text = ""  # Fallback in case of an error

# Function to count word occurrences
def count_words(text):
    words = re.findall(r'\w+', text.lower())  # Extract words and convert to lowercase
    word_counts = Counter(words)
    return word_counts

# Count the words in the text
word_counts = count_words(text)

# Get the top 30 most common words
top_30_words = word_counts.most_common(30)

# Task 3: Save the top 30 words to a CSV file
output_top_words_file = os.path.join(output_dir, 'top_30_words.csv')
try:
    with open(output_top_words_file, 'w', newline='') as csvfile:
        fieldnames = ['word', 'count']
        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)
        writer.writerows(top_30_words)
    print(f"Top 30 words have been saved to '{output_top_words_file}'.")
except OSError as e:
    print(f"Error writing the file: {e}")


# Function to tokenize text in chunks (This function splits the text into smaller chunks, each with a length of up to 512 tokens, which is the limit for BERT.)

def tokenize_in_chunks(text, chunk_size=512):
    tokens = []
    # Split the text into chunks of the maximum length allowed by the model
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i + chunk_size]
        tokens.extend(tokenizer.tokenize(chunk))  # Tokenize each chunk and extend the token list
    return tokens

# Part 4: Tokenization and Top 30 Tokens for each CSV file
# Load the tokenizer for BERT
# Function to tokenize text in chunks (This function splits the text into smaller chunks, each with a length of up to 512 tokens, which is the limit for BERT.)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Function to count token occurrences from individual texts
def count_tokens(text, chunk_size=512):
    tokens = tokenize_in_chunks(text, chunk_size)
    token_counts = Counter(tokens)
    return token_counts

# Iterate over each CSV file and tokenize its respective text column
for csv_file, text_column in csv_files.items():
    csv_text = consolidated_texts[csv_file]  # Get text from the specific CSV file's column
    
    if csv_text:
        # Tokenize the text
        token_counts = count_tokens(csv_text)
        
        # Get the top 30 most common tokens for the current CSV file
        top_30_tokens = token_counts.most_common(30)
        
        # Save the top 30 tokens to a separate CSV file for each input CSV
        token_output_file = os.path.join(output_dir, f'top_30_tokens_{os.path.basename(csv_file)}.csv')
        try:
            with open(token_output_file, 'w', newline='') as csvfile:
                fieldnames = ['token', 'count']
                writer = csv.writer(csvfile)
                writer.writerow(fieldnames)
                writer.writerows(top_30_tokens)
            print(f"Top 30 tokens for {csv_file} have been saved to '{token_output_file}'.")
        except OSError as e:
            print(f"Error writing the file for {csv_file}: {e}")

print("Tokenization for all CSV files is complete.")


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Text data from all CSV files has been successfully merged into '/Users/nishchaltamang/Desktop/consolidated_text.txt'.
Top 30 words have been saved to '/Users/nishchaltamang/Desktop/top_30_words.csv'.




Top 30 tokens for /Users/nishchaltamang/Desktop/CSV1.csv have been saved to '/Users/nishchaltamang/Desktop/top_30_tokens_CSV1.csv.csv'.
Top 30 tokens for /Users/nishchaltamang/Desktop/CSV2.csv have been saved to '/Users/nishchaltamang/Desktop/top_30_tokens_CSV2.csv.csv'.
Top 30 tokens for /Users/nishchaltamang/Desktop/CSV3.csv have been saved to '/Users/nishchaltamang/Desktop/top_30_tokens_CSV3.csv.csv'.
Top 30 tokens for /Users/nishchaltamang/Desktop/CSV4.csv have been saved to '/Users/nishchaltamang/Desktop/top_30_tokens_CSV4.csv.csv'.
Tokenization for all CSV files is complete.
