# In [None]:
import os
import pandas as pd
from collections import defaultdict
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Hugging Face Hub checkpoint that supplies both the FinBERT tokenizer and
# the sequence-classification weights.
_FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

# Both objects are created once, at import time, and are read as module-level
# globals by preprocess_chat_transcriptions.
tokenizer = BertTokenizer.from_pretrained(_FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    _FINBERT_CHECKPOINT,
    num_labels=3,
)

# Class-index -> label order as published on the yiyanghkust/finbert-tone
# model card (LABEL_0: neutral, LABEL_1: positive, LABEL_2: negative).
_FINBERT_TONE_LABELS = ('neutral', 'positive', 'negative')


def _parse_rate(filename):
    """Return the rate encoded in the last ``_``-separated part of *filename*.

    The ``.txt`` suffix is removed first. A trailing ``k``/``K`` multiplies
    the number by 1000 (``"chat_5k.txt"`` -> ``5000.0``); plain integers and
    decimals are returned as floats. Anything that is not a plain number
    yields ``0.0`` instead of raising.
    """
    token = filename.split('_')[-1].replace('.txt', '').strip().lower()

    multiplier = 1.0
    if token.endswith('k'):
        token = token[:-1]
        multiplier = 1000.0

    # Allow at most one decimal point and otherwise only decimal digits, so
    # words that merely contain a "k" (or strings such as "nan"/"inf") fall
    # through to the default instead of crashing or producing odd values.
    if token.replace('.', '', 1).isdecimal():
        return float(token) * multiplier
    return 0.0


def preprocess_chat_transcriptions(input_dir, output_file):
    """Classify every ``.txt`` transcript in *input_dir* and write a CSV.

    For each transcript one row is produced with three columns:

    * ``sentiment`` - ``'neutral'``, ``'positive'`` or ``'negative'``, taken
      from the arg-max of the FinBERT logits. Input is truncated to 512
      tokens, so only the beginning of long transcripts is classified.
    * ``tokens`` - the tokenizer's token list for the full text.
    * ``rates`` - the number parsed from the file name (see ``_parse_rate``).

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``.txt``
            files. Other files are ignored.
        output_file: Path of the CSV file to write (without the index).

    Relies on the module-level ``tokenizer`` and ``model`` objects.
    """
    sentiments = []
    token_lists = []
    rates = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the CSV reproducible between runs and machines.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.txt'):
            continue

        # Explicit encoding so the result does not depend on the platform
        # default; undecodable bytes are replaced rather than aborting the run.
        path = os.path.join(input_dir, filename)
        with open(path, 'r', encoding='utf-8', errors='replace') as file:
            text = file.read().strip()

        # Tokenize the text and run FinBERT without tracking gradients
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # Single input -> a single predicted class index
        label_index = outputs.logits.argmax(dim=1).item()
        sentiments.append(_FINBERT_TONE_LABELS[label_index])

        # Keep the full token list for downstream use
        token_lists.append(tokenizer.tokenize(text))

        rates.append(_parse_rate(filename))

    # Every row gets all three values, so there are no nulls to fill. Naming
    # the columns explicitly keeps the CSV header even when no transcript
    # was found.
    df = pd.DataFrame({
        'sentiment': sentiments,
        'tokens': token_lists,
        'rates': rates,
    })

    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False)

# Example usage: classify every transcript in the scripts folder and store
# the resulting table as a CSV in the datasets folder.
input_dir = "D:\\ABX\\Chat_Scripts"
output_file = "D:\\ABX\\Datasets\\processed_chat_data.csv"  # must name a file, not a directory
preprocess_chat_transcriptions(input_dir=input_dir, output_file=output_file)