### Import Libraries
This section imports all the necessary libraries for data processing, text cleaning, and emotion classification.

In [None]:
import pandas as pd
import json
import re
import torch
from transformers import pipeline, AutoTokenizer
from nltk.tokenize import sent_tokenize
import langdetect

### Load and Preprocess Data
This section loads the JSON file containing the chat data, extracts messages of type 'message', and creates a DataFrame.

In [None]:
# Read the exported chat log (UTF-8 JSON) and parse it into Python objects.
with open('/data/chat.json', 'r', encoding='utf-8') as file:
    chat_data = json.loads(file.read())

# Keep only the entries whose 'type' field is exactly 'message'.
messages = list(
    filter(lambda msg: msg['type'] == 'message', chat_data['messages'])
)

def extract_text(msg):
    """Return the text content of a single message entry.

    Args:
        msg: A mapping describing one message. Its optional 'text' value is
            either a plain value or a list mixing strings and dicts that
            carry their own 'text' key.

    Returns:
        '' when the message has no 'text' key; the concatenation of all
        parts when 'text' is a list; otherwise the 'text' value unchanged.
    """
    if 'text' not in msg:
        return ''

    content = msg['text']
    if not isinstance(content, list):
        return content

    # List form: dict parts contribute their own 'text', other parts are
    # used as they are.
    pieces = []
    for part in content:
        pieces.append(part['text'] if isinstance(part, dict) else part)
    return ''.join(pieces)

# Flatten every message to a single string and collect them in one column.
texts = list(map(extract_text, messages))

df = pd.DataFrame({'text': texts})
print(f"Total messages: {len(df)}")

# Discard rows whose text is empty or consists only of whitespace.
df = df[df['text'].str.strip().ne('')]
print(f"After removing empty texts: {len(df)}")

### Text Cleaning
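This section splits each message into sentences and cleans them by removing URLs, special characters, and numbers, and by converting the text to lowercase. Sentences with three or fewer words, or that are not detected as English, are dropped, and messages left with no text are removed.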

In [None]:
def clean_text(text):
    """Normalise a piece of text: strip URLs, digits and punctuation.

    Steps, in order: remove URLs, remove digits, collapse every run of
    non-word characters (anything other than letters, digits and
    underscore) into one space, lower-case, and trim the ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string with its words separated by exactly
        one space. May be empty.
    """
    # `http\S+` already matches https links, so no separate alternative is
    # needed for them.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Digits are removed before separators are collapsed: doing it afterwards
    # would leave two adjacent spaces wherever a standalone number stood
    # ("a 1 b" -> "a  b").
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W+', ' ', text)
    return text.lower().strip()

def process_sentences(text):
    """Keep only the cleaned English sentences of a message.

    The text is split into sentences, each sentence is passed through
    ``clean_text``, and a sentence is kept only if it has more than three
    words and ``langdetect`` identifies it as English.

    Args:
        text: The raw message text.

    Returns:
        The kept sentences joined by single spaces, or '' if none qualify.
    """
    # NOTE(review): langdetect's README describes detection as
    # non-deterministic unless DetectorFactory.seed is set; confirm whether
    # reproducible output is needed here.
    kept_sentences = []
    for sent in sent_tokenize(text):
        sent = clean_text(sent)
        # Sentences of three words or fewer are skipped.
        if len(sent.split()) <= 3:
            continue
        try:
            language = langdetect.detect(sent)
        except langdetect.LangDetectException:
            # Detection failed for this sentence; skip it. Only langdetect's
            # own error is caught so that interrupts and real bugs still
            # surface.
            continue
        if language == 'en':
            kept_sentences.append(sent)
    return ' '.join(kept_sentences)

# Replace each message with its cleaned English sentences only; a message
# with no qualifying sentence becomes an empty string.
df['text'] = df['text'].apply(process_sentences)

# Count the rows that still carry text. len(df) would be wrong at this point:
# apply() never drops rows, so it would only repeat the previous total.
has_text = df['text'].str.strip() != ''
print(f"After removing non-English and short texts: {int(has_text.sum())}")

# Drop the rows that were emptied by the cleaning step.
df = df[has_text]
print(f"After final filtering: {len(df)}")

### Load Model and Tokenizer
This section loads the pre-trained emotion classification model and tokenizer. It also checks if a GPU is available for faster processing.

In [None]:
# Device index handed to the pipeline: 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

# Emotion classifier built from the pre-trained checkpoint, with truncation
# and padding switched on.
emotion_classifier = pipeline(
    task='text-classification',
    model='j-hartmann/emotion-english-distilroberta-base',
    device=device,
    truncation=True,
    padding=True,
)
# Tokenizer for the same checkpoint, loaded separately from the pipeline.
tokenizer = AutoTokenizer.from_pretrained('j-hartmann/emotion-english-distilroberta-base')

### Truncate Text
This section truncates text that exceeds the model's maximum input length (512 tokens).

In [None]:
def truncate_text(text, max_length=512):
    """Cut a text down to at most ``max_length`` tokens.

    The text is encoded with the module-level ``tokenizer`` with truncation
    enabled, then decoded back to a string without special tokens.

    Args:
        text: The string to shorten.
        max_length: Maximum number of tokens passed to the tokenizer.

    Returns:
        The decoded string of the (possibly truncated) token sequence.
    """
    encoding = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    # A single input yields a batch of one; take its only row of token ids.
    token_ids = encoding["input_ids"][0]
    return tokenizer.decode(token_ids, skip_special_tokens=True)

# Shorten every message so that it fits the token limit of truncate_text.
df['text'] = df['text'].map(truncate_text)

### Emotion Prediction
This section processes the text in batches and predicts emotions using the pre-trained model.

In [None]:
def predict_emotions(texts, batch_size=32):
    """Predict one emotion label per text, working through the list in batches.

    Args:
        texts: A list of strings to classify.
        batch_size: Number of texts sent to the classifier per call.

    Returns:
        A list of labels, one per input text and in the same order. Every
        text of a batch that raised ``RuntimeError`` is labelled "UNKNOWN".
    """
    labels = []
    for offset in range(0, len(texts), batch_size):
        chunk = texts[offset:offset + batch_size]
        try:
            outputs = emotion_classifier(chunk)
            labels += [item['label'] for item in outputs]
        except RuntimeError as e:
            # Keep the output aligned with the input by filling the failed
            # batch with placeholders.
            print(f"Skipping batch due to error: {e}")
            labels += ["UNKNOWN"] * len(chunk)
    return labels

# Classify every remaining message and store its label in a new column.
df['emotion'] = predict_emotions(df['text'].to_list(), batch_size=32)

### Save Results
This section saves the final DataFrame, containing the cleaned text and the predicted emotions, to a CSV file.

In [None]:
# Write the results to disk without the DataFrame index column.
df.to_csv(path_or_buf='chat_emotions.csv', index=False)
print("Emotion analysis completed and saved.")