In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import time

In [2]:
# Build the English stop-word set and verify that WordNet is usable. NLTK
# raises LookupError when a required corpus has not been downloaded yet.
try:
    stop_words = set(stopwords.words("english"))
    # Probe WordNet in the same guard: the stop-word corpus may already be
    # installed while WordNet is missing, in which case the failure would
    # otherwise only surface later, at the first lemmatize() call.
    WordNetLemmatizer().lemmatize("probe")
except LookupError:
    print("NLTK 'stopwords' ve 'wordnet' paketleri indiriliyor...")
    nltk.download("stopwords", quiet=True)
    nltk.download("wordnet", quiet=True)
    nltk.download('omw-1.4', quiet=True)
    stop_words = set(stopwords.words("english"))

In [None]:
# Module-level WordNet lemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Return *text* normalized to a space-joined string of lemmatized tokens.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is removed, the result is split on whitespace, tokens found
    in the module-level ``stop_words`` set are dropped, and the remaining
    tokens are passed through the module-level ``lemmatizer``.

    Anything that is not a ``str`` produces an empty string.
    """
    if isinstance(text, str):
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        kept = []
        for token in letters_only.split():
            if token in stop_words:
                continue
            kept.append(lemmatizer.lemmatize(token))
        return " ".join(kept)
    return ""

def flatten_jsonl_data(file_path):
    """Flatten a JSON Lines file of games into one row per message.

    Every line of the file is read as one game record. The per-message fields
    of a record ('speakers', 'receivers', 'messages', 'sender_labels',
    'game_score', 'game_score_delta', 'years', 'seasons') are treated as
    parallel lists indexed by message position.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A pandas DataFrame with one row per message and the columns
        game_id, speaker, receiver, message_text, sender_intention,
        game_score, game_score_delta, year, season and original_fold.
        An input without any message yields an empty frame that still
        carries these columns.

    Raises:
        KeyError: If a game record lacks one of the required fields.
        IndexError: If a per-message list is shorter than 'messages'.
    """
    print(f"{os.path.basename(file_path)} dosyası işleniyor")

    df_raw = pd.read_json(file_path, lines=True)

    # Fallback fold name derived from the file name (e.g. "train.jsonl" ->
    # "train"); it does not depend on the row, so compute it once.
    default_fold = os.path.basename(file_path).split('.')[0]

    columns = [
        'game_id',
        'speaker',
        'receiver',
        'message_text',
        'sender_intention',
        'game_score',
        'game_score_delta',
        'year',
        'season',
        'original_fold',
    ]

    all_messages = []
    for _, row in df_raw.iterrows():
        # Keep the paper's original split when the record provides it.
        fold = row.get('acl2020_fold', default_fold)
        for i, message_text in enumerate(row['messages']):
            all_messages.append({
                'game_id': row['game_id'],
                'speaker': row['speakers'][i],
                'receiver': row['receivers'][i],
                'message_text': message_text,
                'sender_intention': row['sender_labels'][i],
                'game_score': row['game_score'][i],
                'game_score_delta': row['game_score_delta'][i],
                'year': row['years'][i],
                'season': row['seasons'][i],
                'original_fold': fold,
            })

    # Passing the column list keeps the schema stable for empty inputs.
    return pd.DataFrame(all_messages, columns=columns)
            