# ===========================================
# 2_data_preprocessing.ipynb
# ===========================================

# Zelle 1: Bibliotheken & Setup
import pandas as pd
import os
import re
import json
from sklearn.model_selection import train_test_split

# Falls du NLTK/Spacy/Hugging Face Tokenizer nutzen möchtest
import nltk
# Tokenizer models for nltk.word_tokenize. Older NLTK releases load the
# pickled "punkt" models, whereas recent releases (3.8.2 / 3.9 and later)
# look up the pickle-free "punkt_tab" package instead. Requesting both keeps
# the tokenization cell working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Input and output directories, given relative to the notebook's location.
RAW_DATA_DIR = "../data/raw"
PROCESSED_DATA_DIR = "../data/processed"

# Example input file: "chatlogs_raw.csv"
# (columns user_message, bot_response, optionally conversation_id)
raw_file_path = os.path.join(RAW_DATA_DIR, "chatlogs_raw.csv")

# ===========================================
# Zelle 2: Rohdaten laden
# ===========================================
# Read the raw chat-log CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=raw_file_path)

# Report how many rows were loaded and preview the first five of them.
print(f"Anzahl Rohdialoge: {len(df)}")
display(df.head(n=5))

# ===========================================
# Zelle 3: Datensäuberung
# ===========================================
def clean_text(text):
    """Normalize one raw chat message before tokenization.

    Missing values (``None`` or a float NaN, as pandas produces by default
    for empty CSV cells) are mapped to the empty string rather than being
    stringified to ``"none"`` / ``"nan"``; that way a downstream length
    filter can drop them instead of treating them as real text.

    Args:
        text: The raw message. Non-string values other than missing ones are
            converted with ``str()``.

    Returns:
        The message with ``<...>`` tags removed, every run of whitespace
        collapsed to a single space, surrounding whitespace stripped, and
        all characters lowercased.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # Remove HTML tags (non-greedy, so each tag is matched on its own).
    text = re.sub(r"<.*?>", "", text)
    # Collapse runs of whitespace (spaces, tabs, newlines) into one space.
    text = re.sub(r"\s+", " ", text)
    # Trim and lowercase.
    return text.strip().lower()

# Normalize both sides of every dialogue pair.
df["cleaned_user"] = df["user_message"].apply(clean_text)
df["cleaned_bot"] = df["bot_response"].apply(clean_text)

# Keep only pairs in which both sides are longer than one character. The two
# conditions are combined into one mask and the result is copied explicitly,
# so that later column assignments work on an independent DataFrame instead
# of a filtered selection (which makes pandas emit SettingWithCopyWarning).
df = df[
    (df["cleaned_user"].str.len() > 1) & (df["cleaned_bot"].str.len() > 1)
].copy()

print("\nBeispiel bereinigter Datensätze:")
display(df[["cleaned_user", "cleaned_bot"]].head())

# ===========================================
# Zelle 4: Tokenisierung
# ===========================================
def tokenize_text(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    return nltk.word_tokenize(text)

# Tokenize the cleaned user and bot texts element by element.
df["user_tokens"] = df["cleaned_user"].map(tokenize_text)
df["bot_tokens"] = df["cleaned_bot"].map(tokenize_text)

# Preview the token lists of the first five rows.
print("\nBeispiel für tokenisierte Sätze:")
display(df[["user_tokens", "bot_tokens"]].head(n=5))

# ===========================================
# Zelle 5: Vokabular aufbauen
# ===========================================
# Wir fassen alle Tokens aus User und Bot zusammen
from collections import Counter

# Flatten all tokens, row by row: first the user side, then the bot side.
all_tokens = [
    tok
    for user_toks, bot_toks in zip(df["user_tokens"], df["bot_tokens"])
    for tok in (*user_toks, *bot_toks)
]

token_freqs = Counter(all_tokens)
print(f"Anzahl eindeutiger Tokens: {len(token_freqs)}")

# Example: restrict the vocabulary to the 10,000 most frequent tokens.
VOCAB_SIZE = 10000
most_common_tokens = token_freqs.most_common(VOCAB_SIZE)

# Special tokens take the lowest ids, in the order listed here.
special_tokens = ["<PAD>", "<UNK>", "<BOS_USER>", "<EOS_USER>", "<BOS_BOT>", "<EOS_BOT>"]
word2id = {special: position for position, special in enumerate(special_tokens)}

# Regular tokens follow in descending frequency. A token that is already
# present keeps its existing id; otherwise it receives the next free id,
# which equals the current size of the mapping.
for token, _count in most_common_tokens:
    word2id.setdefault(token, len(word2id))

print(f"\nVokabulargröße (inkl. Sondertokens): {len(word2id)}")

# Inverse mapping (id -> token)
id2word = {token_id: token for token, token_id in word2id.items()}

# ===========================================
# Zelle 6: Tokens -> IDs
# ===========================================
def tokens_to_ids(token_list, word2id, unk_id=None):
    """Map a sequence of tokens to their integer ids.

    Args:
        token_list: Iterable of tokens to encode.
        word2id: Mapping from token to id.
        unk_id: Id substituted for tokens missing from ``word2id``. When
            omitted (``None``), the id of ``"<UNK>"`` is looked up in the
            ``word2id`` mapping that was passed in, rather than being frozen
            from a global at function-definition time.

    Returns:
        A list with one id per input token, in the same order.

    Raises:
        KeyError: If ``unk_id`` is omitted and ``word2id`` has no ``"<UNK>"``
            entry.
    """
    # Compare against None explicitly: 0 is a perfectly valid id.
    if unk_id is None:
        unk_id = word2id["<UNK>"]
    return [word2id.get(token, unk_id) for token in token_list]

# Ids of the special tokens used while encoding.
UNK_ID = word2id["<UNK>"]
BOS_USER_ID, EOS_USER_ID = word2id["<BOS_USER>"], word2id["<EOS_USER>"]
BOS_BOT_ID, EOS_BOT_ID = word2id["<BOS_BOT>"], word2id["<EOS_BOT>"]

# Encode each token list and wrap it in its speaker-specific markers:
#   <BOS_USER> user ids <EOS_USER>   and   <BOS_BOT> bot ids <EOS_BOT>
df["user_ids"] = df["user_tokens"].apply(
    lambda toks: [BOS_USER_ID, *tokens_to_ids(toks, word2id, UNK_ID), EOS_USER_ID]
)
df["bot_ids"] = df["bot_tokens"].apply(
    lambda toks: [BOS_BOT_ID, *tokens_to_ids(toks, word2id, UNK_ID), EOS_BOT_ID]
)

print("\nBeispiel für user_ids und bot_ids mit BOS/EOS:")
display(df[["user_ids", "bot_ids"]].head())

# ===========================================
# Zelle 7: Split in Train/Val
# ===========================================
# Hold out 10 % of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.1,
)
print(f"Train Samples: {len(train_df)}, Val Samples: {len(val_df)}")

# ===========================================
# Zelle 8: Speichern
# ===========================================
import json
# Make sure the output directory exists before anything is written into it.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Output locations
vocab_path = os.path.join(PROCESSED_DATA_DIR, "vocab.json")
train_path = os.path.join(PROCESSED_DATA_DIR, "train.csv")
val_path = os.path.join(PROCESSED_DATA_DIR, "val.csv")

# Vocabulary as JSON; ensure_ascii=False keeps non-ASCII tokens readable.
with open(vocab_path, "w", encoding="utf-8") as vocab_file:
    json.dump(word2id, vocab_file, ensure_ascii=False)

# Train and validation frames as CSV, without the index column.
train_df.to_csv(path_or_buf=train_path, index=False)
val_df.to_csv(path_or_buf=val_path, index=False)

# Summary of what was written
print("\nGespeicherte Dateien:")
print("- Vokabular:", vocab_path)
print("- Train CSV:", train_path)
print("- Val CSV:  ", val_path)

# ===========================================
# Zelle 9: Ausblick
# ===========================================
# Closing note for the reader: prints a short German message stating that the
# tokenized data (user_ids, bot_ids) is now in train.csv/val.csv and pointing
# to the training notebook / src/train.py as the next step.
print("""
Die Daten liegen nun tokenisiert (user_ids, bot_ids) in train.csv/val.csv.
Wir können sie in Notebook 3_training_demo.ipynb oder via src/train.py 
laden und unser ChatPal-Modell trainieren.
""")
