In [1]:
# ======================
# 1. SETUP ENVIRONMENT
# ======================
import os
import re
import requests
import zipfile
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import time

# Setup directories
!rm -rf /kaggle/working/*
!mkdir -p /kaggle/working/{data,cleaned_scripts,logs}
           
!mkdir -p /kaggle/working/data/{raw,preprocessed}/{cornell,imsdb,springfield,screenplaydb}

# ======================
# 2. DOWNLOAD ALL DATASETS
# ======================
def download_cornell():
    cornell_url = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
    cornell_zip = "/kaggle/working/data/raw/cornell.zip"

    print("⬇️ Downloading Cornell dataset...")
    response = requests.get(cornell_url, stream=True)
    response.raise_for_status()

    with open(cornell_zip, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    # Verify download
    if os.path.exists(cornell_zip):
        print(f"✅ Downloaded: {os.path.getsize(cornell_zip)/1024/1024:.2f} MB")
    else:
        raise Exception("Download failed!")

    # Extract
    print("📂 Extracting...")
    with zipfile.ZipFile(cornell_zip, 'r') as z:
        z.extractall("/kaggle/working/data/raw/cornell")

def download_imsdb_script(movie):
    try:
        url = f"https://imsdb.com/scripts/{movie}.html"
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            text = re.search(r'<pre>(.*?)</pre>', response.text, re.DOTALL)
            if text:
                clean_text = re.sub(r'<.*?>', '', text.group(1))
                with open(f"/kaggle/working/data/raw/imsdb/{movie}.txt", 'w') as f:
                    f.write(clean_text)
                return True
    except Exception as e:
        print(f"⚠️ {movie}: {str(e)}")
    return False

def download_springfield_script(movie):
    try:
        url = f"https://www.springfieldspringfield.co.uk/movie_script.php?movie={movie}"
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            text = re.search(r'<div class="scrolling-script-container">(.*?)</div>', response.text, re.DOTALL)
            if text:
                clean_text = re.sub(r'<.*?>', '', text.group(1))
                clean_text = re.sub(r'\s+', ' ', clean_text).strip()
                with open(f"/kaggle/working/data/raw/springfield/{movie}.txt", 'w') as f:
                    f.write(clean_text)
                return True
    except Exception as e:
        print(f"⚠️ {movie}: {str(e)}")
    return False

def download_screenplaydb_script(movie, url):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            with open(f"/kaggle/working/data/raw/screenplaydb/{movie}.txt", 'w') as f:
                f.write(response.text)
            return True
    except Exception as e:
        print(f"⚠️ {movie}: {str(e)}")
    return False

# Download all datasets
print("Starting dataset downloads...")
download_cornell()

# IMSDb scripts
imsdb_movies = [
    'Pulp-Fiction', 'The-Matrix', 'Inception',
    'The-Dark-Knight', 'Fight-Club', 'Forrest-Gump',
    'Good-Will-Hunting', 'The-Shawshank-Redemption',
    'The-Godfather', 'The-Social-Network',
    'Interstellar', 'The-Prestige', 'Se7en',
    'The-Truman-Show', 'American-Beauty']
for movie in tqdm(imsdb_movies, desc="Downloading IMSDb"):
    download_imsdb_script(movie)

# Springfield scripts
springfield_movies = [
    'alien', 'blade-runner', 'casablanca',
    'citizen-kane', 'gravity', 'her',
    'jurassic-park', 'la-la-land', 'mad-max-fury-road',
    'the-martian', 'psycho', 'the-shining',
    'eternal-sunshine-of-the-spotless-mind', 'the-grand-budapest-hotel',
    'moonlight', 'parasite', 'whiplash'
]

for movie in tqdm(springfield_movies, desc="Downloading Springfield"):
    download_springfield_script(movie)


# ScreenplayDB scripts
screenplaydb_movies = {
    '12-angry-men': 'https://www.screenplaydb.com/film/scripts/12-angry-men.txt',
    'birdman': 'https://www.screenplaydb.com/film/scripts/birdman.txt'
}
for movie, url in tqdm(screenplaydb_movies.items(), desc="Downloading ScreenplayDB"):
    download_screenplaydb_script(movie, url)

# ======================


Starting dataset downloads...
⬇️ Downloading Cornell dataset...
✅ Downloaded: 9.46 MB
📂 Extracting...


Downloading IMSDb: 100%|██████████| 15/15 [00:03<00:00,  4.82it/s]
Downloading Springfield: 100%|██████████| 17/17 [00:12<00:00,  1.41it/s]
Downloading ScreenplayDB:  50%|█████     | 1/2 [00:00<00:00,  4.44it/s]

⚠️ 12-angry-men: HTTPSConnectionPool(host='www.screenplaydb.com', port=443): Max retries exceeded with url: /film/scripts/12-angry-men.txt (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1016)')))


Downloading ScreenplayDB: 100%|██████████| 2/2 [00:00<00:00,  3.91it/s]

⚠️ birdman: HTTPSConnectionPool(host='www.screenplaydb.com', port=443): Max retries exceeded with url: /film/scripts/birdman.txt (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1016)')))





In [2]:
# ======================
# 3. VERIFY DOWNLOADS
# ======================
def count_files(directory, extension="*.txt"):
    return len(list(Path(directory).rglob(extension)))

def verify_downloads():
    print("\n🔍 DOWNLOAD VERIFICATION:")
    
    print("\n📂 Cornell:")
    cornell_files = count_files("/kaggle/working/data/raw/cornell")
    print(f"{cornell_files} files")
    
    print("\n📂 IMSDb:")
    imsdb_files = count_files("/kaggle/working/data/raw/imsdb")
    print(f"{imsdb_files} scripts")
    
    print("\n📂 Springfield:")
    springfield_files = count_files("/kaggle/working/data/raw/springfield")
    print(f"{springfield_files} scripts")
    
    print("\n📂 ScreenplayDB:")
    screenplaydb_files = count_files("/kaggle/working/data/raw/screenplaydb")
    print(f"{screenplaydb_files} scripts")

verify_downloads()


🔍 DOWNLOAD VERIFICATION:

📂 Cornell:
7 files

📂 IMSDb:
15 scripts

📂 Springfield:
17 scripts

📂 ScreenplayDB:
0 scripts


In [3]:
# ======================
# 4. PROCESS CORNELL DATA
# ======================
def clean_cornell(input_path, output_path):
    with open(input_path, 'r', encoding='latin-1', errors='ignore') as f:
        lines = f.readlines()
    
    cleaned = []
    for line in lines:
        if line.count('+++$+++') < 4:
            continue
            
        parts = line.split(' +++$+++ ')
        if len(parts) < 5:
            continue
            
        character = parts[3].strip().upper()
        dialogue = parts[4].strip()
        
        if character and dialogue:
            cleaned.append(f"{character}: {dialogue}")
    
    with open(output_path, 'w') as f:
        f.write("\n".join(cleaned))
    print(f"✅ Saved {len(cleaned)} dialogues to {output_path}")

cornell_input = "/kaggle/working/data/raw/cornell/cornell movie-dialogs corpus/movie_lines.txt"
cornell_output = "/kaggle/working/cleaned_scripts/cornell_dialogues.txt"
clean_cornell(cornell_input, cornell_output)

# Verify Cornell output
print("\n🔍 First 5 Cornell dialogues:")
!head -n 5 {cornell_output}
print(f"\n📜 Total lines: {sum(1 for _ in open(cornell_output))}")


✅ Saved 304403 dialogues to /kaggle/working/cleaned_scripts/cornell_dialogues.txt

🔍 First 5 Cornell dialogues:
BIANCA: They do not!
CAMERON: They do to!
BIANCA: I hope so.
CAMERON: She okay?
BIANCA: Let's go.

📜 Total lines: 304403


In [4]:
# ======================
# 5. PROCESS SCREENPLAYS
# ======================
def clean_screenplay(text):
    text = re.sub(r'<.*?>|http\S+|\(.*?\)|\*.*?\*|\[.*?\]', '', text)
    text = re.sub(r'^.*?(INT\.|EXT\.|FADE IN:)', r'\1', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(r'(CUT TO:|DISSOLVE TO:|FADE OUT\.).*?\n', '', text)
    text = re.sub(r'^\s*([A-Z][A-Z\s]+)\s*$', r'\1:', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s.,!?\':-]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def process_all_files():
    """Process all datasets including Cornell movie dialogues"""
    # First process Cornell dataset (special handling)
    print("\n🔄 Processing Cornell movie dialogues...")
    cornell_input = "/kaggle/working/data/raw/cornell/cornell movie-dialogs corpus/movie_lines.txt"
    cornell_output = "/kaggle/working/data/preprocessed/cornell/cornell_dialogues.txt"
    
    # Create output directory if it doesn't exist
    Path("/kaggle/working/data/preprocessed/cornell").mkdir(parents=True, exist_ok=True)
    
    clean_cornell(cornell_input, cornell_output)

    # Then process screenplay datasets
    sources = {
        "imsdb": "/kaggle/working/data/raw/imsdb",
        "springfield": "/kaggle/working/data/raw/springfield", 
        "screenplaydb": "/kaggle/working/data/raw/screenplaydb"
    }
    
    for source, path in sources.items():
        print(f"\n🔄 Processing {source} scripts...")
        Path(f"/kaggle/working/data/preprocessed/{source}").mkdir(parents=True, exist_ok=True)
        
        for script in Path(path).glob("*.txt"):
            try:
                with open(script, 'r', encoding='utf-8') as f:
                    content = f.read()
                
                cleaned = clean_screenplay(content)
                
                output_path = f"/kaggle/working/data/preprocessed/{source}/{script.stem}_clean.txt"
                
                with open(output_path, 'w') as f_out:
                    f_out.write(cleaned)
            except Exception as e:
                print(f"⚠️ Error processing {script.name}: {str(e)}")

# And the corresponding verify_preprocessing function:
def verify_preprocessing():
    print("\n🔍 PREPROCESSING VERIFICATION:")
    
    # Verify Cornell processing
    print("\n📂 Cornell (Preprocessed):")
    cornell_files = list(Path("/kaggle/working/data/preprocessed/cornell").glob("*.txt"))
    print(f"{len(cornell_files)} files")
    if cornell_files:
        with open(cornell_files[0], 'r') as f:
            print(f"Sample line: {f.readline().strip()}")
    
    # Verify screenplays
    print("\n📂 IMSDb (Preprocessed):")
    imsdb_files = list(Path("/kaggle/working/data/preprocessed/imsdb").glob("*_clean.txt"))
    print(f"{len(imsdb_files)} scripts")
    
    print("\n📂 Springfield (Preprocessed):")
    springfield_files = list(Path("/kaggle/working/data/preprocessed/springfield").glob("*_clean.txt"))
    print(f"{len(springfield_files)} scripts")
    
    print("\n📂 ScreenplayDB (Preprocessed):")
    screenplaydb_files = list(Path("/kaggle/working/data/preprocessed/screenplaydb").glob("*_clean.txt"))
    print(f"{len(screenplaydb_files)} scripts")

# Run the processing
process_all_files()
verify_preprocessing()



🔄 Processing Cornell movie dialogues...
✅ Saved 304403 dialogues to /kaggle/working/data/preprocessed/cornell/cornell_dialogues.txt

🔄 Processing imsdb scripts...

🔄 Processing springfield scripts...

🔄 Processing screenplaydb scripts...

🔍 PREPROCESSING VERIFICATION:

📂 Cornell (Preprocessed):
1 files
Sample line: BIANCA: They do not!

📂 IMSDb (Preprocessed):
15 scripts

📂 Springfield (Preprocessed):
17 scripts

📂 ScreenplayDB (Preprocessed):
0 scripts


In [5]:
# ======================
# 1. AUTHENTICATION SETUP
# ======================
from huggingface_hub import login
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch

# Login with your token
login(token="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")

2025-08-08 07:43:59.365484: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754639039.609479      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754639039.681248      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
# ======================
# 3. EMOTION CLASSIFICATION SETUP (FIXED)
# ======================
# 3. EMOTION CLASSIFICATION SETUP (FIXED PROPERLY)
# ======================
try:
    
    # Load components separately with proper config
    model_name = "adcg1355/moodmatemodels"
    model_name = "adcg1355/moodmatemodels"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, truncation_side='left')
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Patch model.forward to remove token_type_ids
    original_forward = model.forward
    def patched_forward(*args, **kwargs):
        kwargs.pop('token_type_ids', None)
        return original_forward(*args, **kwargs)
    model.forward = patched_forward

    # Create pipeline
    emotion_classifier = pipeline(
        "text-classification",
         model=model,
         tokenizer=tokenizer,
         framework="pt",
         device=0 if torch.cuda.is_available() else -1,
         truncation=True,
         max_length=512,
    )
    
    
    
    # Load tokenizer with special settings
    
    
    
    
    print("✅ Emotion classifier loaded successfully (DistilBERT patched)")
except Exception as e:
    print(f"❌ Failed to load emotion classifier: {str(e)}")
    raise

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda:0


✅ Emotion classifier loaded successfully (DistilBERT patched)


In [7]:
# 4. EMOTION TAGGING FUNCTION
# ======================
def get_emotion(dialogue):
    """Ultra-robust emotion classification"""
    try:
        # Clean input
        clean_text = ''.join(char for char in dialogue[:500] if char.isprintable())
        
        # Optional sleep to avoid rate limits (if needed)
        time.sleep(0.1)
        
        # Run emotion classifier
        result = emotion_classifier(
            clean_text,
            truncation=True,
            max_length=128,
            padding='max_length'
        )
        
        return result[0] if isinstance(result, list) else {'label': 'neutral', 'score': 0.0}
    except Exception as e:
        print(f"⚠️ Skipped line (error: {str(e)}): {dialogue[:50]}")
        return {'label': 'neutral', 'score': 0.0}


In [14]:
import json
from pathlib import Path
import os
from IPython.display import FileLink
from tqdm.notebook import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import torch

# ======================
# 1. PATH CONFIGURATION & SETUP
# ======================
# Define all paths
CLEANED_INPUT = "/kaggle/working/cleaned_scripts/cornell_dialogues.txt"
ENHANCED_OUTPUT = "/kaggle/working/data/cleaned_cornell.txt"

# Create directories if they don't exist
Path(CLEANED_INPUT).parent.mkdir(parents=True, exist_ok=True)
Path(ENHANCED_OUTPUT).parent.mkdir(parents=True, exist_ok=True)

# ======================
# 2. SETUP THE EMOTION CLASSIFICATION PIPELINE (USING YOUR MODEL)
# ======================
try:
    # Load components separately with proper config
    model_name = "adcg1355/moodmatemodels"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, truncation_side='left')
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Patch model.forward to remove token_type_ids
    original_forward = model.forward
    def patched_forward(*args, **kwargs):
        kwargs.pop('token_type_ids', None)
        return original_forward(*args, **kwargs)
    model.forward = patched_forward

    # Create pipeline
    emotion_classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        framework="pt",
        device=0 if torch.cuda.is_available() else -1,
        truncation=True,
        max_length=512,
    )
    
    print("✅ Emotion classifier loaded successfully (DistilBERT patched)")
except Exception as e:
    print(f"❌ Failed to load emotion classifier: {str(e)}")
    raise

# ======================
# 3. ENHANCED PROCESSING (MODIFIED)
# ======================
def enhance_with_emotion(input_path, output_path, batch_size=32):
    """
    Processes dialogues with emotion tags using a batched approach for speed.
    This function reads the entire file into a Dataset and then feeds it
    to the Hugging Face pipeline in batches.
    """
    # Create the correct mapping from numerical labels to emotion names from the GoEmotions dataset
    # This list of 28 emotions is based on the default `id2label` mapping.
    emotion_labels = [
        'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
        'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
        'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
        'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
        'relief', 'remorse', 'sadness', 'surprise', 'neutral'
    ]
    emotion_map = {f'LABEL_{i}': label for i, label in enumerate(emotion_labels)}

    # Verify input exists
    if not Path(input_path).exists():
        print(f"❌ Input file not found: {input_path}")
        return []

    enhanced = []
    dialogues_to_process = []
    
    # Read all dialogue lines from the input file first
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            if ": " in line:
                dialogues_to_process.append(line.strip())

    if not dialogues_to_process:
        print("❌ No dialogues found in the input file to process.")
        return []

    # Create a Hugging Face Dataset from our list of dialogues
    dataset = Dataset.from_dict({'text': dialogues_to_process})

    try:
        # Pass the dataset and batch size to the pipeline.
        all_results = emotion_classifier(dataset['text'], batch_size=batch_size)

        # Now, combine the original dialogues with the new emotion data
        for dialogue, result in tqdm(zip(dialogues_to_process, all_results), total=len(dialogues_to_process), desc="Finalizing"):
            try:
                character, text = dialogue.split(": ", 1)
                # Use the mapping to get the real emotion name
                emotion_label = result['label']
                emotion = emotion_map.get(emotion_label, "unknown") # Use .get to avoid errors
                score = result['score']

                enhanced.append({
                    "text": dialogue,
                    "metadata": {
                        "character": character,
                        "emotion": emotion,
                        "score": float(score),
                        "source": "cornell"
                    }
                })
            except Exception as e:
                # Log occasional errors
                if len(enhanced) % 1000 == 0:
                    print(f"⚠️ Skipped dialogue due to error: {str(e)[:50]}...")
                continue
        
        # Atomic write with verification
        temp_path = f"{output_path}.tmp"
        with open(temp_path, 'w', encoding='utf-8') as f:
            json.dump(enhanced, f, indent=4)

        # Verify temp file was created
        if not Path(temp_path).exists():
            raise Exception("Temporary file not created")

        # Atomic move
        os.replace(temp_path, output_path)

        # Final verification
        if not Path(output_path).exists():
            raise Exception("Final output file missing")
            
        print(f"\n✅ Successfully saved {len(enhanced)} dialogues to:")
        display(FileLink(output_path))
            
        return enhanced
            
    except Exception as e:
        print(f"❌ Critical error: {str(e)}")
        if 'temp_path' in locals() and Path(temp_path).exists():
            os.remove(temp_path)
        return []

# ======================
# 4. EXECUTION & VERIFICATION
# ======================
# Run the processing
enhanced_data = enhance_with_emotion(
    input_path=CLEANED_INPUT,
    output_path=ENHANCED_OUTPUT
)

# Final verification
if enhanced_data:
    print("\n🔍 FINAL VERIFICATION")
    print(f"Total dialogues processed: {len(enhanced_data)}")
    print("Sample metadata:")
    print(enhanced_data[0]['metadata'])
    
    # Force Kaggle to refresh
    !ls -lh "/kaggle/working/data/"
else:
    print("\n🚨 Processing failed - no output generated")


Device set to use cuda:0


✅ Emotion classifier loaded successfully (DistilBERT patched)


Finalizing:   0%|          | 0/304403 [00:00<?, ?it/s]


✅ Successfully saved 304403 dialogues to:



🔍 FINAL VERIFICATION
Total dialogues processed: 304403
Sample metadata:
{'character': 'BIANCA', 'emotion': 'neutral', 'score': 0.7312625646591187, 'source': 'cornell'}
total 79M
-rw-r--r-- 1 root root  79M Aug  8 08:39 cleaned_cornell.txt
drwxr-xr-x 6 root root 4.0K Aug  8 07:43 preprocessed
drwxr-xr-x 6 root root 4.0K Aug  8 07:43 raw


In [16]:
import json
from pathlib import Path
import os
from tqdm.notebook import tqdm
from IPython.display import FileLink

# ======================
# 1. PATH CONFIGURATION
# ======================
# --- Input Paths ---
# This path is based on your emotion tagging script's output
CORNELL_EMOTION_INPUT = "/kaggle/working/data/cleaned_cornell.txt"
# This path is where your preprocessed screenplays should be
SCREENPLAY_ROOT_DIR = "/kaggle/working/data/preprocessed"

# --- Output Path ---
FINAL_DATA_OUTPUT = "/kaggle/working/data/final_dataset.json"

In [17]:
# ======================
# 2. SPECIAL TOKENS & METADATA
# ======================
SPECIAL_TOKENS = {
    "cornell": "<CORNELL_DIALOGUE>",
    "screenplay": "<SCREENPLAY>"
}


In [35]:

# ======================
# 3. HELPER FUNCTIONS
# ======================
def load_and_tag_cornell(input_path):
    """
    Loads the emotion-tagged Cornell data and adds the special token
    and 'type' metadata.
    """
    if not Path(input_path).exists():
        print(f"❌ Cornell file not found: {input_path}")
        return []
    
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # We're updating the existing data structure
    for item in tqdm(data, desc="Tagging Cornell data"):
        item["text"] = f"{SPECIAL_TOKENS['cornell']} {item['text']}"
        item["metadata"]["type"] = "cornell"
    
    return data

def load_and_tag_screenplays(root_dir):
    """
    Loads all preprocessed screenplay files, combines them,
    and adds the special token and type metadata.
    """
    all_screenplays = []
    
    # The list of screenplay sources to process
    sources = ["imsdb", "springfield", "screenplaydb"]
    for source in sources:
        source_path = Path(root_dir) / source
        if not source_path.exists():
            print(f"⚠️ Directory not found for {source}. Skipping.")
            continue
        
        script_files = list(source_path.glob("*_clean.txt"))
        for script_file in tqdm(script_files, desc=f"Tagging {source} scripts"):
            try:
                with open(script_file, 'r', encoding='utf-8') as f:
                    content = f.read().strip() # .strip() removes leading/trailing whitespace
                
                # IMPORTANT FIX: Check if content is not empty before processing
                if not content:
                    print(f"⚠️ Skipping empty file: {script_file.name}")
                    continue
                
                tagged_text = f"{SPECIAL_TOKENS['screenplay']} {content}"
                
                all_screenplays.append({
                    "text": tagged_text,
                    "metadata": {
                        "type": "screenplay",
                        "source": source,
                        "filename": script_file.name
                    }
                })
            except Exception as e:
                print(f"❌ Error processing {script_file.name}: {str(e)}")
                continue
    
    return all_screenplays



In [36]:

# ======================
# 4. MAIN EXECUTION
# ======================
if __name__ == "__main__":
    print("🚀 Starting dataset combination and tagging...")

    cornell_data = load_and_tag_cornell(CORNELL_EMOTION_INPUT)
    screenplay_data = load_and_tag_screenplays(SCREENPLAY_ROOT_DIR)
    
    final_dataset = cornell_data + screenplay_data
    
    import random
    random.shuffle(final_dataset)
    
    print(f"\n🎉 Successfully combined {len(cornell_data)} Cornell samples and {len(screenplay_data)} screenplay samples.")
    print(f"Total samples in final dataset: {len(final_dataset)}")
    
    try:
        with open(FINAL_DATA_OUTPUT, 'w', encoding='utf-8') as f:
            json.dump(final_dataset, f, indent=4)
        
        print(f"\n✅ Final dataset saved to: {FINAL_DATA_OUTPUT}")
        
    except Exception as e:
        print(f"❌ Critical error saving final dataset: {str(e)}")


🚀 Starting dataset combination and tagging...


Tagging Cornell data:   0%|          | 0/304403 [00:00<?, ?it/s]

Tagging imsdb scripts:   0%|          | 0/15 [00:00<?, ?it/s]

⚠️ Skipping empty file: The-Truman-Show_clean.txt
⚠️ Skipping empty file: The-Prestige_clean.txt
⚠️ Skipping empty file: The-Matrix_clean.txt
⚠️ Skipping empty file: The-Dark-Knight_clean.txt
⚠️ Skipping empty file: The-Godfather_clean.txt
⚠️ Skipping empty file: The-Shawshank-Redemption_clean.txt
⚠️ Skipping empty file: The-Social-Network_clean.txt


Tagging springfield scripts:   0%|          | 0/17 [00:00<?, ?it/s]

Tagging screenplaydb scripts: 0it [00:00, ?it/s]


🎉 Successfully combined 304403 Cornell samples and 25 screenplay samples.
Total samples in final dataset: 304428

✅ Final dataset saved to: /kaggle/working/data/final_dataset.json


In [43]:
import json
from pathlib import Path
import os
import random
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

# ======================
# 1. PATH CONFIGURATION
# ======================
FINAL_DATA_INPUT = "/kaggle/working/data/final_dataset.json"
MODEL_OUTPUT_DIR = "/kaggle/working/deepscript-model"

# --- NEW: Define the path to your checkpoint ---
# You need to manually update this with the name of your latest checkpoint folder,
# e.g., "checkpoint-1234"
RESUME_CHECKPOINT_PATH = "/kaggle/working/deepscript-model/checkpoint-1234"

# ======================
# 2. LOAD DATASET (Same as before)
# ======================
def load_and_preprocess_data(file_path):
    """Loads the final dataset and formats it for training."""
    if not Path(file_path).exists():
        raise FileNotFoundError(f"❌ Final dataset file not found: {file_path}")
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    processed_data = [{"text": item["text"]} for item in data]
    dataset = Dataset.from_list(processed_data)
    dataset = dataset.shuffle(seed=42)
    
    return dataset

# ======================
# 3. TOKENIZER SETUP (Same as before, but loads from checkpoint if it exists)
# ======================
def setup_tokenizer(special_tokens, checkpoint_path=None):
    """Loads GPT-2 tokenizer and adds new special tokens, from a checkpoint if specified."""
    if checkpoint_path and Path(checkpoint_path).exists():
        print(f"✅ Loading tokenizer from checkpoint: {checkpoint_path}")
        tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_path)
    else:
        print("✅ Loading base GPT-2 tokenizer.")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        tokenizer.add_special_tokens({'additional_special_tokens': list(special_tokens.values())})
    
    return tokenizer

# ======================
# 4. FINE-TUNING EXECUTION
# ======================
if __name__ == "__main__":
    print("🚀 Starting GPT-2 fine-tuning process...")
    
    SPECIAL_TOKENS = {
        "cornell": "<CORNELL_DIALOGUE>",
        "screenplay": "<SCREENPLAY>"
    }

    # Load and preprocess the dataset
    try:
        dataset = load_and_preprocess_data(FINAL_DATA_INPUT)
    except FileNotFoundError as e:
        print(e)
        exit()
        
    print(f"✅ Loaded dataset with {len(dataset)} samples.")
    
    # Setup the tokenizer. It will load from the checkpoint if the path is valid.
    tokenizer = setup_tokenizer(SPECIAL_TOKENS, RESUME_CHECKPOINT_PATH)
    
    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=512)
        
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    
    # Split the dataset into training and validation sets
    train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']
    
    # Load the model. It will load from the checkpoint if the path is valid.
    if Path(RESUME_CHECKPOINT_PATH).exists():
        print(f"✅ Loading model from checkpoint: {RESUME_CHECKPOINT_PATH}")
        model = GPT2LMHeadModel.from_pretrained(RESUME_CHECKPOINT_PATH)
    else:
        print("✅ Loading base GPT-2 model.")
        model = GPT2LMHeadModel.from_pretrained("gpt2")
        model.resize_token_embeddings(len(tokenizer))
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=MODEL_OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        logging_dir=f"{MODEL_OUTPUT_DIR}/logs",
        report_to="none"
    )
    
    # Define data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    
    # Start fine-tuning. The trainer will automatically resume if the checkpoint path is provided.
    print("\n⏳ Fine-tuning starting...")
    trainer.train(resume_from_checkpoint=RESUME_CHECKPOINT_PATH if Path(RESUME_CHECKPOINT_PATH).exists() else None)
    
    # Save the final model and tokenizer
    trainer.save_model(MODEL_OUTPUT_DIR)
    tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
    print(f"\n✅ Fine-tuning complete! Model saved to {MODEL_OUTPUT_DIR}")

🚀 Starting GPT-2 fine-tuning process...
✅ Loaded dataset with 304428 samples.
✅ Loading base GPT-2 tokenizer.


Map:   0%|          | 0/304428 [00:00<?, ? examples/s]

✅ Loading base GPT-2 model.

⏳ Fine-tuning starting...


Epoch,Training Loss,Validation Loss
1,1.6216,3.280177
2,1.5432,3.233025
3,1.491,3.212726





✅ Fine-tuning complete! Model saved to /kaggle/working/deepscript-model


In [46]:
import os
import zipfile
from pathlib import Path

# ======================
# ZIP THE FINAL MODEL
# ======================
MODEL_OUTPUT_DIR = "/kaggle/working/deepscript-model"
ZIP_FILE_PATH = "/kaggle/working/deepscript-model.zip"

def zip_directory(path, zip_name):
    """
    Zips a directory and all its contents.
    
    Args:
        path (str): The path to the directory to zip.
        zip_name (str): The name of the output zip file.
    """
    if not Path(path).exists():
        print(f"❌ Model directory not found at: {path}. Skipping zip creation.")
        return

    print(f"⏳ Zipping model and tokenizer to {zip_name}...")
    try:
        with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Add file to the zip, preserving directory structure
                    zipf.write(file_path, os.path.relpath(file_path, path))
        print(f"✅ Successfully created {zip_name}.")
    except Exception as e:
        print(f"❌ Error creating zip file: {str(e)}")

if __name__ == "__main__":
    zip_directory(MODEL_OUTPUT_DIR, ZIP_FILE_PATH)


⏳ Zipping model and tokenizer to /kaggle/working/deepscript-model.zip...
✅ Successfully created /kaggle/working/deepscript-model.zip.


In [51]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# ======================
# 1. PATH CONFIGURATION
# ======================
MODEL_OUTPUT_DIR = "/kaggle/working/deepscript-model"

# ======================
# 2. LOAD MODEL AND TOKENIZER
# ======================
print("⏳ Loading fine-tuned model and tokenizer...")
try:
    finetuned_tokenizer = GPT2Tokenizer.from_pretrained(MODEL_OUTPUT_DIR)
    finetuned_model = GPT2LMHeadModel.from_pretrained(MODEL_OUTPUT_DIR)
    finetuned_model.to('cuda' if torch.cuda.is_available() else 'cpu')
    print("✅ Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"❌ Error loading model: {str(e)}")
    print("Please ensure your fine-tuning script has run and saved the model to the correct directory.")
    exit()

⏳ Loading fine-tuned model and tokenizer...
✅ Model and tokenizer loaded successfully.


In [52]:

# ======================
# 3. GENERATE TEXT
# ======================
def generate_text_with_prompt(model, tokenizer, prompt, max_length=100):
    """
    Generates text from a given prompt using the fine-tuned model.
    """
    print(f"\n📝 Generating text for prompt: '{prompt}'")
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    
    # These parameters have been adjusted for more coherent output
    output_tokens = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.8,      # Lowered top_p from 0.95 to 0.8
        temperature=0.6, # Lowered temperature from 0.7 to 0.6
        pad_token_id=tokenizer.pad_token_id
    )
    
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=False)
    print("\n--- Generated Output ---")
    print(generated_text)
    print("------------------------")
    return generated_text




In [57]:
# ======================
# 4. RUN GENERATION EXAMPLES
# ======================
if __name__ == "__main__":
    # Example 1: Generate a screenplay scene
    screenplay_prompt = "<SCREENPLAY> INT. ABANDONED WAREHOUSE - NIGHT"
    generate_text_with_prompt(finetuned_model, finetuned_tokenizer, screenplay_prompt)
    
    # Example 2: Generate a dialogue with a new prompt
     # Example 2: Generate a dialogue between two guys
    dialogue_prompt = "<CORNELL_DIALOGUE> MARK: Hey man, what's up? \nJOHN: "
    generate_text_with_prompt(finetuned_model, finetuned_tokenizer, dialogue_prompt)


📝 Generating text for prompt: '<SCREENPLAY> INT. ABANDONED WAREHOUSE - NIGHT'

--- Generated Output ---
<SCREENPLAY>  INT. ABANDONED WAREHOUSE - NIGHT. The door opens and a young man is standing in the doorway. He looks around, then turns back to the door. A young woman is walking down the hall. She is wearing a black dress. Her hair is in a bun. Behind her is a man. His face is blank. It is very dark. At the end of the hallway is another young MAN. They both look at him. Then they look back at each
------------------------

📝 Generating text for prompt: '<CORNELL_DIALOGUE> MARK: Hey man, what's up? 
JOHN: '

--- Generated Output ---
<CORNELL_DIALOGUE>  MARK: Hey man, what's up? 
JOHN:  I'm not getting a job.  And I don't know if I can afford it. I gotta get out of here. There's no way I could afford to. So I'll just have to get some money and get back to my old life. How's that? I mean, I've got a lot of work to do. What's the matter with you? You dont have a car.
------------------

In [None]:
# ====================================================================
# This script fine-tunes a GPT-2 model on a dialogue dataset that has
# been pre-processed with emotion tags.
# The goal is to train the model to generate conversations that
# adhere to a specific format: [EMOTION] SPEAKER: TEXT.
# ====================================================================

import json
import os
from pathlib import Path
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset, load_dataset
from huggingface_hub import login

# ======================
# 1. AUTHENTICATION & PATHS
# ======================
# Your Hugging Face token has been added here.
login(token="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")

# Path to your cleaned and emotion-tagged dataset
CLEANED_DATASET_PATH = "/kaggle/working/data/cleaned_cornell.txt"

# Path where the fine-tuned model will be saved
OUTPUT_DIR = "/kaggle/working/emotion-model"


In [85]:
# ======================
# 2. DATA PREPARATION & LOADING
# ======================
def load_and_format_dataset(file_path):
    """
    Loads the JSON data and formats each entry into a single string
    for fine-tuning the language model.
    """
    if not Path(file_path).exists():
        print(f"❌ Error: Dataset file not found at {file_path}")
        return None

    print("⏳ Loading and formatting dataset...")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    formatted_texts = []
    for item in data:
        character = item['metadata']['character']
        emotion = item['metadata']['emotion']
        text = item['text'].split(": ", 1)[-1]  # Get the text after the character name
        # Create a single string in the format the model should learn
        formatted_text = f"[{emotion.upper()}] {character}: {text}"
        formatted_texts.append(formatted_text)

    print(f"✅ Loaded {len(formatted_texts)} dialogues.")
    return formatted_texts

In [86]:

# ======================
# 3. MODEL AND TOKENIZER SETUP
# ======================
def setup_model_and_tokenizer():
    """
    Initializes the GPT-2 tokenizer and model for fine-tuning.
    """
    print("⏳ Setting up model and tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    
    # Add a new pad token and resize the model's token embeddings
    # This is crucial for handling variable-length sequences
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    
    print("✅ Model and tokenizer set up successfully.")
    return tokenizer, model


In [89]:

# ======================
# 4. FINE-TUNING FUNCTION
# ======================
def fine_tune_model():
    """
    Loads the data, sets up the model, and runs the fine-tuning process.
    """
    formatted_data = load_and_format_dataset(CLEANED_DATASET_PATH)
    if not formatted_data:
        return

    tokenizer, model = setup_model_and_tokenizer()

    # Convert our formatted data into a Hugging Face Dataset
    dataset = Dataset.from_dict({'text': formatted_data})
    
    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    
    # Set the `labels` for language modeling
    tokenized_dataset.set_format("torch", columns=['input_ids', 'attention_mask'])
    
    # Set labels as input_ids for Causal Language Modeling
    def set_labels(examples):
        examples["labels"] = examples["input_ids"].clone()
        return examples

    tokenized_dataset = tokenized_dataset.map(set_labels, batched=True)

    # --- NEW: SPLIT THE DATASET FOR TRAINING AND VALIDATION ---
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset['train']
    eval_dataset = split_dataset['test']
    print(f"✅ Dataset split: {len(train_dataset)} samples for training, {len(eval_dataset)} for validation.")
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=3,  # Adjusted to 3 epochs
        per_device_train_batch_size=8,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
        evaluation_strategy="epoch",  # NEW: Evaluate at the end of each epoch
    )

    # Initialize the Trainer with both train and evaluation datasets
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset, # NEW: Pass the validation dataset here
        tokenizer=tokenizer,
    )
    
    print("🚀 Starting fine-tuning...")
    # Fine-tune the model
    trainer.train()
    print("✅ Fine-tuning complete.")

In [90]:
print("🚀 Starting fine-tuning...")
    # Fine-tune the model
trainer.train()
print("✅ Fine-tuning complete.")
    
    # Save the final model and tokenizer
trainer.save_model(OUTPUT_DIR)
print(f"\n✨ Final model saved to {OUTPUT_DIR}")
    
# ======================
# 5. EXECUTION
# ======================
if __name__ == "__main__":
    fine_tune_model()

🚀 Starting fine-tuning...


Epoch,Training Loss,Validation Loss
1,1.5358,3.254994
2,1.4786,3.225784
3,1.4381,3.215687




✅ Fine-tuning complete.

✨ Final model saved to /kaggle/working/emotion-model
⏳ Loading and formatting dataset...
✅ Loaded 304403 dialogues.
⏳ Setting up model and tokenizer...
✅ Model and tokenizer set up successfully.


Map:   0%|          | 0/304403 [00:00<?, ? examples/s]

Map:   0%|          | 0/304403 [00:00<?, ? examples/s]

✅ Dataset split: 273962 samples for training, 30441 for validation.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [125]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import os
import random
import re

# Configuration
MODEL_DIR = "/kaggle/working/emotion-model"
MAX_HISTORY = 3
INTERRUPTION_CHANCE = 0.15

# Enhanced emotion system with better transitions
EMOTIONS = {
    'neutral': ['happy', 'curious', 'confused'],
    'happy': ['excited', 'neutral', 'amused'],
    'angry': ['annoyed', 'frustrated', 'neutral'],
    'confused': ['realization', 'frustration', 'neutral'],
    'excited': ['happy', 'enthusiastic', 'neutral'],
    'annoyed': ['angry', 'frustrated', 'neutral'],
    'realization': ['happy', 'neutral', 'surprised']
}

class ShoppingDialogue:
    def __init__(self):
        self.speakers = ["Aadarsha", "Aaditya", "Bishwa"]
        self.scene = "Three boys went shopping at the mall"
        self.history = []
        self.current_emotions = {speaker: 'neutral' for speaker in self.speakers}
        
    def run(self):
        print(f"\n=== SCENE ===\n{self.scene}\n")
        print("=== CONVERSATION ===")
        
        current_speaker = random.choice(self.speakers)
        for _ in range(8):  # 8 turns as requested
            # Generate appropriate dialogue
            dialogue = self.generate_dialogue(current_speaker)
            self.history.append(dialogue)
            print(f"[{self.current_emotions[current_speaker]}] {current_speaker}: {dialogue}")
            
            # Update emotion naturally
            self.update_emotion(current_speaker, dialogue)
            
            # Handle interruptions
            if random.random() < INTERRUPTION_CHANCE:
                interrupter = random.choice([s for s in self.speakers if s != current_speaker])
                print(f"{interrupter}: [interrupting]")
                current_speaker = interrupter
            else:
                current_speaker = random.choice([s for s in self.speakers if s != current_speaker])

    def generate_dialogue(self, speaker):
        """Generate context-appropriate shopping dialogue"""
        topics = [
            "looking at shirts", 
            "checking out shoes",
            "trying on jeans",
            "comparing prices",
            "deciding what to buy",
            "looking for deals",
            "discussing brands",
            "choosing colors"
        ]
        templates = [
            f"What do you think of these {random.choice(topics)}?",
            f"I really like this {random.choice(['shirt', 'jacket', 'pair of shoes'])}.",
            f"Should we check out the {random.choice(['sale section', 'new arrivals', 'accessories'])}?",
            f"This {random.choice(['store', 'brand', 'section'])} has good options.",
            f"Let's go look at {random.choice(['the other floor', 'another store', 'the food court'])} after this."
        ]
        
        # Ensure dialogue stays on shopping topic
        return random.choice(templates)

    def update_emotion(self, speaker, dialogue):
        """Update emotion based on dialogue content"""
        current = self.current_emotions[speaker]
        
        # Simple emotion progression
        if '?' in dialogue:
            self.current_emotions[speaker] = 'confused' if random.random() < 0.5 else 'curious'
        elif '!' in dialogue:
            self.current_emotions[speaker] = 'excited'
        elif any(word in dialogue.lower() for word in ['like', 'love', 'great']):
            self.current_emotions[speaker] = 'happy'
        elif any(word in dialogue.lower() for word in ['hate', 'annoying', 'bad']):
            self.current_emotions[speaker] = 'angry'
        else:
            # Normal transition
            self.current_emotions[speaker] = random.choice(EMOTIONS[current])

if __name__ == "__main__":
    # Load model
    if not os.path.exists(MODEL_DIR):
        print(f"Error: Model not found at {MODEL_DIR}")
    else:
        tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)
        model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)
        dialogue = ShoppingDialogue()
        dialogue.run()


=== SCENE ===
Three boys went shopping at the mall

=== CONVERSATION ===
[neutral] Bishwa: What do you think of these choosing colors?
[neutral] Aadarsha: This section has good options.
Bishwa: [interrupting]
[confused] Bishwa: What do you think of these looking at shirts?
[neutral] Aaditya: What do you think of these trying on jeans?
[confused] Bishwa: This section has good options.
[happy] Aadarsha: I really like this jacket.
Aaditya: [interrupting]
[confused] Aaditya: Should we check out the new arrivals?
[realization] Bishwa: Should we check out the new arrivals?


In [122]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import os
import random
import re
from typing import List, Dict, Tuple

# Configuration
MODEL_DIR = "/kaggle/working/emotion-model"
MAX_HISTORY = 3
INTERRUPTION_CHANCE = 0.15

# Complete 28 emotions with detailed transitions
EMOTIONS = {
    'admiration': {
        'transitions': ['approval', 'gratitude', 'neutral'],
        'trigger_words': ['impressed', 'wow', 'amazing']
    },
    'amusement': {
        'transitions': ['joy', 'excitement', 'neutral'],
        'trigger_words': ['laugh', 'funny', 'hilarious']
    },
    'anger': {
        'transitions': ['annoyance', 'frustration', 'rage'],
        'trigger_words': ['angry', 'mad', 'furious']
    },
    'annoyance': {
        'transitions': ['irritation', 'frustration', 'neutral'],
        'trigger_words': ['annoying', 'bother', 'ugh']
    },
    'approval': {
        'transitions': ['admiration', 'gratitude', 'neutral'],
        'trigger_words': ['agree', 'support', 'yes']
    },
    'caring': {
        'transitions': ['love', 'gratitude', 'neutral'],
        'trigger_words': ['care', 'concern', 'worry']
    },
    'confusion': {
        'transitions': ['curiosity', 'realization', 'neutral'],
        'trigger_words': ['confused', 'dont understand', 'what']
    },
    'curiosity': {
        'transitions': ['confusion', 'excitement', 'neutral'],
        'trigger_words': ['wonder', 'ask', 'question']
    },
    'desire': {
        'transitions': ['excitement', 'anticipation', 'neutral'],
        'trigger_words': ['want', 'wish', 'desire']
    },
    'disappointment': {
        'transitions': ['sadness', 'regret', 'neutral'],
        'trigger_words': ['let down', 'disappointed', 'unhappy']
    },
    'disapproval': {
        'transitions': ['anger', 'annoyance', 'neutral'],
        'trigger_words': ['disagree', 'against', 'no']
    },
    'disgust': {
        'transitions': ['contempt', 'anger', 'neutral'],
        'trigger_words': ['gross', 'disgusting', 'ew']
    },
    'embarrassment': {
        'transitions': ['shame', 'regret', 'neutral'],
        'trigger_words': ['embarrassed', 'awkward', 'cringe']
    },
    'excitement': {
        'transitions': ['joy', 'anticipation', 'neutral'],
        'trigger_words': ['excited', 'thrilled', 'can\'t wait']
    },
    'fear': {
        'transitions': ['anxiety', 'nervousness', 'neutral'],
        'trigger_words': ['scared', 'afraid', 'fear']
    },
    'gratitude': {
        'transitions': ['approval', 'admiration', 'neutral'],
        'trigger_words': ['thank', 'appreciate', 'grateful']
    },
    'grief': {
        'transitions': ['sadness', 'despair', 'neutral'],
        'trigger_words': ['loss', 'mourn', 'heartbroken']
    },
    'joy': {
        'transitions': ['amusement', 'excitement', 'neutral'],
        'trigger_words': ['happy', 'joyful', 'delighted']
    },
    'love': {
        'transitions': ['caring', 'admiration', 'neutral'],
        'trigger_words': ['love', 'adore', 'cherish']
    },
    'nervousness': {
        'transitions': ['fear', 'anxiety', 'neutral'],
        'trigger_words': ['nervous', 'anxious', 'worried']
    },
    'optimism': {
        'transitions': ['hope', 'excitement', 'neutral'],
        'trigger_words': ['optimistic', 'hopeful', 'positive']
    },
    'pride': {
        'transitions': ['confidence', 'satisfaction', 'neutral'],
        'trigger_words': ['proud', 'accomplished', 'achievement']
    },
    'realization': {
        'transitions': ['surprise', 'understanding', 'neutral'],
        'trigger_words': ['realize', 'understand', 'oh']
    },
    'relief': {
        'transitions': ['gratitude', 'contentment', 'neutral'],
        'trigger_words': ['relieved', 'thankful', 'phew']
    },
    'remorse': {
        'transitions': ['regret', 'guilt', 'neutral'],
        'trigger_words': ['sorry', 'apologize', 'regret']
    },
    'sadness': {
        'transitions': ['grief', 'loneliness', 'neutral'],
        'trigger_words': ['sad', 'upset', 'depressed']
    },
    'surprise': {
        'transitions': ['shock', 'amazement', 'neutral'],
        'trigger_words': ['surprised', 'wow', 'shocked']
    },
    'neutral': {
        'transitions': ['curiosity', 'interest', 'contentment'],
        'trigger_words': []
    }
}

class DeepScriptDialogue:
    def __init__(self):
        self.speakers = []
        self.scene = ""
        self.history = []
        self.states = {}
        self.tokenizer = None
        self.model = None

    def initialize(self):
        """Initialize the complete dialogue system"""
        self._load_model()
        self._get_speakers()
        self._get_scene()
        self._initialize_states()
        self._start_conversation()

    def _load_model(self):
        """Load the GPT-2 model and tokenizer"""
        if not os.path.exists(MODEL_DIR):
            raise FileNotFoundError(f"Model not found at {MODEL_DIR}")
        
        self.tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)
        self.model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)
        print("Model loaded successfully")

    def _get_speakers(self):
        """Get speaker information from user"""
        print("\n=== SPEAKER SETUP ===")
        
        # Get number of speakers
        while True:
            try:
                num_speakers = int(input("Enter number of speakers (2-6): "))
                if 2 <= num_speakers <= 6:
                    break
                print("Please enter between 2 and 6 speakers")
            except ValueError:
                print("Please enter a valid number")

        # Get each speaker's name
        self.speakers = []
        for i in range(1, num_speakers + 1):
            while True:
                name = input(f"Enter name for speaker {i}: ").strip()
                if name:
                    self.speakers.append(name)
                    break
                print("Name cannot be empty")

    def _get_scene(self):
        """Get scene description from user"""
        print("\n=== SCENE SETUP ===")
        while True:
            self.scene = input("Enter scene description: ").strip()
            if self.scene:
                break
            print("Scene description cannot be empty")

    def _initialize_states(self):
        """Initialize emotional states for each speaker"""
        self.states = {}
        initial_emotion = self._determine_initial_emotion()
        
        for speaker in self.speakers:
            self.states[speaker] = {
                'emotion': initial_emotion,
                'history': [],
                'interruptions': 0
            }

    def _determine_initial_emotion(self) -> str:
        """Determine initial emotion based on scene context"""
        scene_lower = self.scene.lower()
        
        if any(word in scene_lower for word in ['argue', 'fight', 'conflict']):
            return 'anger'
        elif any(word in scene_lower for word in ['happy', 'celebrate', 'joy']):
            return 'joy'
        elif any(word in scene_lower for word in ['sad', 'grief', 'loss']):
            return 'sadness'
        elif any(word in scene_lower for word in ['discuss', 'talk', 'meet']):
            return 'neutral'
        else:
            return 'neutral'

    def _start_conversation(self):
        """Begin the dialogue generation"""
        print("\n=== CONVERSATION START ===")
        
        # Get number of turns
        while True:
            try:
                turns = int(input("Enter number of dialogue turns (3-20): "))
                if 3 <= turns <= 20:
                    break
                print("Please enter between 3 and 20 turns")
            except ValueError:
                print("Please enter a valid number")

        # Start with random speaker
        current_speaker = random.choice(self.speakers)
        
        for _ in range(turns):
            # Check for interruption
            if random.random() < INTERRUPTION_CHANCE and len(self.speakers) > 1:
                interrupter = random.choice([s for s in self.speakers if s != current_speaker])
                self._handle_interruption(current_speaker, interrupter)
                current_speaker = interrupter
                continue
            
            # Generate dialogue
            dialogue = self._generate_dialogue(current_speaker)
            self._update_history(current_speaker, dialogue)
            
            # Update emotion state
            self._update_emotion(current_speaker, dialogue)
            
            # Print output
            print(f"[{self.states[current_speaker]['emotion']}] {current_speaker}: {dialogue}")
            
            # Switch speaker
            current_speaker = random.choice([s for s in self.speakers if s != current_speaker])

        self._print_final_conversation()

    def _generate_dialogue(self, speaker: str) -> str:
        """Generate dialogue for given speaker"""
        prompt = self._build_prompt(speaker)
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        
        outputs = self.model.generate(
            inputs,
            max_length=len(inputs[0]) + 50,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
            no_repeat_ngram_size=2
        )
        
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return self._clean_text(generated[len(prompt):])

    def _build_prompt(self, speaker: str) -> str:
        """Build the context prompt for generation"""
        prompt_parts = [
            f"Scene: {self.scene}",
            "Current conversation:",
            *self.history[-MAX_HISTORY:],
            f"Current emotion: {self.states[speaker]['emotion']}",
            f"{speaker}:"
        ]
        return "\n".join(prompt_parts)

    def _clean_text(self, text: str) -> str:
        """Clean and format generated text"""
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = re.sub(r'[^a-zA-Z0-9\s.,!?\']', '', text)  # Remove special chars
        text = text.strip()
        
        # Ensure proper sentence ending
        if not any(text.endswith(p) for p in ('.', '?', '!', '"', "'")):
            text += '.'
        
        return text[:500]  # Limit length

    def _update_history(self, speaker: str, dialogue: str):
        """Update conversation history"""
        entry = f"[{self.states[speaker]['emotion']}] {speaker}: {dialogue}"
        self.history.append(entry)
        self.states[speaker]['history'].append(entry)

    def _update_emotion(self, speaker: str, dialogue: str):
        """Update speaker's emotional state based on context"""
        current_emotion = self.states[speaker]['emotion']
        
        # Check for emotion triggers in dialogue
        for emotion, data in EMOTIONS.items():
            if any(trigger in dialogue.lower() for trigger in data['trigger_words']):
                self.states[speaker]['emotion'] = emotion
                return
        
        # If no triggers found, use normal transition
        possible_transitions = EMOTIONS[current_emotion]['transitions']
        self.states[speaker]['emotion'] = random.choice(possible_transitions)

    def _handle_interruption(self, current_speaker: str, interrupter: str):
        """Handle conversation interruption"""
        interruption_text = f"{interrupter}: [interrupting]"
        self.history.append(interruption_text)
        self.states[interrupter]['interruptions'] += 1
        print(interruption_text)

    def _print_final_conversation(self):
        """Print the complete conversation"""
        print("\n=== FINAL CONVERSATION ===")
        for line in self.history:
            print(line)

if __name__ == "__main__":
    dialogue_system = DeepScriptDialogue()
    dialogue_system.initialize()

Model loaded successfully

=== SPEAKER SETUP ===


Enter number of speakers (2-6):  4
Enter name for speaker 1:  aadarsha
Enter name for speaker 2:  aaditya
Enter name for speaker 3:  bishwa
Enter name for speaker 4:  jagdish



=== SCENE SETUP ===


Enter scene description:  boys went for shopping



=== CONVERSATION START ===


Enter number of dialogue turns (3-20):  8


[confusion] aaditya: You're not a man, are you? And what is this man's name? He's a real man. What's his name, Mr. C.? My name is Mr Rothstein. Mr C is the guy I.
[disapproval] aadarsha: That's not neutral. It's neutral, is it not? It is neutral! Now, why are we here? Because we're here, because we are here! It was neutral ! It can't be neutral now. Now listen, I know.
[confusion] bishwa: I'll get it back to you. Your name. Your friend. You know what you're doing. Aadasha. But I'm not interested in it. I want to talk to your friends. They're all here. We're.
[disapproval] aaditya: It'll be a good day. There's nothing here but confusion.     You were the one who told me you were here to see the movie. You can be here tonight. If you.
bishwa: [interrupting]
[disapproval] bishwa: You were in the theater.  You are now. Because you are not. It's a film. This is not a movie, it's not even a picture. A film, by the way, a television show.
aaditya: [interrupting]
[annoyance] aaditya: interrupt