# Final Project

In [1]:
import pandas as pd



In [2]:
def load_movie_lines(lines_file_path):
    lines = {}
    with open(lines_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 5:
                line_id, character_id, movie_id, character_name, text = parts 
                lines[line_id] = {
                    "character_id": character_id,
                    "movie_id": movie_id,
                    "character_name": character_name,
                    "text": text
                }
    return lines
        

In [3]:
def load_movie_conversations(conversations_file_path):
    conversations = []
    with open(conversations_file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 4:
                character1_id, character2_id, movie_id, utterance_ids = parts
                utterance_ids = utterance_ids[1:-1].replace("'", "").split(', ')
                conversations.append({
                    "character1_id": character1_id,
                    "character2_id": character2_id,
                    "movie_id": movie_id,
                    "utterances": utterance_ids
                })
    return conversations

In [4]:
def load_movie_metadata(metadata_file_path):
    movie_metadata = {}
    with open(metadata_file_path, 'r', encoding='utf-8', errors='ignore') as f: 
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) > 5:
                movie_id, title, genre = parts[0], parts[1], parts[5]
                movie_metadata[movie_id] = genre
    return movie_metadata

In [5]:
lines_file = "./Cornell_Movie_Dialogue_Corpus/movie_lines.txt"
conversation_file = "./Cornell_Movie_Dialogue_Corpus/movie_conversations.txt"
metadata_file = "./Cornell_Movie_Dialogue_Corpus/movie_titles_metadata.txt"

lines = load_movie_lines(lines_file)
conversations = load_movie_conversations(conversation_file)
movie_metadata = load_movie_metadata(metadata_file)

In [6]:
data = []

for conv in conversations:
    utterance_ids = conv['utterances']

    for i in range(1, len(utterance_ids)):
        response_id = utterance_ids[i]
        context_ids = utterance_ids[max(0, i-5):i]

        # Create the convo history
        history = []
        for utt_id in context_ids:
            if utt_id in lines:  # Check if utterance exists in lines
                history.append(lines[utt_id]['text'])
            else:
                continue
                #print(f"Missing utterance ID in conversation history: {utt_id}")  # Debugging line

        
        if response_id in lines:
            response = lines[response_id]['text']

            # Meta data for movie and character
            movie_id = conv['movie_id']
            character1_name = lines[context_ids[0]]['character_name'] if context_ids and context_ids[0] in lines else ''
            character2_name = lines[response_id]['character_name']

            if character1_name == character2_name:
                continue

            # Append data
            data.append({
                "movie_id": movie_id,
                "character_1": character1_name,
                "character_2": character2_name,
                "conversation_history": history,
                "response": response
            })

In [7]:
# Convert to DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
df.head()

Unnamed: 0,movie_id,character_1,character_2,conversation_history,response
0,m0,BIANCA,CAMERON,[Can we make this quick? Roxanne Korrine and ...,"Well, I thought we'd start with pronunciation,..."
1,m0,BIANCA,BIANCA,[Can we make this quick? Roxanne Korrine and ...,Not the hacking and gagging and spitting part....
2,m0,BIANCA,CAMERON,[Can we make this quick? Roxanne Korrine and ...,Okay... then how 'bout we try out some French ...
3,m0,BIANCA,CAMERON,[You're asking me out. That's so cute. What's...,Forget it.
4,m0,BIANCA,CAMERON,"[No, no, it's my fault -- we didn't have a pro...",Cameron.


In [8]:
df['genre'] = df['movie_id'].map(movie_metadata)

In [14]:
df.head(-50)

Unnamed: 0,movie_id,character_1,character_2,conversation_history,response,genre
0,m0,BIANCA,CAMERON,[Can we make this quick? Roxanne Korrine and ...,"Well, I thought we'd start with pronunciation,...","['comedy', 'romance']"
1,m0,BIANCA,BIANCA,[Can we make this quick? Roxanne Korrine and ...,Not the hacking and gagging and spitting part....,"['comedy', 'romance']"
2,m0,BIANCA,CAMERON,[Can we make this quick? Roxanne Korrine and ...,Okay... then how 'bout we try out some French ...,"['comedy', 'romance']"
3,m0,BIANCA,CAMERON,[You're asking me out. That's so cute. What's...,Forget it.,"['comedy', 'romance']"
4,m0,BIANCA,CAMERON,"[No, no, it's my fault -- we didn't have a pro...",Cameron.,"['comedy', 'romance']"
...,...,...,...,...,...,...
221359,m615,MAITRE D',MAITRE D',"[Yes, sir, name, please?, Food!, Do you have a...","I'm sorry, sir. We only seat by reservation.","['comedy', 'sci-fi']"
221360,m615,MONSTER,MAITRE D',[Drink!],"Oh, no sir-ee. If yo don't have a reservation...","['comedy', 'sci-fi']"
221361,m615,MONSTER,MAITRE D',[Foooooood!],Now just one moment. There's no need for roug...,"['comedy', 'sci-fi']"
221362,m615,MONSTER,MONSTER,"[Foooooood!, Now just one moment. There's no ...",GRRRHMMNNNJKJMMMNN!,"['comedy', 'sci-fi']"


In [18]:
number = len(df)

print(number)

221414
