In [3]:
import pandas as pd
import re
import chardet

# Matches stage directions such as "[burps]" or "(sighs)", delimiters included.
_BRACKETED_TEXT = re.compile(r'\[.*?\]|\(.*?\)')


def _detect_encoding(path):
    """Return the encoding chardet guesses for ``path``, or 'utf-8' when no guess is available."""
    try:
        import chardet
    except ImportError:
        return 'utf-8'
    with open(path, 'rb') as f:
        detected = chardet.detect(f.read())
    # chardet reports ``None`` when it cannot decide (e.g. an empty file).
    return detected['encoding'] or 'utf-8'


def _clean_and_merge_lines(texts):
    """Strip bracketed text, drop blank lines and fold colon-less lines into the preceding line."""
    merged = []
    # Missing cells come back from read_csv as NaN; re.sub would raise on them.
    for raw in texts.dropna().astype(str):
        line = _BRACKETED_TEXT.sub('', raw).strip()
        if not line:
            continue
        if ':' in line:
            merged.append(line)
        elif merged:
            # Continuation of the previous speaker's line.
            merged[-1] += ' ' + line
        # A colon-less line before any speaker has nobody to attach to and is dropped.
    return merged


def preprocess_rick_and_morty_script(input_file, output_file):
    """Turn a raw transcript CSV into a (season, episode, character, dialogue) CSV.

    The input is read without a header row as three columns: season, episode
    and text. Within each (season, episode) group the text lines are cleaned:

    1. Anything inside square brackets or parentheses is removed, together
       with the brackets.
    2. Lines that are blank after that removal are dropped.
    3. A line without a colon is appended, separated by a space, to the
       previous line of the same episode.
    4. Each remaining line is split at its first colon into a character name
       and a dialogue string, both stripped of surrounding whitespace.

    Args:
        input_file: Path of the raw CSV file.
        output_file: Path the processed CSV is written to (UTF-8, with header,
            without index).

    Returns:
        None. The result is written to ``output_file``.
    """
    encoding = _detect_encoding(input_file)

    # Load the CSV file with the detected encoding
    data = pd.read_csv(input_file, header=None, names=['season', 'episode', 'text'], encoding=encoding)

    processed_rows = []
    for (season, episode), group in data.groupby(['season', 'episode']):
        for line in _clean_and_merge_lines(group['text']):
            # Every merged line contains a colon, so the split always yields both parts.
            character, _, dialogue = line.partition(':')
            processed_rows.append([season, episode, character.strip(), dialogue.strip()])

    # Convert the list back to a DataFrame
    processed_data_df = pd.DataFrame(processed_rows, columns=['season', 'episode', 'character', 'dialogue'])

    # Write the processed data into a new CSV file
    processed_data_df.to_csv(output_file, index=False, encoding='utf-8')

# Usage: preprocess the raw transcript and write the cleaned CSV next to it.
# NOTE(review): the later cells read 'Processed_RickandMortySeason1-7.csv', which this
# call does not produce — confirm which input file is the intended one.
input_file, output_file = (
    'RickandMortySeason1-3.csv',
    'Processed_RickandMortySeason1-3.csv',
)
preprocess_rick_and_morty_script(input_file=input_file, output_file=output_file)


In [1]:
import pandas as pd
import json

# Load the CSV file
file_path = 'Processed_RickandMortySeason1-7.csv'
df = pd.read_csv(file_path)

# Empty CSV fields are read back as NaN. A NaN character would make the
# substring test below raise TypeError, and a NaN dialogue would be written
# as a bare `NaN` token, which is not valid JSON. Skip such rows.
df_valid = df.dropna(subset=['character', 'dialogue'])

# Process the DataFrame into the desired JSON format:
# any character whose name contains 'Rick' speaks as "gpt", everyone else as "human".
conversations = [
    {
        "from": 'gpt' if 'Rick' in character else 'human',
        "value": dialogue
    }
    for character, dialogue in zip(df_valid['character'], df_valid['dialogue'])
]

# Create the final JSON structure
json_output = {
    "conversations": conversations
}

# Save the JSON to a file
output_file_path = 'RickDialogues.json'
with open(output_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_output, json_file, indent=4)

print(f"JSON file saved as {output_file_path}")

JSON file saved as RickDialogues.json


In [2]:
# Read the json file
with open('RickDialogues.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

all_conversations = data["conversations"]

# Number of positions on each side of a human line that are searched for a GPT line
CONTEXT_WINDOW = 2

# Find the indices of GPT dialogues (kept as a list, plus a set for O(1) membership tests)
gpt_indices = [i for i, conv in enumerate(all_conversations) if conv["from"] == "gpt"]
gpt_index_set = set(gpt_indices)


def _has_gpt_neighbour(position):
    """Return True if a GPT line lies within CONTEXT_WINDOW positions of ``position``."""
    first = max(0, position - CONTEXT_WINDOW)
    last = min(len(all_conversations), position + CONTEXT_WINDOW + 1)
    return any(j in gpt_index_set for j in range(first, last))


# Keep every GPT line and every human line that has a GPT line nearby.
# Walking the list once keeps the original order by construction; sorting by
# list.index() afterwards would misplace repeated lines, because index()
# always returns the position of the first equal entry.
filtered_conversations = [
    conv
    for i, conv in enumerate(all_conversations)
    if conv["from"] == "gpt" or (conv["from"] == "human" and _has_gpt_neighbour(i))
]

# Save the filtered dialogues to a new json with correct sequence
filtered_data = {"conversations": filtered_conversations}
with open('RickDialogues_modified_2.json', 'w', encoding='utf-8') as outfile:
    json.dump(filtered_data, outfile, indent=4)

print("Filtered dialogues saved to 'RickDialogues_modified_2.json'.")

Filtered dialogues saved to 'RickDialogues_modified_2.json'.


In [3]:
# Read back the filtered dialogues written by the previous step
input_file_path = 'RickDialogues_modified_2.json'
with open(input_file_path, 'r') as json_file:
    data = json.load(json_file)

conversations = data["conversations"]

# Work out the base chunk size; the first `remainder` files each take one extra entry
num_files = 10
num_conversations = len(conversations)
conversations_per_file, remainder = divmod(num_conversations, num_files)

# Write consecutive slices of the list, one JSON file per slice
start_index = 0
for part_number in range(1, num_files + 1):
    chunk_size = conversations_per_file
    if part_number <= remainder:
        chunk_size += 1
    end_index = start_index + chunk_size

    # Each part file has the same top-level structure as the input file
    split_json_output = {"conversations": conversations[start_index:end_index]}

    split_file_path = f'RickDialogues_modified_2_part_{part_number}.json'
    with open(split_file_path, 'w') as split_json_file:
        json.dump(split_json_output, split_json_file, indent=4)

    print(f"JSON file saved as {split_file_path}")

    # The next slice starts where this one ended
    start_index = end_index


JSON file saved as RickDialogues_modified_2_part_1.json
JSON file saved as RickDialogues_modified_2_part_2.json
JSON file saved as RickDialogues_modified_2_part_3.json
JSON file saved as RickDialogues_modified_2_part_4.json
JSON file saved as RickDialogues_modified_2_part_5.json
JSON file saved as RickDialogues_modified_2_part_6.json
JSON file saved as RickDialogues_modified_2_part_7.json
JSON file saved as RickDialogues_modified_2_part_8.json
JSON file saved as RickDialogues_modified_2_part_9.json
JSON file saved as RickDialogues_modified_2_part_10.json


In [1]:
import pandas as pd

# Read the processed transcript
file_path = 'Processed_RickandMortySeason1-7.csv'  # Update this with the correct path to your file
data = pd.read_csv(file_path)


def _canonical_name(name):
    """Map a lower-cased name containing 'rick' to 'Rick' and one containing 'morty' to 'Morty'."""
    if 'rick' in name:
        return 'Rick'
    if 'morty' in name:
        return 'Morty'
    return name


# Lower-case every name, then collapse all Rick / Morty variants into one label each.
# 'rick' is checked first, so a name containing both becomes 'Rick'.
data['character'] = data['character'].str.lower().apply(_canonical_name)

# Number of distinct characters and number of dialogue lines per character
unique_characters = data['character'].nunique()
character_dialogue_counts = data['character'].value_counts()

# Print the results
print(f"Number of unique characters: {unique_characters}")
print("Number of dialogues per character:")
print(character_dialogue_counts)


Number of unique characters: 472
Number of dialogues per character:
character
Rick                   2916
Morty                  1993
jerry                   677
summer                  580
beth                    497
                       ... 
villager 1                1
owner                     1
tickets please guy        1
hamster news anchor       1
beth and jerry            1
Name: count, Length: 472, dtype: int64
