In [13]:
import csv

input_file = "merged_cast.csv"
output_file = "merged_cast_fixed.csv"

# Copy every record across unchanged; the only difference in the output is
# that QUOTE_ALL wraps each field in quotes.
with open(input_file, "r", newline="", encoding="utf-8") as infile:
    with open(output_file, "w", newline="", encoding="utf-8") as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)
        # writerows drains the reader row by row, same as an explicit loop.
        writer.writerows(reader)

print("CSV file has been rewritten as:", output_file)


CSV file has been rewritten as: merged_cast_fixed.csv


In [3]:
import csv

input_file = "tv_shows_temp_fixed.csv"
output_file = "tv_shows_temp_fixed_null.csv"

# Rewrite the CSV with every field quoted, substituting the literal text \N
# wherever first_air_date is blank (empty or whitespace-only).
with open(input_file, "r", newline="", encoding="utf-8") as infile, \
     open(output_file, "w", newline="", encoding="utf-8") as outfile:
    reader = csv.DictReader(infile)
    # Reuse the input header so the output has the same columns in the same order.
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    for row in reader:
        # DictReader fills fields that are missing from a short row with None
        # (its default restval), so row.get(..., "") can still return None.
        # Coalesce to "" first; calling .strip() on None would raise
        # AttributeError and abort the rewrite part-way through the file.
        if (row.get("first_air_date") or "").strip() == "":
            row["first_air_date"] = r"\N"
        writer.writerow(row)

print("CSV fixed; output written to", output_file)


CSV fixed; output written to tv_shows_temp_fixed_null.csv


In [None]:
import pandas as pd

# Read both files, forcing person_id to text so ids compare as strings.
persons_df = pd.read_csv("merged_cast.csv", dtype={"person_id": str})
tv_persons_df = pd.read_csv("tv_persons_temp.csv", dtype={"person_id": str})

# Normalise the join key: drop surrounding whitespace in both frames.
for frame in (persons_df, tv_persons_df):
    frame["person_id"] = frame["person_id"].str.strip()

# Row counts before filtering.
print("Total in persons.csv:", len(persons_df))
print("Total in tv_persons.csv:", len(tv_persons_df))

# Mark the tv rows whose person_id already appears in persons_df,
# then keep only the rows that are NOT marked.
already_known = tv_persons_df["person_id"].isin(persons_df["person_id"])
filtered_tv_persons_df = tv_persons_df[~already_known]

# Row count after filtering.
print("Filtered tv_persons.csv rows:", len(filtered_tv_persons_df))

# Persist the remaining rows without the index column.
filtered_tv_persons_df.to_csv("tv_persons_filtered.csv", index=False, encoding="utf-8")


Total in persons.csv: 3553674
Total in tv_persons.csv: 934137
Filtered tv_persons.csv rows: 374544


In [12]:
import pandas as pd

def merge_csv_files(file1, file2, output_file, key_columns=None):
    """
    Merge two CSV files into one and remove duplicate rows.

    Every column is read as text with NA detection switched off, so cell
    values such as "NA" or "null" are kept verbatim instead of being parsed
    as missing values and written back out as empty fields.

    Parameters:
    - file1: Path to the first CSV file. Its rows come first in the output.
    - file2: Path to the second CSV file.
    - output_file: Path for the output merged CSV file.
    - key_columns: List of columns to use as keys for identifying duplicates.
                   If None (or empty), duplicates are dropped based on all
                   columns.

    Returns:
    - None. The merged rows are written to output_file (without the index
      column) and a confirmation line is printed.
    """
    # na_filter=False: dtype=str alone does not stop pandas from turning
    # literal text like "NA" / "null" into NaN, which would then be saved as
    # an empty field and compare equal to genuinely empty cells when
    # de-duplicating.
    frames = [
        pd.read_csv(path, dtype=str, na_filter=False)
        for path in (file1, file2)
    ]

    # Stack file1's rows on top of file2's and renumber the index.
    merged_df = pd.concat(frames, ignore_index=True)

    # A falsy key_columns means "compare whole rows"; subset=None is the
    # drop_duplicates default. The first occurrence of each duplicate is kept.
    merged_df = merged_df.drop_duplicates(subset=key_columns or None)

    # Write the merged DataFrame to CSV
    merged_df.to_csv(output_file, index=False)
    print(f"Merged file written to: {output_file}")


# Combine the movie and TV cast exports into merged_cast.csv, treating rows
# that share both credit_id and person_id as duplicates.
merge_csv_files(
    file1="cast_temp.csv",
    file2="tv_cast_temp.csv",
    output_file="merged_cast.csv",
    key_columns=["credit_id", "person_id"],
)



Merged file written to: merged_cast.csv
