# Preprocessing Debates

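Here we convert the raw debate transcripts (one JSON file per debate) into a single table with one row per statement, annotated with the speaker, date, party, and candidate/incumbent flags.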

## Old Debates

In [None]:
import os
import json
import pandas as pd

# Folder that is scanned for the debate JSON files.
directory = "data_dir"

# 🗃️ Incumbents as (last_name_lowercase, year_as_string) pairs.
# Entries sharing a line belong to the same election year.
INCUMBENT_PAIRS = {
    ("ford", "1976"),
    ("carter", "1980"),
    ("reagan", "1984"), ("bush", "1984"),
    ("bush", "1992"), ("quayle", "1992"),
    ("clinton", "1996"), ("gore", "1996"),
    ("bush", "2004"), ("cheney", "2004"),
    ("obama", "2012"), ("biden", "2012"),
    ("trump", "2020"), ("pence", "2020"),
}



# Maps a candidate's lowercase last name to their party.
# Each last name appears exactly once: a repeated key in a dict literal
# silently overrides the earlier entry, so people who ran for both offices
# (or who share a surname) are listed only in the first section they fit.
CANDIDATES = {
    # Presidential Candidates
    "kennedy": "Democratic",
    "nixon": "Republican",
    "ford": "Republican",
    "carter": "Democratic",
    "reagan": "Republican",
    "anderson": "Independent",
    "mondale": "Democratic",
    "bush": "Republican",
    "dukakis": "Democratic",
    "clinton": "Democratic",  # covers both Bill and Hillary Clinton
    "perot": "Independent",
    "dole": "Republican",
    "gore": "Democratic",
    "nader": "Green",
    "kerry": "Democratic",
    "obama": "Democratic",
    "mccain": "Republican",
    "romney": "Republican",
    "trump": "Republican",
    "biden": "Democratic",  # also a vice-presidential candidate

    # Vice-Presidential Candidates (1984 onwards) not already listed above
    "ferraro": "Democratic",
    "quayle": "Republican",
    "bentsen": "Democratic",
    "stockdale": "Independent",  # Perot's running mate, 1992 VP debate
    "kemp": "Republican",
    "lieberman": "Democratic",
    "cheney": "Republican",
    "edwards": "Democratic",
    "palin": "Republican",
    "ryan": "Republican",
    "kaine": "Democratic",
    "pence": "Republican",
    "harris": "Democratic",
    "vance": "Republican",
}



# Helper to extract last name and format it
def normalize_last_name(full_name):
    """Return the last whitespace-separated word of *full_name*, capitalized.

    Args:
        full_name: Speaker label, e.g. "Senator Kamala Harris".

    Returns:
        The final word passed through ``str.capitalize`` (first letter upper,
        remainder lower), or "UNKNOWN" when the label is falsy, is exactly
        "UNKNOWN", or contains nothing but whitespace.
    """
    if not full_name or full_name == "UNKNOWN":
        return "UNKNOWN"
    name_parts = full_name.split()
    if not name_parts:
        # Whitespace-only label: there is no word to index, so treat it as
        # missing instead of raising IndexError.
        return "UNKNOWN"
    return name_parts[-1].capitalize()

# Check incumbency
def check_incumbent(last_name, year):
    """Return True when (lowercased last_name, str(year)) is in INCUMBENT_PAIRS."""
    lookup_pair = (last_name.lower(), str(year))
    return lookup_pair in INCUMBENT_PAIRS

def check_candidate(last_name):
    """Look up *last_name* (case-insensitively) in CANDIDATES.

    Returns:
        ``(True, party)`` when the lowercased name is a key of CANDIDATES,
        otherwise ``(False, None)``.
    """
    try:
        party = CANDIDATES[last_name.lower()]
    except KeyError:
        return False, None
    return True, party

# 📜 Gather JSON file paths, leaving out every file whose name begins with "part"
json_files = []
for file_name in os.listdir(directory):
    if not file_name.endswith(".json"):
        continue
    if file_name.startswith("part"):
        continue
    json_files.append(os.path.join(directory, file_name))

# 📊 Store all data: one dict per transcript entry
all_rows = []

# Column order of the final DataFrame. Passing it explicitly keeps the columns
# present even when no transcript file was found, so the lookups below do not
# raise KeyError on an empty frame.
row_columns = [
    "year",
    "date",
    "actor",
    "dialogue",
    "is_candidate",
    "party",
    "VP_debate",
    "is_incumbent",
]

# 🔁 Process each file
for file_path in json_files:
    # Only the parse needs the file handle; release it before further work.
    with open(file_path, "r", encoding="utf-8") as f:
        debate = json.load(f)

    content = debate.get("content", [])

    # Join date parts and parse to a datetime.date
    date_str = " ".join(debate.get("date", []))
    try:
        date = pd.to_datetime(date_str, errors="raise").date()
        year = date.year
    except Exception:
        # Best effort: a missing or unparseable date must not abort the run;
        # the rows are kept with placeholder values instead.
        date = pd.NaT
        year = "UNKNOWN"
    year_str = str(year)

    # 🔎 Check if VP debate (first 5 entries only)
    first_5_dialogues = [entry.get("dialogue", "").lower() for entry in content[:5]]
    is_vp_debate = any("vice presidential" in dialogue for dialogue in first_5_dialogues)

    for entry in content:
        actor_raw = entry.get("actor", "UNKNOWN")
        dialogue = entry.get("dialogue", "")
        last_name = normalize_last_name(actor_raw)

        # Both lookups use the bare last name, so they must run before the
        # name is rewritten for disambiguation below.
        is_candidate, party = check_candidate(last_name)
        is_incumbent = check_incumbent(last_name, year)

        # Distinguish Bush Sr/Jr and Bill/Hillary Clinton
        if last_name.lower() == "bush":
            # "Bush Sr" for 1984/1988/1992, "Bush Jr" for 2000/2004;
            # any other year keeps the plain "Bush".
            if year_str in ("1984", "1988", "1992"):
                last_name = "Bush Sr"
            elif year_str in ("2000", "2004"):
                last_name = "Bush Jr"
        elif last_name.lower() == "clinton":
            # Decided by year alone: 2016 is labelled "Clinton (Hillary)",
            # every other year "Clinton (Bill)".
            if year_str == "2016":
                last_name = "Clinton (Hillary)"
            else:
                last_name = "Clinton (Bill)"

        # An "Edwards" speaking in 1960 is not treated as the candidate
        # listed under that last name in CANDIDATES.
        if last_name.lower() == "edwards" and year_str == "1960":
            party = None
            is_candidate = False

        all_rows.append({
            "year": year,
            "date": date,
            "actor": last_name,
            "dialogue": dialogue,
            "is_candidate": is_candidate,
            "party": party,
            "VP_debate": is_vp_debate,
            "is_incumbent": is_incumbent,
        })

# 🧱 Create DataFrame
df = pd.DataFrame(all_rows, columns=row_columns)

# Manually fix the VP debate column for Oct 11, 2000, Sep 26, 2008, and Oct 3, 2012 debates
for d in ["2000-10-11", "2008-09-26", "2012-10-03"]:
    df.loc[df["date"] == pd.to_datetime(d).date(), "VP_debate"] = False

# ✅ View
df.head(20)


Unnamed: 0,year,date,actor,dialogue,is_candidate,party,VP_debate,is_incumbent
0,2020,2020-10-07,Participants,Senator Kamala Harris (D-CA) and,False,,True,False
1,2020,2020-10-07,Moderator,Susan Page (USA Today),False,,True,False
2,2020,2020-10-07,Page,Good evening. From the University of Utah in S...,False,,True,False
3,2020,2020-10-07,Pence,Thank you.,True,Republican,True,True
4,2020,2020-10-07,Page,Senator Harris and Vice President Pence thank ...,False,,True,False
5,2020,2020-10-07,Harris,"Thank you Susan. Well, the American people hav...",True,Democratic,True,False
6,2020,2020-10-07,Page,"Thank you- Thank you, Senator Harris. Thank yo...",False,,True,False
7,2020,2020-10-07,Pence,Susan thank you. And I want to thank the Commi...,True,Republican,True,True
8,2020,2020-10-07,Page,"Thank you, Vice Pres- Thank you Vice President...",False,,True,False
9,2020,2020-10-07,Harris,"Oh, absolutely. Whatever the Vice Presidents c...",True,Democratic,True,False


In [38]:
# 🧮 Number of statements each actor made in each debate
statement_counts = (
    df.groupby(["date", "actor"])
    .size()
    .reset_index(name="statement_count")
)

# 🧹 Keep the first row per (date, actor) so each actor shows once per debate,
# 🪄 attach the statement counts, and 📅 order the result by date and actor
unique_actors = (
    df.drop_duplicates(subset=["date", "actor"])
    .loc[:, ["date", "actor", "VP_debate", "is_incumbent", "is_candidate", "party"]]
    .merge(statement_counts, on=["date", "actor"], how="left")
    .sort_values(by=["date", "actor"])
)

unique_actors

Unnamed: 0,date,actor,VP_debate,is_incumbent,is_candidate,party,statement_count
277,1960-09-26,Fleming,False,False,False,,2
274,1960-09-26,Kennedy,False,False,True,Democratic,17
276,1960-09-26,Nixon,False,False,True,Republican,10
278,1960-09-26,Novins,False,False,False,,5
275,1960-09-26,Smith,False,False,False,,23
...,...,...,...,...,...,...,...
148,2020-10-22,Biden,False,False,True,Democratic,84
145,2020-10-22,Moderator,False,False,False,,1
144,2020-10-22,Participants,False,False,False,,1
147,2020-10-22,Trump,False,True,True,Republican,122
