# Preprocessing Debates

Here we turn our dataset into a more usable dataset.

## Old Debates

In [1]:
import os
import json
import pandas as pd

directory= "data_dir"

# üóÉÔ∏è Incumbents listed as (last_name_lowercase, year)
INCUMBENT_PAIRS = {
    ("ford", "1976"),
    ("carter", "1980"),
    ("reagan", "1984"),
    ("bush", "1984"),
    ("bush", "1992"),
    ("quayle", "1992"),
    ("clinton", "1996"),
    ("gore", "1996"),
    ("bush", "2004"),
    ("cheney", "2004"),
    ("obama", "2012"),
    ("biden", "2012"),
    ("trump", "2020"),
    ("pence", "2020"),
}

WINNER_PAIRS = {
    ("kennedy", "1960"),
    ("carter", "1976"),
    ("reagan", "1980"),
    ("bush", "1980"),
    ("reagan", "1984"),
    ("bush", "1984"),
    ("bush", "1988"),
    ("quayle", "1988"),
    ("clinton", "1992"),
    ("gore", "1992"),
    ("clinton", "1996"),
    ("gore", "1996"),
    ("bush", "2000"),
    ("cheney", "2000"),
    ("bush", "2004"),
    ("cheney", "2004"),
    ("obama", "2008"),
    ("biden", "2008"),
    ("obama", "2012"),
    ("biden", "2012"),
    ("trump", "2016"),
    ("pence", "2016"),
    ("biden", "2020"),
    ("harris", "2020")}

CANDIDATES = {
    # Presidential Candidates
    "kennedy": "Democratic",
    "nixon": "Republican",
    "ford": "Republican",
    "carter": "Democratic",
    "reagan": "Republican",
    "anderson": "Independent",
    "mondale": "Democratic",
    "bush": "Republican",
    "dukakis": "Democratic",
    "clinton": "Democratic",
    "perot": "Independent",
    "dole": "Republican",
    "gore": "Democratic",
    "nader": "Green",
    "kerry": "Democratic",
    "obama": "Democratic",
    "mccain": "Republican",
    "romney": "Republican",
    "trump": "Republican",
    "clinton": "Democratic",
    "biden": "Democratic",


    # Vice-Presidential Candidates (1984 onwards)
    "ferraro": "Democratic",
    "quayle": "Republican",
    "bentsen": "Democratic",
    "kemp": "Republican",
    "lieberman": "Democratic",
    "cheney": "Republican",
    "edwards": "Democratic",
    "palin": "Republican",
    "biden": "Democratic",
    "ryan": "Republican",
    "kaine": "Democratic",
    "pence": "Republican",
    "harris": "Democratic",
    "vance": "Republican",
}



# Helper to extract last name and format it
def normalize_last_name(full_name):
    if not full_name or full_name == "UNKNOWN":
        return "UNKNOWN"
    last_name = full_name.strip().split()[-1]
    return last_name.capitalize()

# Check incumbency
def check_incumbent(last_name, year):
    return (last_name.lower(), str(year)) in INCUMBENT_PAIRS

def check_winner(last_name, year):
    return (last_name.lower(), str(year)) in WINNER_PAIRS

def check_candidate(last_name):
    key = last_name.lower()
    if key in CANDIDATES:
        return True, CANDIDATES[key]
    else:
        return False, None

# üìú Gather JSON file paths
json_files = [
    os.path.join(directory, f)
    for f in os.listdir(directory)
    if f.endswith(".json")
]

#Remove all files that are begin with "part"
json_files = [f for f in json_files if not os.path.basename(f).startswith("part")]

# üìä Store all data
all_rows = []

# üîÅ Process each file
for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        debate = json.load(f)
        content = debate.get("content", [])
        # Join date parts and parse to datetime object
        date_str = " ".join(debate.get("date", []))
        try:
            date = pd.to_datetime(date_str, errors="raise")
            date = date.date()
            year = date.year
        except Exception:
            date = pd.NaT
            year = "UNKNOWN"

        # üîé Check if VP debate (first 5 entries only)
        first_5_dialogues = [entry.get("dialogue", "").lower() for entry in content[:5]]
        is_vp_debate = any("vice presidential" in dialogue for dialogue in first_5_dialogues)

        for entry in content:
            actor_raw = entry.get("actor", "UNKNOWN")
            dialogue = entry.get("dialogue", "")
            last_name = normalize_last_name(actor_raw)
            
            is_candidate, party = check_candidate(last_name)
            is_incumbent = check_incumbent(last_name, year)
            is_winner = check_winner(last_name, year)

            # Distinguish Bush Sr/Jr and Bill/Hillary Clinton
            if last_name.lower() == "bush":
                # Use "Bush Sr" for 1988/1992, "Bush Jr" for 2000/2004
                if str(year) in ["1984","1988", "1992"]:
                    last_name = "Bush Sr"
                elif str(year) in ["2000", "2004"]:
                    last_name = "Bush Jr"
            elif last_name.lower() == "clinton":
                # If actor_raw contains "Hillary", use "Clinton (Hillary)", else "Clinton (Bill)"
                if str(year)=="2016":
                    last_name = "Clinton (Hillary)"
                else:
                    last_name = "Clinton (Bill)"
                    
            if last_name.lower()=="edwards" and str(year)=="1960":
                party= None
                is_candidate = False

            all_rows.append({
                "year": year,
                "date": date,
                "actor": last_name,
                "dialogue": dialogue,
                "is_candidate": is_candidate,
                "party": party,
                "is_winner": is_winner,
                "VP_debate": is_vp_debate,
                "is_incumbent": is_incumbent
            })

# üß± Create DataFrame
df_debates = pd.DataFrame(all_rows)

# Manually fix the VP debate column for Oct 11, 2000, Sep 26, 2008, and Oct 3, 2012 debates
for d in ["2000-10-11", "2008-09-26", "2012-10-03"]:
    df_debates.loc[df_debates["date"] == pd.to_datetime(d).date(), "VP_debate"] = False

# ‚úÖ View
df_debates.head(20)


Unnamed: 0,year,date,actor,dialogue,is_candidate,party,is_winner,VP_debate,is_incumbent
0,2020,2020-10-07,Participants,Senator Kamala Harris (D-CA) and,False,,False,True,False
1,2020,2020-10-07,Moderator,Susan Page (USA Today),False,,False,True,False
2,2020,2020-10-07,Page,Good evening. From the University of Utah in S...,False,,False,True,False
3,2020,2020-10-07,Pence,Thank you.,True,Republican,False,True,True
4,2020,2020-10-07,Page,Senator Harris and Vice President Pence thank ...,False,,False,True,False
5,2020,2020-10-07,Harris,"Thank you Susan. Well, the American people hav...",True,Democratic,True,True,False
6,2020,2020-10-07,Page,"Thank you- Thank you, Senator Harris. Thank yo...",False,,False,True,False
7,2020,2020-10-07,Pence,Susan thank you. And I want to thank the Commi...,True,Republican,False,True,True
8,2020,2020-10-07,Page,"Thank you, Vice Pres- Thank you Vice President...",False,,False,True,False
9,2020,2020-10-07,Harris,"Oh, absolutely. Whatever the Vice Presidents c...",True,Democratic,True,True,False


In [2]:
def summarize_debate_actors(
    df,
    only_candidates=True
):
    """
    Summarize actors in a debate DataFrame, counting statements per actor per debate.

    Args:
        df (pd.DataFrame): Debate DataFrame.
        only_candidates (bool): If True, include only candidates.

    Returns:
        pd.DataFrame: Summary with one row per actor per debate.
    """
    # Ensure required columns exist
    required_cols = {"date", "actor", "VP_debate", "is_incumbent", "is_candidate", "party", "dialogue", "is_winner"}
    if not required_cols.issubset(df.columns):
        raise ValueError(f"Input DataFrame must contain columns: {required_cols}")

    # Drop duplicates to get one row per actor per debate
    unique_actors = df.drop_duplicates(subset=["date", "actor"])[
        ["date", "actor",  "is_candidate", "party", "is_winner", "VP_debate", "is_incumbent",]
    ]

    # Count number of statements per actor per debate
    statement_counts = df.groupby(["date", "actor"]).size().reset_index(name="statement_count")

    # Merge counts into unique_actors
    unique_actors = unique_actors.merge(statement_counts, on=["date", "actor"], how="left")

    # Optionally filter only candidates
    if only_candidates:
        unique_actors = unique_actors[unique_actors["is_candidate"]]

    # Sort by date and actor
    unique_actors = unique_actors.sort_values(by=["date", "actor"]).reset_index(drop=True)

    return unique_actors


summarize_debate_actors(df_debates)

Unnamed: 0,date,actor,is_candidate,party,is_winner,VP_debate,is_incumbent,statement_count
0,1960-09-26,Kennedy,True,Democratic,True,False,False,17
1,1960-09-26,Nixon,True,Republican,False,False,False,10
2,1960-10-07,Kennedy,True,Democratic,True,False,False,13
3,1960-10-07,Nixon,True,Republican,False,False,False,12
4,1960-10-13,Kennedy,True,Democratic,True,False,False,14
...,...,...,...,...,...,...,...,...
88,2020-09-29,Trump,True,Republican,False,False,True,341
89,2020-10-07,Harris,True,Democratic,True,True,False,62
90,2020-10-07,Pence,True,Republican,False,True,True,89
91,2020-10-22,Biden,True,Democratic,True,False,False,84


## Add txt based debates

In [3]:
import re

def extract_debate_data(
    file_path,
    year,
    date,
    vp_debate,
    candidate_info
):
    """
    Extracts structured debate data from a transcript file.

    Args:
        file_path (str): Path to the transcript text file.
        year (int): Year of the debate.
        date (str): Date of the debate in 'YYYY-MM-DD' format.
        vp_debate (bool): Whether this is a vice-presidential debate.
        candidate_info (dict): Dictionary mapping speaker last names to:
            {
                "is_candidate": bool,
                "party": str,
                "is_winner": bool,
                "is_incumbent": bool
            }

    Returns:
        pd.DataFrame: Structured DataFrame of the debate.
    """

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    data = []
    current_actor = None
    current_text = []
    
    date= pd.to_datetime(date, errors="coerce").date() if date else None

    # Match speaker lines like "JAKE TAPPER, CNN MODERATOR:" or "BIDEN:"
    speaker_line_pattern = re.compile(r'^([A-Z][A-Z\s\.\-]*)(?:, [A-Z\s\.]+)?:\s*(.*)')

    for line in lines:
        line = line.strip()
        if not line:
            continue
    
        match = speaker_line_pattern.match(line)
        if match:
            # Store previous dialogue block
            if current_actor and current_text:
                actor_info = candidate_info.get(current_actor, {
                    'is_candidate': False,
                    'party': None,
                    'is_winner': False,
                    'is_incumbent': False
                })
                data.append({
                    "year": year,
                    "date": date,
                    "actor": current_actor,
                    "dialogue": ' '.join(current_text).strip(),
                    "is_candidate": actor_info['is_candidate'],
                    "party": actor_info['party'],
                    "is_winner": actor_info['is_winner'],
                    "VP_debate": vp_debate,
                    "is_incumbent": actor_info['is_incumbent']
                })

            full_name = match.group(1).strip()
            last_name = full_name.split()[-1].title()
            current_actor = last_name
            first_line = match.group(2)
            current_text = [first_line] if first_line else []
        else:
            current_text.append(line)

    # Add final block
    if current_actor and current_text:
        actor_info = candidate_info.get(current_actor, {
            'is_candidate': False,
            'party': None,
            'is_winner': False,
            'is_incumbent': False
        })
        data.append({
            "year": year,
            "date": date,
            "actor": current_actor,
            "dialogue": ' '.join(current_text).strip(),
            "is_candidate": actor_info['is_candidate'],
            "party": actor_info['party'],
            "is_winner": actor_info['is_winner'],
            "VP_debate": vp_debate,
            "is_incumbent": actor_info['is_incumbent']
        })

    return pd.DataFrame(data)


In [4]:
debate_1992_first_half=extract_debate_data(
    file_path="data_dir/transcript_1992_oct_15_first_half.txt",
    year=1992,
    date="1992-10-15",
    vp_debate=False,
    candidate_info={
        "Bush": {
            "is_candidate": True,
            "party": "Republican",
            "is_winner": False,
            "is_incumbent": True
        },
        "Clinton": {
            "is_candidate": True,
            "party": "Democratic",
            "is_winner": True,
            "is_incumbent": False
        },
        "Perot": {
            "is_candidate": True,
            "party": "Independent",
            "is_winner": False,
            "is_incumbent": False
        }
    }
)

#Rename Bush to Bush Sr and Clinton to Clinton (Bill)
debate_1992_first_half.loc[debate_1992_first_half["actor"] == "Bush", "actor"] = "Bush Sr"
debate_1992_first_half.loc[debate_1992_first_half["actor"] == "Clinton", "actor"] = "Clinton (Bill)"
debate_1992_first_half.head(20)

Unnamed: 0,year,date,actor,dialogue,is_candidate,party,is_winner,VP_debate,is_incumbent
0,1992,1992-10-15,Simpson,Good evening and welcome to this second of thr...,False,,False,False,False
1,1992,1992-10-15,Bush Sr,Let‚Äôs go.,True,Republican,False,False,True
2,1992,1992-10-15,Simpson,And I think the first question is over here.,False,,False,False,False
3,1992,1992-10-15,Question,Yes. I‚Äôd like to direct my question to Mr. Per...,False,,False,False,False
4,1992,1992-10-15,Perot,That‚Äôs right at the top of my agenda. We‚Äôve sh...,True,Independent,False,False,False
5,1992,1992-10-15,Simpson,"Thank you, Mr. Perot. I see that the president...",False,,False,False,False
6,1992,1992-10-15,Bush Sr,"Carole, the thing that saved us in this global...",True,Republican,False,False,True
7,1992,1992-10-15,Simpson,Governor Clinton.,False,,False,False,False
8,1992,1992-10-15,Clinton (Bill),"I‚Äôd like to answer the question, because I‚Äôve ...",True,Democratic,True,False,False
9,1992,1992-10-15,Simpson,Thank you. I think we have a question over here.,False,,False,False,False


In [5]:

#concat with the rest of the data
df_debates = pd.concat([df_debates, debate_1992_first_half], ignore_index=True)
summary=summarize_debate_actors(df_debates)
summary[summary["date"]==pd.to_datetime("1992-10-15").date()]

Unnamed: 0,date,actor,is_candidate,party,is_winner,VP_debate,is_incumbent,statement_count
35,1992-10-15,Bush Sr,True,Republican,False,False,True,47
36,1992-10-15,Clinton (Bill),True,Democratic,True,False,False,24
37,1992-10-15,Perot,True,Independent,False,False,False,45


In [6]:
debate_2024_biden=extract_debate_data(
    file_path="data_dir/transcript_2024_Trump_Biden.txt",
    year=2024,
    date="2024-07-27",
    vp_debate=False,
    candidate_info={
        "Trump": {
            "is_candidate": True,
            "party": "Republican",
            "is_winner": True,
            "is_incumbent": False
        },
        "Biden": {
            "is_candidate": True,
            "party": "Democratic",
            "is_winner": False,
            "is_incumbent": True
        }
    }
)

df_debates = pd.concat([df_debates, debate_2024_biden], ignore_index=True)
debate_2024_biden.head(20)

Unnamed: 0,year,date,actor,dialogue,is_candidate,party,is_winner,VP_debate,is_incumbent
0,2024,2024-07-27,Tapper,"We‚Äôre live from Georgia, a key battleground st...",False,,False,False,False
1,2024,2024-07-27,Bash,This debate is being produced by CNN and it‚Äôs ...,False,,False,False,False
2,2024,2024-07-27,Tapper,"I‚Äôm Jake Tapper, anchor of CNN‚Äôs ‚ÄúThe Lead‚Äù an...",False,,False,False,False
3,2024,2024-07-27,Bash,"When it‚Äôs time for a candidate to speak, his m...",False,,False,False,False
4,2024,2024-07-27,Tapper,Now please welcome the 46th president of the U...,False,,False,False,False
5,2024,2024-07-27,Biden,How are you? Good to be here. Thank you.,True,Democratic,False,False,True
6,2024,2024-07-27,Tapper,And please welcome the 45th president of the U...,False,,False,False,False
7,2024,2024-07-27,Biden,You have to take a look at what I was left whe...,True,Democratic,False,False,True
8,2024,2024-07-27,Tapper,Thank you. President Trump?,False,,False,False,False
9,2024,2024-07-27,Trump,We had the greatest economy in the history of ...,True,Republican,True,False,False


In [7]:
debate_2024_harris=extract_debate_data(
    file_path="data_dir/transcript_2024_Trump_Harris.txt",
    year=2024,
    date="2024-09-10",
    vp_debate=False,
    candidate_info={
        "Trump": {
            "is_candidate": True,
            "party": "Republican",
            "is_winner": True,
            "is_incumbent": False
        },
        "Harris": {
            "is_candidate": True,
            "party": "Democratic",
            "is_winner": False,
            "is_incumbent": False
        }
    }
)

df_debates = pd.concat([df_debates, debate_2024_harris], ignore_index=True)
debate_2024_harris.head(20)

Unnamed: 0,year,date,actor,dialogue,is_candidate,party,is_winner,VP_debate,is_incumbent
0,2024,2024-09-10,Muir,"Tonight, the high-stakes showdown here in Phil...",False,,False,False,False
1,2024,2024-09-10,Davis,A historic race for president upended just wee...,False,,False,False,False
2,2024,2024-09-10,Muir,The candidates separated by the smallest of ma...,False,,False,False,False
3,2024,2024-09-10,Muir,"Good evening, I'm David Muir. And thank you fo...",False,,False,False,False
4,2024,2024-09-10,Davis,And I'm Linsey Davis. Tonight's meeting could ...,False,,False,False,False
5,2024,2024-09-10,Muir,And that brings us to the rules of tonight's d...,False,,False,False,False
6,2024,2024-09-10,Davis,President Trump won the coin toss. He chose to...,False,,False,False,False
7,2024,2024-09-10,Muir,So let's now welcome the candidates to the sta...,False,,False,False,False
8,2024,2024-09-10,Harris,Kamala Harris. Let's have a good debate.,True,Democratic,False,False,False
9,2024,2024-09-10,Trump,Nice to see you. Have fun.,True,Republican,True,False,False


In [8]:
debate_2024_vp=extract_debate_data(
    file_path="data_dir/transcript_2024_Vance_Walz.txt",
    year=2024,
    date="2024-10-01",
    vp_debate=True,
    candidate_info={
        "Jdv": {
            "is_candidate": True,
            "party": "Republican",
            "is_winner": True,
            "is_incumbent": False
        },
        "Tw": {
            "is_candidate": True,
            "party": "Democratic",
            "is_winner": False,
            "is_incumbent": False
        }
    }
)

#Rename Jdv to Vance, Tw to Walz, No to O'Donnell, and Mb to Brennan
debate_2024_vp.loc[debate_2024_vp["actor"] == "Jdv", "actor"] = "Vance"
debate_2024_vp.loc[debate_2024_vp["actor"] == "Tw", "actor"] = "Walz"
debate_2024_vp.loc[debate_2024_vp["actor"] == "No", "actor"] = "O'Donnell"
debate_2024_vp.loc[debate_2024_vp["actor"] == "Mb", "actor"] = "Brennan"


df_debates = pd.concat([df_debates, debate_2024_vp], ignore_index=True)
debate_2024_vp.head(20)

Unnamed: 0,year,date,actor,dialogue,is_candidate,party,is_winner,VP_debate,is_incumbent
0,2024,2024-10-01,O'Donnell,Good evening. I'm Norah O'Donnell and thank yo...,False,,False,True,False
1,2024,2024-10-01,Brennan,I'm Margaret Brennan. In order to have a thoug...,False,,False,True,False
2,2024,2024-10-01,Brennan,"Thank you, Norah. Earlier today, Iran launched...",False,,False,True,False
3,2024,2024-10-01,Walz,"Well, thank you. And thank you for those joini...",True,Democratic,False,True,False
4,2024,2024-10-01,Brennan,"Governor, your time is up. Senator Vance, the ...",False,,False,True,False
5,2024,2024-10-01,Vance,"So, Margaret, I want to answer the question. F...",True,Republican,True,True,False
6,2024,2024-10-01,Brennan,"Thank you, Senator. Governor Walz, do you care...",False,,False,True,False
7,2024,2024-10-01,Walz,"Well, look, Donald Trump was in office. We'll ...",True,Democratic,False,True,False
8,2024,2024-10-01,Brennan,"Senator Vance, the U.S. did have a diplomatic ...",False,,False,True,False
9,2024,2024-10-01,Vance,"Well, first of all, Margaret, diplomacy is not...",True,Republican,True,True,False


In [9]:
df_debates.sort_values(by=["year", "date"], inplace=True)
df_debates.reset_index(drop=True, inplace=True)
df_debates

Unnamed: 0,year,date,actor,dialogue,is_candidate,party,is_winner,VP_debate,is_incumbent
0,1960,1960-09-26,Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",True,Democratic,True,False,False
1,1960,1960-09-26,Smith,And now the opening statement by Vice Presiden...,False,,False,False,False
2,1960,1960-09-26,Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",True,Republican,False,False,False
3,1960,1960-09-26,Smith,"Thank you, Mr. Nixon. That completes the openi...",False,,False,False,False
4,1960,1960-09-26,Fleming,"Senator, the Vice President in his campaign ha...",False,,False,False,False
...,...,...,...,...,...,...,...,...,...
9928,2024,2024-10-01,Walz,"Well, thank you, Senator Vance. Thank you to C...",True,Democratic,False,True,False
9929,2024,2024-10-01,Brennan,"Governor Walz. Thank you. Senator Vance, your ...",False,,False,True,False
9930,2024,2024-10-01,Vance,"Well, I want to thank Governor Walz, you folks...",True,Republican,True,True,False
9931,2024,2024-10-01,Brennan,"Senator Vance, thank you. And thank you both f...",False,,False,True,False


In [10]:
# Save the final DataFrame to a CSV file
df_debates.to_csv("debate_transcripts_cleaned.csv", index=False, encoding="utf-8")
print(f"Data saved")

Data saved
