In [17]:
import csv
import re

def clean_data(input_file, output_file):
    cleaned_data = []

    with open(input_file, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        data = list(reader)

        # clean the fragments
        for row in data:
            fragment = row['Text']

            fragment = fragment.strip()
            fragment = re.sub(r'\s+', ' ', fragment)

            fragment = re.sub(r'[A-Za-z\s]+/[A-Za-z\s]+(?: via Getty Images| hide caption)', '', fragment)

            fragment = re.sub(r'\b(?:hide caption|AP writes|As we write this)\b', '', fragment, flags=re.IGNORECASE)

            row['Text'] = fragment

        # remove duplicates
        for i, row in enumerate(data):
            fragment = row['Text']
            is_duplicate = False

            for j, other_row in enumerate(data):
                if i != j and fragment in other_row['Text']:
                    is_duplicate = True
                    break

            if not is_duplicate and fragment.strip():
                cleaned_data.append(row)

    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        fieldnames = ['Text', 'Leaning', 'Year']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(cleaned_data)

    print(f"Data cleaned and saved to {output_file}")

    return cleaned_data

input_csv = 'all_years_merged.csv'
output_csv = 'cleaned_data.csv'
cleaned_data = clean_data(input_csv, output_csv)


Data cleaned and saved to cleaned_data.csv


In [18]:
len(cleaned_data)

692

In [20]:
# clean boilerplate
def remove_boilerplate(data):
    boilerplate_patterns = [
        r"Source:.*",
        r"Editor's Note:.*",
        r"NPR transcripts are created on a rush deadline.*",
        r"This text may not be in its final form and may be updated.*",
        r"All rights reserved\.",
        r"[A-Za-z\s]+/[A-Za-z\s]+(?: via Getty Images| hide caption)",
    ]

    boilerplate_regex = [re.compile(pattern, re.IGNORECASE) for pattern in boilerplate_patterns]

    for row in data:
        fragment = row['Text']
        for regex in boilerplate_regex:
            fragment = regex.sub('', fragment)
        row['Text'] = fragment.strip()

    return data

cleaned_data = remove_boilerplate(cleaned_data)

with open(output_csv, mode='w', encoding='utf-8', newline='') as file:
    fieldnames = ['Text', 'Leaning', 'Year']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(cleaned_data)

print(f"Final data cleaned and saved to {output_csv}")


Final data cleaned and saved to cleaned_data.csv


In [21]:
len(cleaned_data)

692

In [None]:
# equalize the number of left, center, right samples
# left: 9765
#