In [3]:
# imports and settings
import pandas as pd
from os import listdir
import warnings
from time import time
from sklearn.utils import resample
from sockit import title as sockit
from math import log10
from sklearn.preprocessing import StandardScaler
from pickle import dump
from sklearn.model_selection import train_test_split

# Suppress pandas' DtypeWarning (presumably raised for mixed-type columns while
# reading the raw CSV files below — confirm) so it does not flood the cell output.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# TODO: Requires raw data to be decompressed to 'data/raw data'

# define variables and objects
# Directory scanned for raw ".csv" files, and output paths of the cleaned splits.
RAW_TRAIN_TEST_DIR = "data/raw data"
CLEANED_TRAIN_PATH = "data/processed data/cleaned_train.csv"
CLEANED_TEST_PATH = "data/processed data/cleaned_test.csv"

# DO_LOGGING gates every print made by log(); DO_SAVE gates writing the fitted
# scaler and the cleaned CSV files to disk.
DO_LOGGING = True
DO_SAVE = True

# SEED is passed as random_state to the train/test split and to resample();
# TEST_SIZE is the fraction of rows intended for the test split.
SEED = 42
TEST_SIZE = 0.2

# define dictionaries and lists
# Raw columns kept by the pipeline; every other column is dropped.
RELEVANT_COLUMNS = [
    'committee_name', 'contributor_zip', 'contributor_occupation', 'contributor_aggregate_ytd', 'contribution_receipt_amount',
]
# Maps a committee name to the party label used as the 'party' column value.
# Committee names missing from this table map to NaN.
COMMITTEE_TO_PARTY = {
    'DONALD J. TRUMP FOR PRESIDENT 2024, INC.': "REPUBLICAN", 'HARRIS FOR PRESIDENT': "DEMOCRAT", 'JILL STEIN FOR PRESIDENT 2024': "INDEPENDENT", 'TEAM KENNEDY': "INDEPENDENT", 'CLAUDIA DE LA CRUZ FOR PRESIDENT': "INDEPENDENT", 'CORNEL WEST FOR PRESIDENT': "INDEPENDENT", 'MARIANNE WILLIAMSON FOR PRESIDENT': "DEMOCRAT", 'CHASE OLIVER FOR PRESIDENT': "INDEPENDENT", 'TERRY 2024 INC': "REPUBLICAN", 'MIKE PENCE FOR PRESIDENT': "REPUBLICAN", 'PETER SONSKI FOR PRESIDENT': "DEMOCRAT", 'SHIVA 4 PRESIDENT': "INDEPENDENT", 'DOUG BELL FOR PRESIDENT': "INDEPENDENT", 'MIKE TER MAAT FOR PRESIDENT': "INDEPENDENT", 'COMMITTEE TO ELECT JOSEPH KISHORE': "INDEPENDENT", 'NIKKI HALEY FOR PRESIDENT INC.': "REPUBLICAN", 'PALMER FOR PRESIDENT': "DEMOCRAT", 'STODDEN CHOLENSKY 2024': "INDEPENDENT"
}

# Raw column name -> short feature name. Several keys (employer, city, state,
# first/last name) are not listed in RELEVANT_COLUMNS.
FEATURE_RENAME = {
    'committee_name': 'party', 'contributor_employer': 'employer', 'contributor_occupation': 'occupation', 'contributor_aggregate_ytd': 'donations_ytd', 'contributor_city': 'city', 'contributor_state': 'state', 'contributor_zip': 'zip', 'contributor_first_name': 'first_name', 'contributor_last_name': 'last_name', 'contribution_receipt_amount': 'donation'
}

# Renamed numeric columns that are scaled with StandardScaler.
standardized_columns = ["donation", "donations_ytd"]

# define functions
def log(text: str, frame, do_time = True): # prints logs if enabled
    """Print a progress message when DO_LOGGING is enabled.

    Args:
        text: Message to print.
        frame: Sized object (e.g. a DataFrame) whose length is compared with the
            module-level ``df_size`` to report a row delta, or None to skip the
            size bookkeeping.
        do_time: When True, append the elapsed-time label from ``time_check()``
            (and, if ``frame`` is given, the row delta). When False only
            ``text`` is printed.

    Side effects:
        Updates the global ``df_size`` to ``len(frame)`` whenever logging is
        enabled and ``frame`` is not None, even if ``do_time`` is False.
    """
    global df_size

    # Logging disabled: print nothing and leave df_size untouched.
    if not DO_LOGGING:
        return

    if frame is None:
        print(f"{text}  {time_check()}" if do_time else text)
        return

    # Track how many rows were gained or lost since the last reported frame.
    row_count = len(frame)
    change = row_count - df_size
    df_size = row_count
    sign = "+" if change >= 0 else ""
    delta_label = f"{sign}{change} lines"

    print(f"{text}  [{delta_label}] {time_check()}" if do_time else text)

time_checkpoint = time()
def time_check(): # returns time since last checkpoint
    """Return the time elapsed since the previous checkpoint and reset it.

    Returns:
        A label such as ``"[12.3 sec]"`` or ``"[2 min, 15.25 sec]"``. Elapsed
        time is rounded to two decimals; the minutes part is shown once at
        least 60 seconds have passed.

    Side effects:
        Sets the global ``time_checkpoint`` to the instant that was measured.
    """
    global time_checkpoint
    # Read the clock once so the stored checkpoint is exactly the measured instant.
    now = time()
    time_elapsed = round(now - time_checkpoint, 2)
    time_checkpoint = now

    out = "["
    # ">=" (not ">"): exactly 60 s must print "1 min, 0.0 sec", not "0.0 sec".
    if time_elapsed >= 60:
        out += f"{int(time_elapsed // 60)} min, "
    out += f"{round(time_elapsed % 60, 2)} sec]"
    return out

def truncate_zip(inp): # if int (zip code) has at least 5 digits, return its leading 5 digits as int
    """Reduce a numeric zip code (e.g. a 9-digit ZIP+4) to its leading five digits.

    Args:
        inp: Zip code as an int, float or numeric string.

    Returns:
        The first five digits as an int, or None when the value is missing,
        not convertible to an integer, smaller than 1, or shorter than five
        digits.

    NOTE(review): values that lost a leading zero when parsed as numbers
    (e.g. 01234 -> 1234, or an 8-digit ZIP+4) cannot be recognised here and are
    dropped or truncated as-is — confirm this is acceptable for the dataset.
    """
    try:
        inp = int(inp)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, inf, empty or non-numeric strings are treated as invalid
        # zips, like every other rejected value.
        return None
    if inp < 1:
        return None
    # Count digits on the exact integer; a log10-based count goes through
    # floating point and can be off by one for large values.
    digits = len(str(inp))
    if digits < 5:
        return None
    return inp // (10 ** (digits - 5))

def embed_job_title(title: str): # convert job title strings to SOC code
    """Look up the SOC code for a free-text job title via sockit.

    The title is passed through ``sockit.clean``, ``sockit.search`` and
    ``sockit.sort`` in that order; the ``"soc"`` entry of the first sorted
    result is returned, or None when there are no results.
    """
    cleaned = sockit.clean(title)
    candidates = sockit.search(cleaned)
    ranked = sockit.sort(candidates)
    return ranked[0]["soc"] if len(ranked) != 0 else None


In [4]:
## load data
time_check()  # reset the timing checkpoint so the first timed log covers loading only
temp_dfs = []
df_size = 0  # row count last reported through log(); basis for its size deltas


# load individual csv files
# os.listdir returns names in arbitrary order. Sort them so the merged row order,
# and therefore the seeded split and resampling later on, is reproducible.
raw_data_paths = listdir(RAW_TRAIN_TEST_DIR)
csv_data_paths = sorted(file for file in raw_data_paths if file.endswith(".csv"))

for file in csv_data_paths:
    temp_dfs.append(pd.read_csv(f"{RAW_TRAIN_TEST_DIR}/{file}", header=0))
    log(f"Loaded {file} with {len(temp_dfs[-1].index)} lines", None, do_time=False)
log(f"Loaded {len(csv_data_paths)} csv files", None)

# merge loaded dataframes df
# Each file carries its own 0..n-1 row labels; ignore_index gives the merged
# frame one unique index instead of repeated labels.
df = pd.concat(temp_dfs, ignore_index=True)
log("Merged lines into df", df)

# remove duplicates
df = df.drop_duplicates()
log("Dropped duplicates from df", df)

# remove irrelevant features
temp_count = len(df.columns)
df = df.filter(items=RELEVANT_COLUMNS)
log(f"Dropped {temp_count-len(df.columns)} columns from df", None)


# rename features & map committee to party
# Committees missing from COMMITTEE_TO_PARTY become NaN and are removed by the
# dropna() in the next step.
df = df.rename(columns=FEATURE_RENAME)
df['party'] = df['party'].map(COMMITTEE_TO_PARTY)
log("Renamed features and mapped committees to associated party", df)

# embed job titles
# Drop placeholder occupations first, then any row with a missing value.
df = df.loc[~df['occupation'].isin(['INFORMATION REQUESTED', 'RETIRED'])]
df = df.dropna()
# Titles repeat heavily, so resolve each distinct title once and map the result
# back instead of calling embed_job_title for every row.
soc_by_title = {title: embed_job_title(title) for title in df['occupation'].unique()}
df['occupation'] = df['occupation'].map(soc_by_title)
df = df.dropna()  # titles without a SOC match were mapped to None
log("Embedded job titles and removed unmatched titles", df)

# truncate zip codes to 5 digits
df['zip'] = df['zip'].apply(truncate_zip)
df = df.dropna()  # truncate_zip returns None for invalid zips
log("Truncated zip codes to 5 digits, dropped invalid zips", df)

## Splitting data
# Use the configured TEST_SIZE so the actual split always matches the log line.
df_train, df_test = train_test_split(df, test_size=TEST_SIZE, random_state=SEED)
# Take explicit copies: the split results are selections of df, and assigning
# columns on them may trigger pandas' SettingWithCopyWarning.
df_train = df_train.copy()
df_test = df_test.copy()
log(f"Split data into {100 * (1-TEST_SIZE)}% train and {100 * TEST_SIZE}% test", None)

# standardize donation values
# Fit on the training split only, then apply the same scaling to the test split
# so no test statistics leak into the scaler.
scaler = StandardScaler()
df_train[standardized_columns] = scaler.fit_transform(df_train[standardized_columns])
log("Fitted scaler to train data", None)
log(f"Standardized {standardized_columns} of train data", None)

df_test[standardized_columns] = scaler.transform(df_test[standardized_columns])
log(f"Standardized {standardized_columns} of test data", None)

# Persist the fitted scaler so the identical transform can be reused later.
if DO_SAVE:
    with open("models/StandardScaler.pkl", "wb") as file:
        dump(scaler, file)
    log("Saved scaler to models/StandardScaler.pkl", None)

# balance training data
df_rep = df_train[df_train['party'] == 'REPUBLICAN']
df_dem = df_train[df_train['party'] == 'DEMOCRAT']
df_ind = df_train[df_train['party'] == 'INDEPENDENT']

# Every class ends up with the size of the smaller of the two major parties.
# REPUBLICAN and DEMOCRAT are sampled without replacement; INDEPENDENT is
# sampled with replacement so it can reach that size even with fewer rows.
sample_size = min(len(df_rep), len(df_dem))
df_rep = resample(df_rep, replace=False, n_samples=sample_size, random_state=SEED)
df_dem = resample(df_dem, replace=False, n_samples=sample_size, random_state=SEED)
df_ind = resample(df_ind, replace=True, n_samples=sample_size, random_state=SEED)

df_train = pd.concat([df_dem, df_rep, df_ind])
# Report against the balanced training frame (not the unsplit df) so the logged
# row delta reflects this step.
log(f"Balanced train data to {sample_size} lines per class", df_train)

# save processed data
# Write the train split first, then the test split, each without the index
# column, logging after every file.
if DO_SAVE:
    for split_name, split_frame, out_path in (
        ("train", df_train, CLEANED_TRAIN_PATH),
        ("test", df_test, CLEANED_TEST_PATH),
    ):
        split_frame.to_csv(out_path, index=False)
        log(f"Saved cleaned {split_name} data to {out_path}", None)

Loaded 07.21-07.25.csv with 495762 lines
Loaded 07.25-08.01.csv with 495839 lines
Loaded 08.01-08.08.csv with 487538 lines
Loaded 08.09-08.17.csv with 383444 lines
Loaded 08.18-08.24.csv with 463564 lines
Loaded 08.25-08.29.csv with 463333 lines
Loaded 08.30-09.08.csv with 432302 lines
Loaded 09.09-09.15.csv with 471374 lines
Loaded 09.16-09.21.csv with 129794 lines
Loaded 09.27-10.01.csv with 434265 lines
Loaded 10.02-10.08.csv with 414111 lines
Loaded 10.09-10.15.csv with 299283 lines
Loaded 12 csv files  [1 min, 30.67 sec]
Merged lines into df  [+4970609 lines] [1 min, 0.6 sec]
Dropped duplicates from df  [-75742 lines] [4 min, 16.27 sec]
Dropped 74 columns from df  [8.44 sec]
Renamed features and mapped committees to associated party  [+0 lines] [1.53 sec]
Embedded job titles and removed unmatched titles  [-3289221 lines] [2 min, 11.21 sec]
Truncated zip codes to 5 digits, dropped invalid zips  [-16040 lines] [1.66 sec]
1589606
party
DEMOCRAT       1428292
REPUBLICAN      143283
IN