### Load packages

In [311]:
import pandas as pd
from pandas import json_normalize
import os
from datetime import datetime
import json
import glob
import re

# for figures
import matplotlib.pyplot as plt
import numpy as np
import textwrap
from pandas.plotting import scatter_matrix

### Set project parameters for this run

In [354]:
# Print a summary of output_df: index range, column names, non-null counts, dtypes and memory usage
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3413 entries, 0 to 3412
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Object ID                     3413 non-null   int64 
 1   Master Org EIN                3413 non-null   Int64 
 2   Master Org Name               3413 non-null   object
 3   Year of Tax Period End Date   3413 non-null   int32 
 4   BsnssNmLn1Txt                 3406 non-null   object
 5   BsnssNmLn2Txt                 32 non-null     object
 6   Transaction Description       3380 non-null   object
 7   Transaction Amount            3178 non-null   Int64 
 8   Method of Determining Amount  3010 non-null   object
dtypes: Int64(2), int32(1), int64(1), object(5)
memory usage: 233.4+ KB


In [355]:
# Plot histograms of the log-transformed numerical columns in the data.
# Runs only when use_plots is truthy; defines log_df as a side effect.

if use_plots:

    # Identifier / date-like columns are numeric but are not measurements,
    # so they are excluded from the plots. "Master Org EIN" is listed next to
    # "EIN" because the EIN column can appear under either name.
    cols_to_ignore = [
        "Object ID",
        "EIN",
        "Master Org EIN",
        "Year of Tax Period End Date",
        "IRS Tax Year",
        "Business Officer Signature Date",
    ]

    plot_df = output_df.drop(
        columns=[c for c in cols_to_ignore if c in output_df.columns]
    )

    # Select the numeric columns and cast them to plain float64 so that
    # nullable integer columns (pandas "Int64") turn missing values into NaN
    # and NumPy / matplotlib receive ordinary float arrays.
    numeric_df = plot_df.select_dtypes(include="number").astype("float64")

    # Make a new dataframe using the logs instead of the raw values
    # log10(x + 1) avoids -inf for zeros
    # Keep only values where x + 1 > 0; everything else becomes NaN
    safe_numeric_df = numeric_df.where(numeric_df > -1)
    log_df = np.log10(safe_numeric_df + 1)
    # and then drop columns that are all nulls or NaN values
    log_df = log_df.dropna(axis=1, how="all")

    if log_df.empty:
        # DataFrame.hist raises when there is nothing numeric to plot
        print("No numeric columns with plottable values; skipping histograms.")
    else:
        axes = log_df.hist(bins=50, figsize=(12, 15))

        for ax in axes.flatten():
            # Wrap long column names so subplot titles do not overlap
            title = ax.get_title()
            wrapped_title = "\n".join(textwrap.wrap(title, width=20))
            ax.set_title(wrapped_title, fontsize=8)
            ax.tick_params(axis="y", labelsize=7)
            ax.tick_params(axis="x", labelsize=7)

        plt.subplots_adjust(hspace=0.6, wspace=0.3)

        plt.show()


In [356]:
# Compute correlation coefficients for every pair of log-transformed
# numerical variables (log_df is built by the histogram cell).

if use_plots:

    corr_matrix = log_df.corr()

    # Column whose correlations are ranked when it is available.
    corr_target = "Total assets EOY"

    if corr_target in corr_matrix.columns:
        print(corr_matrix[corr_target].sort_values(ascending=False))
    else:
        # Not every extract contains the target column; indexing it
        # unconditionally would raise KeyError, so show the whole matrix.
        print(f"Column '{corr_target}' not found; showing the full correlation matrix.")
        print(corr_matrix)


In [357]:
# Draw a scatter-plot matrix for pairs of log-transformed numerical variables

if use_plots:

    # By default only the first four numeric columns of log_df are plotted;
    # assign an explicit list of column names to `columns` to choose others.
    columns = log_df.select_dtypes(include="number").columns[:4]

    axes = scatter_matrix(log_df[columns], figsize=(12,8), alpha=0.1)

    for ax in axes.ravel():

        # Shrink tick labels on both axes
        for axis_name in ("x", "y"):
            ax.tick_params(axis=axis_name, labelsize=7)

        # Wrap long column names at 20 characters so axis labels stay readable
        ax.set_xlabel(textwrap.fill(ax.get_xlabel(), width=20), fontsize=8)
        ax.set_ylabel(textwrap.fill(ax.get_ylabel(), width=20), fontsize=8)

    plt.subplots_adjust(hspace=0.6, wspace=0.3)

    plt.show()

In [358]:
# Write output_df to CSV: one file per year when separate_output_by_year is
# truthy, otherwise a single file labelled with the year(s) in year_range.

# Force Object ID to string before output
if "Object ID" in output_df.columns:
    output_df["Object ID"] = (
        output_df["Object ID"]
        .astype(str)
        .str.strip()
        .str.replace(r"\.0$", "", regex=True)  # Remove any float artifacts
    )

# Optional user-supplied term inserted into every output file name
if custom_output_file_term is None:
    custom_term = ""
else:
    custom_term = custom_output_file_term + "_"

if separate_output_by_year:
    for year in years:
        print(year)
        df_year = output_df[output_df["Year of Tax Period End Date"] == int(year)]
        if df_year.empty:
            # No rows for this year: do not write an empty file
            continue
        output_filename = f"output_{custom_term}{year}_{timestamp}.csv"
        output_path = os.path.join(output_directory, output_filename)
        print(output_path)
        df_year.to_csv(output_path, index=False)
else:
    # Build a file-name-friendly label for the year(s). A single year gives
    # just that year; several years are joined with "-" instead of embedding
    # the container's repr (e.g. "[2022, 2023]") in the file name. A plain
    # string is used as-is.
    if isinstance(year_range, str):
        year_label = year_range
    else:
        year_label = "-".join(str(y) for y in year_range)

    output_filename = f"output_{custom_term}{year_label}_{timestamp}.csv"
    output_path = os.path.join(output_directory, output_filename)
    print(output_path)
    output_df.to_csv(output_path, index=False)


2023
..\..\classes\CS5540_2026S\output\output_c4s_skedr_partv_2023_20260116_143105.csv
