In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os

In [None]:
# Notebook-wide display settings: show every column when a frame is rendered,
# and draw seaborn/matplotlib figures on a white grid background.
pd.options.display.max_columns = None
sns.set_style(style="whitegrid")

In [None]:
# LOAD ALL EXCEL FILES (2011–2022)
# NOTE(review): this absolute path is machine-specific; point it at your local
# copy of the "IRS folder" before running.
folder_path = r"C:\Users\ifesolom\Documents\GitHub\spring-2026-rat-activity-nyc\IRS folder"

# glob returns matches in arbitrary, OS-dependent order. Sorting makes `files`,
# `df_list`, and the row order of anything built from them reproducible.
files = sorted(glob.glob(os.path.join(folder_path, "*.xlsx")))
if not files:
    # Nothing to load: say so here, where the cause (a wrong folder) is obvious.
    print(f"WARNING: no .xlsx files found in {folder_path!r}; check folder_path.")

# One DataFrame per workbook, in the same order as `files`.
df_list = []

# `file` and `df` stay bound after the loop (to the last workbook); the loop
# variable names are kept because other cells in this notebook reference them.
for file in files:
    df = pd.read_excel(file)
    df_list.append(df)



In [2]:
 # Extract zip code, size of adjusted gross income, adjusted gross income from filename 
# Tag each loaded frame with the metadata encoded in its own filename.
# `files` and `df_list` are parallel lists (same order, one frame per path), so
# they are walked together; os.path.basename() accepts a single path, not a list.
# NOTE(review): assumes names look like "<zip>_<size of AGI>_<AGI>.xlsx" — confirm
# against the actual files. Names with fewer than three "_" parts are left untagged.
for path, frame in zip(files, df_list):
    filename = os.path.basename(path)
    parts = filename.split("_")
    if len(parts) >= 3:
        zip_code = parts[0]
        size_of_ag_income = parts[1]
        ag_income = parts[2].split(".")[0]  # Remove file extension
        # Scalar assignment broadcasts the value to every row of this frame.
        frame["Zip Code"] = zip_code
        frame["Size of AGI"] = size_of_ag_income
        frame["AGI"] = ag_income


NameError: name 'os' is not defined

In [None]:
# Stack the per-file frames into a single table; ignore_index=True discards the
# per-file row labels and numbers the combined rows 0..n-1.
df = pd.concat(objs=df_list, ignore_index=True)

combined_shape = df.shape
print("Combined Dataset Shape:", combined_shape)

In [None]:
# DATA CLEANING
# Normalise column labels: trim surrounding whitespace, lower-case, then turn
# spaces and hyphens into underscores.
normalized_cols = df.columns.str.strip().str.lower()
for separator in (" ", "-"):
    normalized_cols = normalized_cols.str.replace(separator, "_")
df.columns = normalized_cols

# Drop rows that are exact duplicates of an earlier row
df = df.drop_duplicates()

# Report how many missing values each column has
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

In [None]:
 #Adjust these mappings if needed
# Substrings that identify each measure; a column is assigned to a measure when
# its name contains at least one of that measure's substrings.
keyword_lookup = {
    "returns": ("return",),
    "exemptions": ("exemption",),
    "agi": ("adjusted_gross_income", "agi"),
    "wages": ("wage",),
    "dividends": ("dividend",),
    "interest": ("interest",),
}

column_map = {
    measure: [col for col in df.columns if any(keyword in col for keyword in keywords)]
    for measure, keywords in keyword_lookup.items()
}

print("\nDetected columns:")
for measure, matched_cols in column_map.items():
    print(measure, ":", matched_cols)

In [None]:
# Show every column label as a plain Python list (not truncated like an Index repr).
column_names = df.columns.tolist()
print(column_names)


In [None]:
# Read the first workbook with no header row at all, so the raw sheet layout
# (title rows, header rows, first data rows) can be inspected.
temp_df = pd.read_excel(io=files[0], header=None)
raw_preview = temp_df.head(n=15)
print(raw_preview)


In [None]:
# Re-read EVERY workbook in `files` using rows 3 and 4 (0-based) as a two-level
# header, then stack them. Reading only the leftover loop variable `file` would
# silently replace the combined dataset with the last workbook alone.
df = pd.concat(
    [pd.read_excel(path, header=[3, 4]) for path in files],
    ignore_index=True,
)

# header=[3, 4] produces MultiIndex columns (one tuple per column). Collapse each
# tuple into a single snake_case string so plain lookups such as df["year"] or
# `"zipcode" in df.columns` work in the cells that follow.
flat_names = []
for levels in df.columns:
    # pandas labels blank header cells "Unnamed: ..."; drop those parts.
    kept = [str(part).strip() for part in levels if not str(part).startswith("Unnamed:")]
    if not kept:
        # Both header cells were blank: keep pandas' placeholder so the name is not empty.
        kept = [str(part).strip() for part in levels]
    flat_names.append("_".join(kept).lower().replace(" ", "_").replace("-", "_"))
df.columns = flat_names


In [None]:
# Keep only rows in which at least 60% of the columns are populated
min_non_null = int(0.6 * len(df.columns))
df = df.dropna(thresh=min_non_null)

# Fill numeric missing values with 0
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# Ensure ZIP codes stay 5-digit text.
# Both spellings are checked: "zipcode", and "zip_code" (what a "Zip Code"
# column becomes after the snake_case column normalisation).
for zip_col in ("zipcode", "zip_code"):
    if zip_col in df.columns:
        df[zip_col] = (
            df[zip_col]
            .astype(str)
            .str.strip()
            # A float-typed column stringifies as "10001.0"; drop the ".0" so
            # zfill pads the real digits instead of leaving the value as is.
            .str.replace(r"\.0$", "", regex=True)
            .str.zfill(5)
        )

# Convert the year column to integer. Guarded like the ZIP step so a dataset
# without a "year" column is reported instead of raising KeyError.
if "year" in df.columns:
    df["year"] = pd.to_numeric(df["year"], errors="coerce")
    df = df.dropna(subset=["year"])   # Remove rows whose year is not numeric
    df["year"] = df["year"].astype(int)
else:
    print("\nNo 'year' column found; skipping year conversion.")


print("\nCleaned Dataset Shape:", df.shape)

In [None]:
# 4. EXPLORATORY DATA ANALYSIS
# ============================================

# Descriptive statistics for the columns describe() summarises by default
print("\nSummary Statistics:")
print(df.describe())

print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# 5. VISUALIZATIONS
# ============================================

# Line chart of AGI summed per year; only drawn when the AGI column is present.
agi_col = "adjusted_gross_income"
if agi_col in df.columns:
    yearly_agi = df.groupby("year")[agi_col].sum()

    fig, ax = plt.subplots(figsize=(8, 5))
    yearly_agi.plot(marker="o", ax=ax)
    ax.set_title("Total Adjusted Gross Income by Year")
    ax.set_xlabel("Year")
    ax.set_ylabel("Total AGI")
    plt.show()

# Heatmap of pairwise correlations between all numeric columns,
# with the colour scale centred on zero.
corr = df.select_dtypes(include=np.number).corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()