In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os

In [8]:
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

In [9]:
#LOAD & COMBINE ALL EXCEL FILES (2011â€“2022)
folder_path = r"../../IRS folder"

files = glob.glob(os.path.join(folder_path, "*.xlsx"))
df_list = []

for file in files:
    df = pd.read_excel(file)
    df_list.append(df)



In [10]:
 # Extract zip code, size of adjusted gross income, adjusted gross income from filename 
filename = os.path.basename("../../IRS folder")
parts = filename.split("_")
if len(parts) >= 3:
        zip_code = parts[0]
        size_of_ag_income = parts[1]
        ag_income = parts[2].split(".")[0]  # Remove file extension
        df["Zip Code"] = zip_code
        df["Size of AGI"] = size_of_ag_income
        df["AGI"] = ag_income


In [11]:
# Combine all files
df = pd.concat(df_list, ignore_index=True)

print("Combined Dataset Shape:", df.shape)

Combined Dataset Shape: (73922, 165)


In [12]:
#DATA CLEANING
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)

# Remove duplicates
df = df.drop_duplicates()

# Missing values summary
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
new_york           11
unnamed:_1       9307
unnamed:_2         89
unnamed:_3         89
unnamed:_4         89
                ...  
unnamed:_160    31773
unnamed:_161    31770
unnamed:_162    31773
unnamed:_163    52921
unnamed:_164    52922
Length: 165, dtype: int64


In [13]:
 #Adjust these mappings if needed
column_map = {
    "returns": [col for col in df.columns if "return" in col],
    "exemptions": [col for col in df.columns if "exemption" in col],
    "agi": [col for col in df.columns if "adjusted_gross_income" in col or "agi" in col],
    "wages": [col for col in df.columns if "wage" in col],
    "dividends": [col for col in df.columns if "dividend" in col],
    "interest": [col for col in df.columns if "interest" in col]
    }

print("\nDetected columns:")
for k, v in column_map.items():
    print(k, ":", v)


Detected columns:
returns : []
exemptions : []
agi : []
wages : []
dividends : []
interest : []


In [14]:
print(df.columns.tolist())


['new_york', 'unnamed:_1', 'unnamed:_2', 'unnamed:_3', 'unnamed:_4', 'unnamed:_5', 'unnamed:_6', 'unnamed:_7', 'unnamed:_8', 'unnamed:_9', 'unnamed:_10', 'unnamed:_11', 'unnamed:_12', 'unnamed:_13', 'unnamed:_14', 'unnamed:_15', 'unnamed:_16', 'unnamed:_17', 'unnamed:_18', 'unnamed:_19', 'unnamed:_20', 'unnamed:_21', 'unnamed:_22', 'unnamed:_23', 'unnamed:_24', 'unnamed:_25', 'unnamed:_26', 'unnamed:_27', 'unnamed:_28', 'unnamed:_29', 'unnamed:_30', 'unnamed:_31', 'unnamed:_32', 'unnamed:_33', 'unnamed:_34', 'unnamed:_35', 'unnamed:_36', 'unnamed:_37', 'unnamed:_38', 'unnamed:_39', 'unnamed:_40', 'unnamed:_41', 'unnamed:_42', 'unnamed:_43', 'unnamed:_44', 'unnamed:_45', 'unnamed:_46', 'unnamed:_47', 'unnamed:_48', 'unnamed:_49', 'unnamed:_50', 'unnamed:_51', 'unnamed:_52', 'unnamed:_53', 'unnamed:_54', 'unnamed:_55', 'unnamed:_56', 'unnamed:_57', 'unnamed:_58', 'unnamed:_59', 'unnamed:_60', 'unnamed:_61', 'unnamed:_62', 'unnamed:_63', 'unnamed:_64', 'unnamed:_65', 'unnamed:_66', 'unnam

In [15]:
temp_df = pd.read_excel(files[0], header=None)
print(temp_df.head(15))


                                                  0    \
0                                            NEW YORK   
1   Individual Income Tax Returns: \nSelected Inco...   
2         [Money amounts are in thousands of dollars]   
3                                       ZIP\ncode [1]   
4                                                 NaN   
5                                                 NaN   
6                                                   0   
7                                                   0   
8                                                   0   
9                                                   0   
10                                                  0   
11                                                  0   
12                                                  0   
13                                                NaN   
14                                              10001   

                              1                      2    \
0                          

In [16]:
df = pd.read_excel(file, header=[3,4])


In [17]:
# Drop rows with too many missing values
df = df.dropna(thresh=int(0.6 * len(df.columns)))

# Fill numeric missing values with 0
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# Ensure ZIP codes stay 5-digit
if "zipcode" in df.columns:
    df["zipcode"] = df["zipcode"].astype(str).str.zfill(5)

# Convert Year column to integer
df["year"] = pd.to_numeric(df["year"], errors="coerce")
df = df.dropna(subset=["year"])   # Remove rows where year couldn't be extracted
df["year"] = df["year"].astype(int)


print("\nCleaned Dataset Shape:", df.shape)

KeyError: 'year'

In [None]:
4. #EXPLORATORY DATA ANALYSIS
# ============================================

print("\nSummary Statistics:")
print(df.describe())

print("\nData Info:")
print(df.info())

In [None]:
# 5. VISUALIZATIONS
# ============================================

# Example: Total AGI trend over time
if "adjusted_gross_income" in df.columns:
    yearly_agi = df.groupby("year")["adjusted_gross_income"].sum()
    
    plt.figure(figsize=(8,5))
    yearly_agi.plot(marker="o")
    plt.title("Total Adjusted Gross Income by Year")
    plt.xlabel("Year")
    plt.ylabel("Total AGI")
    plt.show()

# Correlation heatmap
plt.figure(figsize=(10,8))
corr = df.select_dtypes(include=np.number).corr()
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Correlation Matrix")
plt.show()