# Data Cleaning For Revenue Budget
## Mount and load file

In [21]:
import pandas as pd
import matplotlib.pyplot as plt

In [22]:
dset = pd.read_csv("data/fy24-adopted-revenue-budget.csv")

## Explore dataset info

In [23]:
print(dset.head(10))


    Revenue Category                     Account  Cabinet  \
0  Property Tax Levy           Real Estate Taxes  Finance   
1  Property Tax Levy       Personal Property Tax  Finance   
2  Property Tax Levy        Property Tax Overlay  Finance   
3            Excises    MV Excise - Current Year  Finance   
4            Excises      MV Excise - Prior Year  Finance   
5            Excises   MV Excise - 2 Years Prior  Finance   
6            Excises   MV Excise - 3 Years Prior  Finance   
7            Excises  MV Excise - 4+ Years Prior  Finance   
8            Excises                 Boat Excise  Finance   
9            Excises            Meals Excise Tax  Finance   

             Department    FY21 Actual    FY22 Actual FY23 Appropriation  \
0  Assessing Department  2,490,082,613  2,630,469,593      2,784,133,324   
1  Assessing Department    189,939,113    196,499,737        209,010,762   
2  Assessing Department     -3,735,387    -33,174,590        -29,845,007   
3  Assessing Department 

In [24]:
dset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Revenue Category    199 non-null    object 
 1   Account             199 non-null    object 
 2   Cabinet             199 non-null    object 
 3   Department          199 non-null    object 
 4   FY21 Actual         199 non-null    object 
 5   FY22 Actual         199 non-null    object 
 6   FY23 Appropriation  199 non-null    object 
 7   FY24 Adopted        199 non-null    object 
 8   Unnamed: 8          0 non-null      float64
 9   Unnamed: 9          1 non-null      object 
dtypes: float64(1), object(9)
memory usage: 15.7+ KB


## Drop columns with NaN values

In [25]:
dset.isnull().sum()

Revenue Category        0
Account                 0
Cabinet                 0
Department              0
FY21 Actual             0
FY22 Actual             0
FY23 Appropriation      0
FY24 Adopted            0
Unnamed: 8            199
Unnamed: 9            198
dtype: int64

In [26]:
dset = dset.dropna(axis=1)
dset.head

<bound method NDFrame.head of           Revenue Category                        Account  \
0        Property Tax Levy              Real Estate Taxes   
1        Property Tax Levy          Personal Property Tax   
2        Property Tax Levy           Property Tax Overlay   
3                  Excises       MV Excise - Current Year   
4                  Excises         MV Excise - Prior Year   
..                     ...                            ...   
194        Available Funds    Approp. Cemetery Trust Fund   
195        Available Funds         Approp. Parking Meters   
196  Non-Recurring Revenue           American Rescue Plan   
197  Non-Recurring Revenue           Approp. Fund Balance   
198  Non-Recurring Revenue  Approp. Surplus Property Fund   

                              Cabinet                     Department  \
0                             Finance           Assessing Department   
1                             Finance           Assessing Department   
2                    

In [27]:
null_res = pd.isnull(dset)
print(f"Result of checking if null value exists: {True if True in null_res else False}")

Result of checking if null value exists: False


## Remove duplicates, if any

In [28]:
dset_non_dup = dset.drop_duplicates()
print(dset_non_dup.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 198
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Revenue Category    199 non-null    object
 1   Account             199 non-null    object
 2   Cabinet             199 non-null    object
 3   Department          199 non-null    object
 4   FY21 Actual         199 non-null    object
 5   FY22 Actual         199 non-null    object
 6   FY23 Appropriation  199 non-null    object
 7   FY24 Adopted        199 non-null    object
dtypes: object(8)
memory usage: 14.0+ KB
None


## Convert Money Data into numerical data

In [30]:
money_cols = dset.iloc[:,-4:].columns
dset[money_cols] = dset[money_cols].replace(regex=',', value='')
dset[money_cols] = dset[money_cols].apply(pd.to_numeric)
dset.head()

Unnamed: 0,Revenue Category,Account,Cabinet,Department,FY21 Actual,FY22 Actual,FY23 Appropriation,FY24 Adopted
0,Property Tax Levy,Real Estate Taxes,Finance,Assessing Department,2490082613,2630469593,2784133324,2913736657
1,Property Tax Levy,Personal Property Tax,Finance,Assessing Department,189939113,196499737,209010762,214236031
2,Property Tax Levy,Property Tax Overlay,Finance,Assessing Department,-3735387,-33174590,-29845007,-30000000
3,Excises,MV Excise - Current Year,Finance,Assessing Department,36012943,36227381,34000000,34000000
4,Excises,MV Excise - Prior Year,Finance,Assessing Department,15202795,21692077,17500000,18000000


In [31]:
dset['FY21 Actual'][6]

557497