In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import networkx as nx
import csv
import numpy as np
import sklearn
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive (Colab only); the Excel workbooks read
# below and the CSV written at the end of the notebook live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Cleaning and Processing

In [None]:
# Survey years to load, newest first: 2021 down to 2010 inclusive.
years = list(range(2021, 2009, -1))
years

[2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010]

In [None]:
from re import I
def process_excel_file(year, excel_path, list_index):
    """Extract one census tract's estimate rows from a yearly Excel workbook.

    The second sheet is read without a header. Its first three rows are joined
    with '_' into a single label per column, the sheet is transposed so every
    original column becomes a row, and every second row is kept (presumably
    this drops margin-of-error columns that alternate with the estimates --
    TODO confirm against the workbook layout).

    Args:
        year: Value written into the 'year' column of every returned row.
        excel_path: Path of the Excel workbook to read.
        list_index: Zero-based position of the tract's block of rows; the rows
            ``list_index * 4`` up to (not including) ``(list_index + 1) * 4``
            of the kept rows are returned.

    Returns:
        A DataFrame with an 'estimate_type' column, a 'year' column and one
        column per label found in the sheet's first column. 'Total' is
        converted to float (unparseable values become NaN). The frame is empty
        when ``list_index`` points past the last block of rows.
    """
    rows_per_tract = 4   # size of one tract's block of kept rows
    label_tail_len = 23  # characters kept from the end of the block's first label
    prefix_len = 4       # characters stripped from the start of every label

    sheet = pd.read_excel(excel_path, sheet_name=1, header=None)

    # Collapse the three header rows into one '_'-joined label per column.
    sheet.columns = sheet.iloc[0:3].apply(lambda col: '_'.join(col.astype(str)), axis=0)
    body = sheet[3:]

    # First column becomes the row labels; transposing turns them into columns.
    by_label = body.set_index(body.columns[0]).transpose()

    # Keep rows at even positions (0, 2, 4, ...).
    by_label = by_label.iloc[::2]

    start_row = list_index * rows_per_tract
    new_df = by_label.iloc[start_row:start_row + rows_per_tract].copy()
    new_df.insert(0, 'year', year)
    new_df = new_df.reset_index()

    # Guarded so that an out-of-range list_index returns an empty frame instead
    # of raising IndexError on the .iat lookup.
    if not new_df.empty:
        first_label = new_df.iat[0, 0]
        # The first label of a block is longer than the others; trim it to the
        # same length so the prefix strip below treats all rows alike.
        if isinstance(first_label, str) and len(first_label) > label_tail_len:
            new_df.iat[0, 0] = first_label[-label_tail_len:]

    new_df = new_df.rename(columns={'index': 'estimate_type'})
    new_df['estimate_type'] = new_df['estimate_type'].str[prefix_len:]

    # astype(str) first: cells that Excel stored as numbers would otherwise be
    # turned into NaN by the .str accessor instead of being kept.
    total_text = new_df['Total'].astype(str).str.replace(',', '', regex=False)
    new_df['Total'] = pd.to_numeric(total_text, errors='coerce').astype(float)

    return new_df


In [None]:
# Census tract identifiers to process. A tract's position in this list is used
# as its block index when rows are sliced out of each workbook.
list_of_indices = [
    '1101.05',
    '1201.04',
    '1202.01',
]

# Alternative tract list kept for reference (not used):
# ['70801', '70901', '801', '803', '80401', '805', '80601',
#  '81302', '814', '815', '817', '818', '819', '820', '821', '904', '906', '9803']


In [None]:
# Sanity check: number of census tracts that will be processed.
len(list_of_indices)

3

In [None]:
# Build one DataFrame per (census tract, year) with process_excel_file() and
# publish each one as a global named ct<tract>_<year>. These names contain a
# '.', so they are only reachable through globals()['ct1101.05_2021'] etc.
#
# enumerate() supplies the tract's position directly: the previous
# list_of_indices.index(index) lookup was repeated for every year and returns
# the first match, i.e. the wrong block if an identifier is ever listed twice.
for list_index, index in enumerate(list_of_indices):
    for year in years:
        excel_file_path = f'/content/drive/MyDrive/DS project/Jamaica_Plain_income/ACSST5Y{year}.xlsx'
        variable_name = f'ct{index}_{year}'

        # Call the function and store the result under the generated name.
        globals()[variable_name] = process_excel_file(year, excel_file_path, list_index)


In [None]:
# Stack each tract's yearly frames, in the order given by `years`, into one
# global DataFrame named ct<tract>.
for index in list_of_indices:
    # Start from the first entry of `years` rather than a hard-coded 2021 so
    # this cell stays in sync with the year list; that frame is always kept.
    concatenated_dfs = [globals()[f'ct{index}_{years[0]}']]

    # Remaining years are appended only when they actually contain rows.
    for year in years[1:]:
        yearly_df = globals()[f'ct{index}_{year}']
        if not yearly_df.empty:
            concatenated_dfs.append(yearly_df)

    # Concatenate all DataFrames for the current tract.
    globals()[f'ct{index}'] = pd.concat(concatenated_dfs, ignore_index=True)

In [None]:
def convert_percentage_to_float_with_symbol(percentage_str):
    """Turn a percentage string such as '12.5%' into a fraction (0.125).

    Every '%' and '-' character is removed before parsing. A string that still
    cannot be parsed as a number yields None. Non-string inputs are returned
    unchanged.
    """
    if not isinstance(percentage_str, str):
        return percentage_str

    # Drop both symbols in one pass.
    digits = percentage_str.translate(str.maketrans('', '', '%-'))
    try:
        number = float(digits)
    except ValueError:
        # Nothing numeric left (e.g. the input was just '-').
        return None
    return number / 100.0


def convert_column_to_float(df):
    """Convert the percentage columns of ``df`` into absolute values, in place.

    The columns at positions 3 through 12 are parsed with
    convert_percentage_to_float_with_symbol and then multiplied by the 'Total'
    column. The same DataFrame object is modified and returned.
    """
    for share_col in df.columns[3:13]:
        # Step 1: percentage text -> fraction.
        df[share_col] = df[share_col].apply(convert_percentage_to_float_with_symbol)
        # Step 2: fraction -> value scaled by the row's 'Total'.
        df[share_col] = df[share_col].mul(df['Total'])
    return df

In [None]:
# Process each concatenated tract frame: strip '%' and '-' from the percentage
# columns and multiply them by 'Total'.
# Gotcha: values that are already numeric pass through the parser unchanged,
# so running this cell a second time multiplies by 'Total' again.
for index in list_of_indices:
    globals()[f'ct{index}'] = convert_column_to_float(globals()[f'ct{index}'])


In [None]:
# Keep only the 'Households_Estimate' rows of every tract frame.
# .copy() makes each result an independent DataFrame: later cells assign new
# columns to these frames, and doing that on a boolean-filtered slice triggers
# pandas' SettingWithCopyWarning.
for index in list_of_indices:
    dataframe_name = f'ct{index}'
    tract_df = globals()[dataframe_name]
    globals()[dataframe_name] = tract_df[tract_df['estimate_type'] == 'Households_Estimate'].copy()

In [None]:
# Calculating income difference: clean the median/mean income columns of every
# tract frame and add each row's relative change versus the following row.

def convert_to_float(value):
    """Return float(value), or None when the value cannot be parsed."""
    try:
        return float(value)
    except (ValueError, TypeError):
        return None


# Source income column -> name of the derived relative-change column.
income_change_columns = {
    'Median income (dollars)': 'median_income_difference',
    'Mean income (dollars)': 'mean_income_difference',
}

for index in list_of_indices:
    tract_df = globals()[f'ct{index}']

    for income_col, change_col in income_change_columns.items():
        # Remove thousands separators and dashes. astype(str) keeps this step
        # working when the column is already numeric (e.g. the cell is re-run);
        # the .str accessor alone raises on a float column.
        cleaned = (
            tract_df[income_col]
            .astype(str)
            .str.replace(',', '', regex=False)
            .str.replace('-', '', regex=False)
        )

        # Convert the cleaned text to float; unparseable values become missing.
        tract_df[income_col] = cleaned.apply(convert_to_float)

        # shift(-1) pairs each row with the row below it. The frames are built
        # from `years`, which runs newest to oldest, so that is the prior year.
        following = tract_df[income_col].shift(-1)
        tract_df[change_col] = (tract_df[income_col] - following) / following



In [None]:
# Tag every row of each tract frame with the tract identifier it belongs to.
for index in list_of_indices:
    globals()[f'ct{index}']['census_tract'] = index

In [None]:

# Collect the per-tract frames in the order of list_of_indices.
concatenated_df = [globals()[f'ct{index}'] for index in list_of_indices]

# Stack them with a single concat call. Growing an initially empty DataFrame
# inside a loop re-copies all accumulated rows on every pass and concatenates
# with an empty frame, which recent pandas versions warn about.
# An empty tract list still yields an empty DataFrame, as before.
Jamaica_Plain_income = (
    pd.concat(concatenated_df, ignore_index=True) if concatenated_df else pd.DataFrame()
)

In [None]:
# Inspect the column labels of the combined frame.
Jamaica_Plain_income.columns

Index(['estimate_type', 'year', 'Total', 'Less than $10,000',
       '$10,000 to $14,999', '$15,000 to $24,999', '$25,000 to $34,999',
       '$35,000 to $49,999', '$50,000 to $74,999', '$75,000 to $99,999',
       '$100,000 to $149,999', '$150,000 to $199,999', '$200,000 or more',
       'Median income (dollars)', 'Mean income (dollars)', 'PERCENT ALLOCATED',
       'Household income in the past 12 months',
       'Family income in the past 12 months',
       'Nonfamily income in the past 12 months', 'PERCENT IMPUTED',
       'median_income_difference', 'mean_income_difference', 'census_tract'],
      dtype='object', name='nan_nan_Label')

In [None]:
# Keep only the columns needed for the export and renumber the rows from 0.
income_JamaicaPlain_result_df = (
    Jamaica_Plain_income
    .loc[:, ['year', 'Median income (dollars)', 'Mean income (dollars)',
             'median_income_difference', 'mean_income_difference', 'census_tract']]
    .reset_index(drop=True)
)


In [None]:
# Store the tract identifier as float64 (e.g. '1101.05' -> 1101.05).
income_JamaicaPlain_result_df = income_JamaicaPlain_result_df.astype({'census_tract': 'float64'})

In [None]:
# Check dtypes and non-null counts of the frame that will be exported.
income_JamaicaPlain_result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   year                      36 non-null     int64  
 1   Median income (dollars)   36 non-null     float64
 2   Mean income (dollars)     36 non-null     float64
 3   median_income_difference  33 non-null     float64
 4   mean_income_difference    33 non-null     float64
 5   census_tract              36 non-null     float64
dtypes: float64(5), int64(1)
memory usage: 1.8 KB


In [None]:
# Write the result frame to Google Drive as CSV.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column -- confirm that downstream readers expect it.
income_JamaicaPlain_result_df.to_csv('/content/drive/MyDrive/DS project/Jamaica_Plain_income/Jamaica_Plain_income.csv')