# Clean Demographic Data

Data sourced from: https://data.census.gov/table/ACSDP5Y2012.DP05?g=1400000US25025070800,25025070801,25025070900,25025070901,25025080100,25025080300,25025080500,25025080601,25025081300,25025081301,25025081302,25025081400,25025081500,25025081700,25025081800,25025081900,25025082000,25025082100,25025090100,25025090200,25025090300,25025090400,25025090600,25025090700,25025090900,25025090901,25025091000,25025091001,25025091100,25025091200,25025091300,25025091400,25025091500,25025091600,25025091700,25025091800,25025091900,25025092000,25025092100,25025092101,25025092200,25025092300,25025092400,25025100100,25025100200,25025100300,25025100400,25025100500,25025100601,25025100602,25025100603,25025100700,25025100800&d=ACS+5-Year+Estimates+Data+Profiles

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/drive; every path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
data_folder = "/content/drive/MyDrive/DS701/"  # Update this with your data folder path

# Define a list of DataFrame names from d10 to d21
data_frame_names = ['d10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16', 'd17', 'd18', 'd19', 'd20', 'd21']

# Source column label -> output column name.
# The dict order is also the column order of the exported file (after 'tract' and 'year').
COLUMN_RENAMES = {
    'Total population': 'total_population',
    'Male': 'male',
    'Female': 'female',
    'Median age (years)': 'median_age',
    'White': 'race_white',
    'Black or African American': 'race_black',
    'American Indian and Alaska Native': 'race_ai_alaskan',
    'Asian': 'race_asian',
    'Native Hawaiian and Other Pacific Islander': 'race_hawaiian',
    'Some other race': 'race_other',
    'Hispanic or Latino (of any race)': 'race_hispanic',
}

# Values of the helper 'info' column that identify margin-of-error rows; those rows are dropped.
MARGIN_OF_ERROR_LABELS = ["Estimate Margin of Error", "Margin of Error", "Percent Margin of Error"]


def clean_demo_table(raw, year):
    """Reshape one raw yearly table into one row per (tract, measurement type).

    The raw frame is transposed so that its original column headers become rows,
    and its first original column supplies the new column labels.
    NOTE(review): this assumes the first CSV column holds the row labels and every
    other header looks like "Census Tract <id>, Suffolk County, Massachusetts!!<type>"
    -- confirm against the source files.

    Parameters
    ----------
    raw : pandas.DataFrame
        The table exactly as loaded from the yearly CSV file.
    year : str
        Value written to the 'year' column (e.g. '2010').

    Returns
    -------
    pandas.DataFrame
        Columns 'tract', 'year', then the values of COLUMN_RENAMES in order.
        Rows whose measurement type is a margin of error are removed.

    Raises
    ------
    ValueError
        If any label listed in COLUMN_RENAMES is absent from the table.
    """
    # Headers become rows; the first transposed row carries the labels for the new columns.
    table = raw.transpose()
    labels = table.iloc[0]
    table = table.iloc[1:].set_axis(labels, axis=1)

    # Move the former headers out of the index into a regular 'tract' column.
    table = table.reset_index()
    table.columns.names = ['']
    table = table.rename(columns={"index": "tract"})

    # Split at the first comma only: left part is the tract, the remainder goes to 'info'.
    # `n` must be passed by keyword; the positional form is deprecated and rejected by pandas 2.x.
    table[['tract', 'info']] = table['tract'].str.split(',', n=1, expand=True)

    # Both patterns are plain text, so they are matched literally rather than as regexes.
    table['info'] = table['info'].str.replace('Suffolk County, Massachusetts!!', '', regex=False)
    table['tract'] = table['tract'].str.replace('Census Tract ', '', regex=False)

    # Remove commas from every cell (e.g. thousands separators).
    table = table.replace(',', '', regex=True)

    # Strip surrounding whitespace from the column labels so they match COLUMN_RENAMES.
    table.columns = table.columns.str.strip()

    # Keep only the columns of interest; when a label repeats, the first occurrence wins.
    cols_to_keep = ['tract', 'info', *COLUMN_RENAMES]
    table = table.drop([col for col in table.columns if col not in cols_to_keep], axis=1)
    table = table.loc[:, ~table.columns.duplicated(keep='first')]

    # Fail loudly instead of exporting a file with missing or mislabelled columns.
    missing = [label for label in COLUMN_RENAMES if label not in table.columns]
    if missing:
        raise ValueError(f"{year}: expected columns not found in source table: {missing}")

    # Discard margin-of-error rows, then the helper column itself.
    table['info'] = table['info'].str.strip()  # Removes leading space
    table = table[~table['info'].isin(MARGIN_OF_ERROR_LABELS)]
    table = table.drop('info', axis=1)

    # Rename by label (not by position) so the result does not depend on source column order.
    table = table.rename(columns=COLUMN_RENAMES)
    table['year'] = year

    return table[['tract', 'year'] + list(COLUMN_RENAMES.values())]


for name in data_frame_names:
    # 'd10' -> '2010'
    year = '20{}'.format(name[1:])

    # Load the DataFrame from a CSV file
    raw_df = pd.read_csv(data_folder + '{}_demo.csv'.format(year), index_col=False)

    df = clean_demo_table(raw_df, year)

    # Define the output filename - for exporting
    output_filename = data_folder + f"{name}_processed.csv"

    # Export the DataFrame to a CSV file
    df.to_csv(output_filename, index=False)

    print(f"{name} has been exported to {output_filename}")


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d10 has been exported to /content/drive/MyDrive/DS701/d10_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d11 has been exported to /content/drive/MyDrive/DS701/d11_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d12 has been exported to /content/drive/MyDrive/DS701/d12_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d13 has been exported to /content/drive/MyDrive/DS701/d13_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d14 has been exported to /content/drive/MyDrive/DS701/d14_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d15 has been exported to /content/drive/MyDrive/DS701/d15_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d16 has been exported to /content/drive/MyDrive/DS701/d16_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d17 has been exported to /content/drive/MyDrive/DS701/d17_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d18 has been exported to /content/drive/MyDrive/DS701/d18_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d19 has been exported to /content/drive/MyDrive/DS701/d19_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d20 has been exported to /content/drive/MyDrive/DS701/d20_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d21 has been exported to /content/drive/MyDrive/DS701/d21_processed.csv


In [3]:
# Paths of the processed per-year files d10_processed.csv ... d21_processed.csv.
csv_files = [
    f"/content/drive/MyDrive/DS701/d{yy}_processed.csv" for yy in range(10, 22)
]

# Read each CSV file into a DataFrame and store it in the data_frames list
data_frames = [pd.read_csv(csv_file) for csv_file in csv_files]

# Concatenate the DataFrames into a single DataFrame with a fresh 0..n-1 index
demo_df = pd.concat(data_frames, ignore_index=True)

# Preview the first rows only instead of rendering the whole concatenated frame
demo_df.head(10)

Unnamed: 0,tract,year,total_population,male,female,median_age,race_white,race_black,race_ai_alaskan,race_asian,race_hawaiian,race_other,race_hispanic
0,804.01,2010,2415,921,1494,29.7,290,1443,33,131,0,396,749
1,804.01,2010,2415,38.1%,61.9%,(X),12.0%,59.8%,1.4%,5.4%,0.0%,16.4%,31.0%
2,804.01,2011,2384,845,1539,31.1,317,1274,35,134,0,453,904
3,804.01,2011,2384,35.4%,64.6%,(X),13.3%,53.4%,1.5%,5.6%,0.0%,19.0%,37.9%
4,804.01,2012,2514,918,1596,32.3,381,1453,30,137,0,394,849
5,804.01,2012,2514,36.5%,63.5%,(X),15.2%,57.8%,1.2%,5.4%,0.0%,15.7%,33.8%
6,804.01,2013,2543,1041,1502,32.0,493,1378,23,60,0,444,1080
7,804.01,2013,2543,40.9%,59.1%,(X),19.4%,54.2%,0.9%,2.4%,0.0%,17.5%,42.5%
8,804.01,2014,2839,1223,1616,32.2,547,1645,0,67,0,452,1082
9,804.01,2014,2839,43.1%,56.9%,(X),19.3%,57.9%,0.0%,2.4%,0.0%,15.9%,38.1%


In [4]:
# Replace '(X)' with NaN
demo_df = demo_df.replace('(X)', np.nan)

# Columns whose values are "NN.N%" strings on the percent rows
percent_cols = ['male', 'female', 'race_white', 'race_black',
                'race_ai_alaskan', 'race_asian', 'race_hawaiian',
                'race_other', 'race_hispanic']

# Strip the trailing '%' and convert to numbers; anything unparseable becomes NaN.
# NOTE: `.str` requires string columns, so this cell must run once on the freshly
# concatenated frame; re-running it on already-converted data raises instead of
# silently dividing by 100 a second time.
for col in percent_cols:
    demo_df[col] = pd.to_numeric(demo_df[col].str.rstrip('%'), errors='coerce')

# Convert median_age the same way, so a non-numeric placeholder becomes NaN instead of
# crashing a later float cast.
demo_df['median_age'] = pd.to_numeric(demo_df['median_age'], errors='coerce').astype(float)

# Fill a missing median_age from the preceding row of the same tract AND year.
# Grouping by tract alone would let one year's value leak into the next year.
demo_df['median_age'] = demo_df.groupby(['tract', 'year'])['median_age'].ffill()

# Each (tract, year) has several rows; keep the last one, which carries the percentages
# in the previewed data.
demo_df = demo_df.drop_duplicates(subset=['tract', 'year'], keep='last')

# Express percentages as fractions in [0, 1]
demo_df[percent_cols] = demo_df[percent_cols] / 100

demo_df.head()


Unnamed: 0,tract,year,total_population,male,female,median_age,race_white,race_black,race_ai_alaskan,race_asian,race_hawaiian,race_other,race_hispanic
1,804.01,2010,2415,0.381,0.619,29.7,0.12,0.598,0.014,0.054,0.0,0.164,0.31
3,804.01,2011,2384,0.354,0.646,31.1,0.133,0.534,0.015,0.056,0.0,0.19,0.379
5,804.01,2012,2514,0.365,0.635,32.3,0.152,0.578,0.012,0.054,0.0,0.157,0.338
7,804.01,2013,2543,0.409,0.591,32.0,0.194,0.542,0.009,0.024,0.0,0.175,0.425
9,804.01,2014,2839,0.431,0.569,32.2,0.193,0.579,0.0,0.024,0.0,0.159,0.381


In [5]:
# Print the column dtypes and non-null counts of the cleaned frame
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 1 to 23
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tract             12 non-null     float64
 1   year              12 non-null     int64  
 2   total_population  12 non-null     int64  
 3   male              12 non-null     float64
 4   female            12 non-null     float64
 5   median_age        12 non-null     float64
 6   race_white        12 non-null     float64
 7   race_black        12 non-null     float64
 8   race_ai_alaskan   12 non-null     float64
 9   race_asian        12 non-null     float64
 10  race_hawaiian     12 non-null     float64
 11  race_other        12 non-null     float64
 12  race_hispanic     12 non-null     float64
dtypes: float64(11), int64(2)
memory usage: 1.3 KB


In [6]:
# Write the cleaned frame to demographics.csv, leaving out the DataFrame index
demo_df.to_csv("/content/drive/MyDrive/DS701/demographics.csv", index=False)

In [7]:
# Count how many rows each tract contributes
demo_df['tract'].value_counts()

804.01    12
Name: tract, dtype: int64