# Clean Demographic Data

Data sourced from: https://data.census.gov/table/ACSDP5Y2012.DP05?g=1400000US25025070800,25025070801,25025070900,25025070901,25025080100,25025080300,25025080500,25025080601,25025081300,25025081301,25025081302,25025081400,25025081500,25025081700,25025081800,25025081900,25025082000,25025082100,25025090100,25025090200,25025090300,25025090400,25025090600,25025090700,25025090900,25025090901,25025091000,25025091001,25025091100,25025091200,25025091300,25025091400,25025091500,25025091600,25025091700,25025091800,25025091900,25025092000,25025092100,25025092101,25025092200,25025092300,25025092400,25025100100,25025100200,25025100300,25025100400,25025100500,25025100601,25025100602,25025100603,25025100700,25025100800&d=ACS+5-Year+Estimates+Data+Profiles

In [1]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data_folder = "/content/drive/MyDrive/DS701/"  # Update this with your data folder path

# Define a list of DataFrame names from d10 to d21
data_frame_names = ['d10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16', 'd17', 'd18', 'd19', 'd20', 'd21']

for name in data_frame_names:
    # Load the DataFrame from a CSV file
    df = pd.read_csv(data_folder + '20{}_demo.csv'.format(name[1:]), index_col=False)

    # Transpose the DataFrame
    df = df.transpose()

    # Extract the first row as column labels
    new_columns = df.iloc[0]

    # Set the first row as the column labels
    df = df.iloc[1:]
    df = df.set_axis(new_columns, axis=1)

    # Reset the index
    df = df.reset_index()
    df.columns.names = ['']

    # Rename the 'index' column to 'tract'
    df = df.rename(columns={"index": "tract"})

    # Split 'tract' column into 'tract' and 'info'
    df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)

    # Clean the 'info' and 'tract' columns
    df['info'] = df['info'].str.replace('Suffolk County, Massachusetts!!', '', regex=True)
    df['tract'] = df['tract'].str.replace('Census Tract ', '', regex=True)

    # Remove commas
    df = df.replace(',', '', regex=True)

    # Remove spaces from column names
    df.columns = df.columns.str.strip()

    # Define columns to keep
    cols_to_keep = ['tract', 'Total population', 'Male', 'Female', 'White',
                    'Black or African American', 'American Indian and Alaska Native',
                    'Asian', 'Native Hawaiian and Other Pacific Islander', 'Some other race',
                    'Hispanic or Latino (of any race)', 'info', 'Median age (years)']

    # Drop columns not in the 'cols_to_keep' list
    df = df.drop([col for col in df.columns if col not in cols_to_keep], axis=1)

    # Remove duplicated columns
    df = df.loc[:, ~df.columns.duplicated(keep='first')]

    # Remove specific strings from the 'info' column
    df['info'] = df['info'].str.strip()  # Removes leading space
    strings_to_remove = ["Estimate Margin of Error", "Margin of Error", "Percent", "Percent Margin of Error"]
    df = df[~df['info'].isin(strings_to_remove)]
    df = df.drop('info', axis=1)

    # Add year column
    df['year'] = '20{}'.format(name[1:])

    # Reorder columns
    leading_cols = ['tract', 'year']
    df = df[leading_cols + [col for col in df.columns if col not in leading_cols]]

    # Rename columns
    df.columns = ['tract', 'year', 'total_population', 'male', 'female', 'median_age',
                  'race_white', 'race_black', 'race_ai_alaskan', 'race_asian',
                  'race_hawaiian', 'race_other', 'race_hispanic']

    # Define the output filename - for exporting
    output_filename = data_folder + f"{name}_processed.csv"

    # Export the DataFrame to a CSV file
    df.to_csv(output_filename, index=False)

    print(f"{name} has been exported to {output_filename}")


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d10 has been exported to /content/drive/MyDrive/DS701/d10_processed.csv
d11 has been exported to /content/drive/MyDrive/DS701/d11_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d12 has been exported to /content/drive/MyDrive/DS701/d12_processed.csv
d13 has been exported to /content/drive/MyDrive/DS701/d13_processed.csv
d14 has been exported to /content/drive/MyDrive/DS701/d14_processed.csv
d15 has been exported to /content/drive/MyDrive/DS701/d15_processed.csv
d16 has been exported to /content/drive/MyDrive/DS701/d16_processed.csv
d17 has been exported to /content/drive/MyDrive/DS701/d17_processed.csv
d18 has been exported to /content/drive/MyDrive/DS701/d18_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


d19 has been exported to /content/drive/MyDrive/DS701/d19_processed.csv
d20 has been exported to /content/drive/MyDrive/DS701/d20_processed.csv
d21 has been exported to /content/drive/MyDrive/DS701/d21_processed.csv


  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)
  df[['tract', 'info']] = df['tract'].str.split(',', 1, expand=True)


In [6]:
csv_files = [
    "/content/drive/MyDrive/DS701/d10_processed.csv",
    "/content/drive/MyDrive/DS701/d11_processed.csv",
    "/content/drive/MyDrive/DS701/d12_processed.csv",
    "/content/drive/MyDrive/DS701/d13_processed.csv",
    "/content/drive/MyDrive/DS701/d14_processed.csv",
    "/content/drive/MyDrive/DS701/d15_processed.csv",
    "/content/drive/MyDrive/DS701/d16_processed.csv",
    "/content/drive/MyDrive/DS701/d17_processed.csv",
    "/content/drive/MyDrive/DS701/d18_processed.csv",
    "/content/drive/MyDrive/DS701/d19_processed.csv",
    "/content/drive/MyDrive/DS701/d20_processed.csv",
    "/content/drive/MyDrive/DS701/d21_processed.csv"
]

# Initialize an empty list to store the DataFrames
data_frames = []

# Read each CSV file into a DataFrame and store it in the data_frames list
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    data_frames.append(df)

# Concatenate the DataFrames into a single DataFrame
demo_df = pd.concat(data_frames, ignore_index=True)

convert = {'male': int,
           'female': int,
           'race_white': int,
           'race_black': int,
           'race_asian': int,
           'race_other': int,
           'race_hispanic': int}

demo_df = demo_df.astype(convert)

# Display the concatenated DataFrame
demo_df.head()

Unnamed: 0,tract,year,total_population,male,female,median_age,race_white,race_black,race_ai_alaskan,race_asian,race_hawaiian,race_other,race_hispanic
0,708.0,2010,3436,1757,1679,34.3,2150,856,14,163,0,201,365
1,709.0,2010,3057,1428,1629,36.0,1670,755,10,191,0,294,648
2,801.0,2010,2724,1819,905,35.0,588,1492,52,81,0,421,430
3,803.0,2010,1739,640,1099,24.8,128,1374,0,0,0,134,360
4,805.0,2010,2744,1203,1541,28.3,730,1469,11,0,0,469,1166


In [7]:
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542 entries, 0 to 541
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tract             542 non-null    float64
 1   year              542 non-null    int64  
 2   total_population  542 non-null    int64  
 3   male              542 non-null    int64  
 4   female            542 non-null    int64  
 5   median_age        542 non-null    float64
 6   race_white        542 non-null    int64  
 7   race_black        542 non-null    int64  
 8   race_ai_alaskan   542 non-null    int64  
 9   race_asian        542 non-null    int64  
 10  race_hawaiian     542 non-null    int64  
 11  race_other        542 non-null    int64  
 12  race_hispanic     542 non-null    int64  
dtypes: float64(2), int64(11)
memory usage: 55.2 KB


In [8]:
demo_df.to_csv("/content/drive/MyDrive/DS701/demographics.csv", index=False)

In [10]:
demo_df.tract.value_counts()

913.00     12
920.00     12
912.00     12
1007.00    12
914.00     12
915.00     12
916.00     12
917.00     12
918.00     12
919.00     12
921.01     12
1006.03    12
922.00     12
923.00     12
924.00     12
1001.00    12
1002.00    12
1003.00    12
1004.00    12
1005.00    12
911.00     12
910.01     12
909.01     12
907.00     12
801.00     12
803.00     12
805.00     12
806.01     12
1008.00    12
814.00     12
815.00     12
817.00     12
818.00     12
819.00     12
820.00     12
821.00     12
901.00     12
902.00     12
903.00     12
904.00     12
906.00     12
1006.01    12
708.00     10
709.00     10
813.00     10
708.01      2
709.01      2
813.01      2
813.02      2
Name: tract, dtype: int64