# Connect Drive

In [1]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
data_folder = "/content/drive/MyDrive/DS701/"  # Update this with your data folder path

# Define a list of DataFrame names from d10 to d21
data_frame_names = ['d10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16', 'd17', 'd18', 'd19', 'd20', 'd21']

for name in data_frame_names:
    # Load the DataFrame from a CSV file
    df = pd.read_csv(data_folder + '20{}_demo.csv'.format(name[1:]), index_col=False)

    # Transpose the DataFrame
    df = df.transpose()

    # Extract the first row as column labels
    new_columns = df.iloc[0]

    # Set the first row as the column labels
    df = df.iloc[1:]
    df = df.set_axis(new_columns, axis=1)

    # Reset the index
    df = df.reset_index()
    df.columns.names = ['']

    # Rename the 'index' column to 'Tract'
    df = df.rename(columns={"index": "Tract"})

    # Split 'Tract' column into 'Tract' and 'Info'
    df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)

    # Clean the 'Info' and 'Tract' columns
    df['Info'] = df['Info'].str.replace('Suffolk County, Massachusetts!!', '', regex=True)
    df['Tract'] = df['Tract'].str.replace('Census Tract ', '', regex=True)

    # Remove commas
    df = df.replace(',', '', regex=True)

    # Remove spaces from column names
    df.columns = df.columns.str.strip()

    # Define columns to keep
    cols_to_keep = ['Tract', 'Total population', 'Male', 'Female', 'White',
                    'Black or African American', 'American Indian and Alaska Native',
                    'Asian', 'Native Hawaiian and Other Pacific Islander', 'Some other race',
                    'Hispanic or Latino (of any race)', 'Info', 'Median age (years)']

    # Drop columns not in the 'cols_to_keep' list
    df = df.drop([col for col in df.columns if col not in cols_to_keep], axis=1)

    # Remove duplicated columns
    df = df.loc[:, ~df.columns.duplicated(keep='first')]

    # Remove specific strings from the 'Info' column
    df.Info = df.Info.str.strip()  # Removes leading space
    strings_to_remove = ["Estimate Margin of Error", "Margin of Error", "Percent", "Percent Margin of Error"]
    df = df[~df['Info'].isin(strings_to_remove)]
    df = df.drop('Info', axis=1)

    # Add Year column
    df['Year'] = '20{}'.format(name[1:])

    # Reorder columns
    leading_cols = ['Tract', 'Year']
    df = df[leading_cols + [col for col in df.columns if col not in leading_cols]]

    # Define the output filename - for exporting
    output_filename = data_folder + f"{name}_processed.csv"

    # Export the DataFrame to a CSV file
    df.to_csv(output_filename, index=False)

    print(f"{name} has been exported to {output_filename}")


  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)


d10 has been exported to /content/drive/MyDrive/DS701/d10_processed.csv
d11 has been exported to /content/drive/MyDrive/DS701/d11_processed.csv
d12 has been exported to /content/drive/MyDrive/DS701/d12_processed.csv
d13 has been exported to /content/drive/MyDrive/DS701/d13_processed.csv


  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)


d14 has been exported to /content/drive/MyDrive/DS701/d14_processed.csv
d15 has been exported to /content/drive/MyDrive/DS701/d15_processed.csv
d16 has been exported to /content/drive/MyDrive/DS701/d16_processed.csv
d17 has been exported to /content/drive/MyDrive/DS701/d17_processed.csv
d18 has been exported to /content/drive/MyDrive/DS701/d18_processed.csv
d19 has been exported to /content/drive/MyDrive/DS701/d19_processed.csv
d20 has been exported to /content/drive/MyDrive/DS701/d20_processed.csv
d21 has been exported to /content/drive/MyDrive/DS701/d21_processed.csv


  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)
  df[['Tract', 'Info']] = df['Tract'].str.split(',', 1, expand=True)


In [12]:
csv_files = [
    "/content/drive/MyDrive/DS701/d10_processed.csv",
    "/content/drive/MyDrive/DS701/d11_processed.csv",
    "/content/drive/MyDrive/DS701/d12_processed.csv",
    "/content/drive/MyDrive/DS701/d13_processed.csv",
    "/content/drive/MyDrive/DS701/d14_processed.csv",
    "/content/drive/MyDrive/DS701/d15_processed.csv",
    "/content/drive/MyDrive/DS701/d16_processed.csv",
    "/content/drive/MyDrive/DS701/d17_processed.csv",
    "/content/drive/MyDrive/DS701/d18_processed.csv",
    "/content/drive/MyDrive/DS701/d19_processed.csv",
    "/content/drive/MyDrive/DS701/d20_processed.csv",
    "/content/drive/MyDrive/DS701/d21_processed.csv"
]

# Initialize an empty list to store the DataFrames
data_frames = []

# Read each CSV file into a DataFrame and store it in the data_frames list
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    data_frames.append(df)

# Concatenate the DataFrames into a single DataFrame
demo_df = pd.concat(data_frames, ignore_index=True)

convert = {'Male': int,
           'Female': int,
           'White': int,
           'Black or African American': int,
           'Asian': int,
           'Some other race': int,
           'Hispanic or Latino (of any race)': int}

demo_df = demo_df.astype(convert)

# Display the concatenated DataFrame
demo_df.head()

Unnamed: 0,Tract,Year,Total population,Male,Female,Median age (years),White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Some other race,Hispanic or Latino (of any race)
0,801.0,2010,2724,1819,905,35.0,588,1492,52,81,0,421,430
1,803.0,2010,1739,640,1099,24.8,128,1374,0,0,0,134,360
2,805.0,2010,2744,1203,1541,28.3,730,1469,11,0,0,469,1166
3,806.01,2010,2349,1162,1187,22.3,1053,752,0,275,0,237,732
4,813.0,2010,5230,2690,2540,30.3,871,2503,0,308,0,1367,2441


In [13]:
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518 entries, 0 to 517
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Tract                                       518 non-null    float64
 1   Year                                        518 non-null    int64  
 2   Total population                            518 non-null    int64  
 3   Male                                        518 non-null    int64  
 4   Female                                      518 non-null    int64  
 5   Median age (years)                          518 non-null    float64
 6   White                                       518 non-null    int64  
 7   Black or African American                   518 non-null    int64  
 8   American Indian and Alaska Native           518 non-null    int64  
 9   Asian                                       518 non-null    int64  
 10  Native Hawaiia

In [14]:
demo_df.to_csv("/content/drive/MyDrive/DS701/demographics.csv", index=False)

In [15]:
demo_df.Tract.value_counts()

801.00     12
803.00     12
915.00     12
916.00     12
917.00     12
918.00     12
919.00     12
920.00     12
921.01     12
922.00     12
923.00     12
924.00     12
1001.00    12
1002.00    12
1003.00    12
1004.00    12
1005.00    12
1006.01    12
1006.03    12
1007.00    12
1008.00    12
914.00     12
913.00     12
912.00     12
821.00     12
805.00     12
806.01     12
814.00     12
815.00     12
817.00     12
818.00     12
819.00     12
911.00     12
820.00     12
901.00     12
902.00     12
903.00     12
904.00     12
906.00     12
907.00     12
909.01     12
910.01     12
813.00     10
813.01      2
813.02      2
Name: Tract, dtype: int64