In [37]:
import os
import sys
import pickle
import numpy as np

from cities.utils.cleaning_utils import find_repo_root

# Repository root as returned by find_repo_root(); later cells build their data/raw paths from it.
root = find_repo_root()


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from cities.utils.cleaning_utils import standardize_and_scale

import matplotlib.pyplot as plt



from cities.utils.data_grabber import DataGrabber
# Load the wide-format tables for every feature this notebook may inspect.
data = DataGrabber()
data.get_features_wide(
    [
        "gdp",
        "population",
        "transport",
        "spending_transportation",
        "spending_commerce",
        "spending_HHS",
    ]
)

# Only the GDP table is bound to a name here. The other loaded tables can be
# pulled the same way when needed, e.g. data.wide['population'],
# data.wide['transport'], data.wide['spending_transportation'],
# data.wide['spending_commerce'].
gdp = data.wide["gdp"]

def tableInfo(tableName):
    """Print a quick overview of a table that has a ``GeoFIPS`` column.

    Prints, in order: the first rows (``head()``), the column dtypes, the
    number of rows and the number of distinct ``GeoFIPS`` values.

    Args:
        tableName: despite the name, the table itself (an object exposing
            ``head()``, ``dtypes``, ``shape`` and a ``'GeoFIPS'`` column,
            e.g. a pandas DataFrame).

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: if the table has no ``GeoFIPS`` column.
    """
    print(tableName.head())
    print(tableName.dtypes)
    print(f"Number of rows: {tableName.shape[0]}")
    # Double quotes outside, single quotes inside: reusing the delimiter quote
    # inside an f-string expression is a SyntaxError before Python 3.12.
    print(f"Unique FIPS numbers {tableName['GeoFIPS'].nunique()}")

In [75]:
# metrolist.csv is read with header=None, so the four columns get
# placeholder names here; a later cell renames them.
column_names = ['col1', 'metro_area', 'col3', 'GeoName']
metro_areas = pd.read_csv(
    f"{root}/data/raw/metrolist.csv",
    header=None,
    names=column_names,
)

# Count the distinct metro-area labels and the distinct county labels.
unique_metro_areas = metro_areas['metro_area'].nunique()
unique_geo_names = metro_areas['GeoName'].nunique()

print(f"Number of unique metropolitan areas {unique_metro_areas}")
print(f"Number of unique Counties {unique_geo_names}")

Number of unique metropolitan areas 384
Number of unique Counties 1160


In [76]:
# Treat metrolist.csv as the source of truth for metro areas:
#   1. give the placeholder columns their real names,
#   2. order rows by metro FIPS, then county FIPS,
#   3. normalise the metro name to "<name> (MA)" by keeping only the text
#      before the first "(".
metro_areas = (
    metro_areas
    .rename(
        columns={
            'GeoName': 'CountyName',
            'col3': 'CountyFIPS',
            'col1': 'GeoFIPS',
            'metro_area': 'GeoName',
        }
    )
    .sort_values(by=['GeoFIPS', 'CountyFIPS'])
    .reset_index(drop=True)
    .assign(
        GeoName=lambda frame: (
            frame['GeoName']
            .astype(str)
            .str.extract(r'^(.*?)\s*\(', expand=False)
            .fillna('')  # a name without "(" ends up as just " (MA)"
            + ' (MA)'
        )
    )
)
metro_areas



Unnamed: 0,GeoFIPS,GeoName,CountyFIPS,CountyName
0,10180,"Abilene, TX (MA)",48059,"Callahan, TX"
1,10180,"Abilene, TX (MA)",48253,"Jones, TX"
2,10180,"Abilene, TX (MA)",48441,"Taylor, TX"
3,10420,"Akron, OH (MA)",39133,"Portage, OH"
4,10420,"Akron, OH (MA)",39153,"Summit, OH"
...,...,...,...,...
1155,49660,"Youngstown-Warren-Boardman, OH-PA (MA)",39155,"Trumbull, OH"
1156,49660,"Youngstown-Warren-Boardman, OH-PA (MA)",42085,"Mercer, PA"
1157,49700,"Yuba City, CA (MA)",6101,"Sutter, CA"
1158,49700,"Yuba City, CA (MA)",6115,"Yuba, CA"


In [None]:
# Compare the metro-area names with the GeoNames of the GDP table and keep
# the metro_areas rows whose name does not occur there.
geo_names_not_in_gdp = metro_areas.loc[
    lambda frame: ~frame['GeoName'].isin(gdp['GeoName'])
]
geo_names_not_in_gdp


In [4]:
import os
import requests
import json

# The BEA UserID is a credential: never hard-code it in the notebook, where it
# ends up in version control and in every shared copy. Export BEA_API_KEY in
# the environment before starting the kernel instead.
api_key = os.environ.get("BEA_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the BEA_API_KEY environment variable to your BEA API UserID"
    )

# Years 1993..2021 inclusive, comma separated.
years = ",".join(str(year) for year in range(1993, 2022))

# Table CAINC30, LineCode 100 stands for population.
url = f"https://apps.bea.gov/api/data/?UserID={api_key}&method=GetData&datasetname=Regional&TableName=CAINC30&LineCode=100&Year={years}&GeoFips=MSA&ResultFormat=json"
response = requests.get(url, timeout=60)

# Explicit check instead of `assert` (asserts vanish under `python -O`); the
# message deliberately omits the URL because it contains the API key.
if response.status_code != 200:
    raise RuntimeError(
        f"BEA API request failed with HTTP status {response.status_code}"
    )

# NOTE: this rebinds `data`, which held the DataGrabber in the first cell.
data = response.json()

try:
    data_series = data['BEAAPI']['Results']['Data']
except (KeyError, TypeError) as err:
    raise RuntimeError(
        "Unexpected BEA API response: no BEAAPI -> Results -> Data section"
    ) from err

# One row per returned record.
df = pd.DataFrame(data_series)


# metro_areas FIPS rules: always 5 digits, the last one is 0

In [7]:
# Peek at the first five raw records returned by the API.
df.head(5)

Unnamed: 0,Code,GeoFips,GeoName,TimePeriod,CL_UNIT,UNIT_MULT,DataValue,NoteRef
0,CAINC30-100,998,United States (Metropolitan Portion),1997,Number of persons,0,229081203,3
1,CAINC30-100,998,United States (Metropolitan Portion),2005,Number of persons,0,250646689,3
2,CAINC30-100,998,United States (Metropolitan Portion),2009,Number of persons,0,261210075,3
3,CAINC30-100,998,United States (Metropolitan Portion),2018,Number of persons,0,283180665,3
4,CAINC30-100,998,United States (Metropolitan Portion),2001,Number of persons,0,240717866,3


In [96]:
# Build the long-format metro-area population table from the raw API frame:
# keep only FIPS codes ending in "0", keep the four relevant columns under the
# project's column names, order by area and year, then fix the dtypes.
population_ma = (
    df.loc[
        df['GeoFips'].str.endswith('0'),
        ['GeoFips', 'GeoName', 'TimePeriod', 'DataValue'],
    ]
    .rename(columns={'GeoFips': 'GeoFIPS', 'TimePeriod': 'Year', 'DataValue': 'Value'})
    .sort_values(by=['GeoFIPS', 'Year'])  # still strings at this point, as before
    .reset_index(drop=True)
    .astype({'Value': float, 'GeoName': str, 'GeoFIPS': np.int64})
)

population_ma

# "MA" is used below as the abbreviation for "Metropolitan Statistical Area".

Unnamed: 0,GeoFIPS,GeoName,Year,Value
0,10180,"Abilene, TX (Metropolitan Statistical Area)",1993,152909.0
1,10180,"Abilene, TX (Metropolitan Statistical Area)",1994,153779.0
2,10180,"Abilene, TX (Metropolitan Statistical Area)",1995,156097.0
3,10180,"Abilene, TX (Metropolitan Statistical Area)",1996,156351.0
4,10180,"Abilene, TX (Metropolitan Statistical Area)",1997,157405.0
...,...,...,...,...
11131,49740,"Yuma, AZ (Metropolitan Statistical Area) *",2017,199915.0
11132,49740,"Yuma, AZ (Metropolitan Statistical Area) *",2018,200572.0
11133,49740,"Yuma, AZ (Metropolitan Statistical Area) *",2019,202099.0
11134,49740,"Yuma, AZ (Metropolitan Statistical Area) *",2020,204528.0


In [97]:
# Shorten the area label to "<name> (MA)": keep the text before the first "("
# (empty when there is none) and append the short tag.
population_ma = population_ma.assign(
    GeoName=lambda frame: (
        frame['GeoName'].str.extract(r'^(.*?)\s*\(', expand=False).fillna('')
        + ' (MA)'
    )
)

population_ma

Unnamed: 0,GeoFIPS,GeoName,Year,Value
0,10180,"Abilene, TX (MA)",1993,152909.0
1,10180,"Abilene, TX (MA)",1994,153779.0
2,10180,"Abilene, TX (MA)",1995,156097.0
3,10180,"Abilene, TX (MA)",1996,156351.0
4,10180,"Abilene, TX (MA)",1997,157405.0
...,...,...,...,...
11131,49740,"Yuma, AZ (MA)",2017,199915.0
11132,49740,"Yuma, AZ (MA)",2018,200572.0
11133,49740,"Yuma, AZ (MA)",2019,202099.0
11134,49740,"Yuma, AZ (MA)",2020,204528.0


In [125]:
# Long -> wide: one row per (GeoFIPS, GeoName), one column per year.
population_wide = (
    population_ma
    .pivot(index=['GeoFIPS', 'GeoName'], columns='Year', values='Value')
    .reset_index()
    .rename_axis(None, axis='columns')  # drop the leftover "Year" columns label
)
population_wide




Unnamed: 0,GeoFIPS,GeoName,1993,1994,1995,1996,1997,1998,1999,2000,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,10180,"Abilene, TX (MA)",152909.0,153779.0,156097.0,156351.0,157405.0,158264.0,159755.0,160288.0,...,168246.0,168614.0,169859.0,171579.0,172242.0,172915.0,174005.0,175187.0,176866.0,177829.0
1,10420,"Akron, OH (MA)",674114.0,678063.0,682146.0,687264.0,689461.0,691039.0,693125.0,695946.0,...,702245.0,703809.0,705134.0,704664.0,703862.0,704229.0,703925.0,703361.0,701625.0,696225.0
2,10500,"Albany, GA (MA)",148122.0,149356.0,150100.0,151596.0,152692.0,153449.0,153555.0,153585.0,...,154669.0,153680.0,153263.0,152018.0,151044.0,150387.0,150440.0,149530.0,148244.0,144922.0
3,10540,"Albany-Lebanon, OR (MA)",95496.0,96919.0,98853.0,100582.0,102054.0,102770.0,103462.0,103020.0,...,117865.0,117822.0,118285.0,119295.0,121648.0,123657.0,125788.0,127700.0,128978.0,129948.0
4,10580,"Albany-Schenectady-Troy, NY (MA)",827812.0,830817.0,830439.0,828007.0,824711.0,823712.0,824119.0,827399.0,...,879397.0,883854.0,886981.0,889918.0,892762.0,897172.0,899605.0,898791.0,899748.0,905369.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,49420,"Yakima, WA (MA)",204266.0,208963.0,212601.0,214951.0,217201.0,219748.0,221573.0,222615.0,...,247072.0,247853.0,248660.0,250020.0,252180.0,253166.0,254420.0,255815.0,256702.0,256647.0
380,49620,"York-Hanover, PA (MA)",356156.0,361092.0,365997.0,369781.0,372706.0,375810.0,378905.0,382743.0,...,438872.0,440954.0,443127.0,445022.0,447735.0,450261.0,453380.0,454912.0,456692.0,459148.0
381,49660,"Youngstown-Warren-Boardman, OH-PA (MA)",618328.0,617253.0,615595.0,614369.0,611902.0,609286.0,605978.0,602227.0,...,560540.0,558798.0,556915.0,553591.0,550063.0,547497.0,545030.0,542582.0,540211.0,537837.0
382,49700,"Yuba City, CA (MA)",131950.0,134518.0,135323.0,136160.0,136425.0,137016.0,138097.0,139564.0,...,167918.0,168982.0,170146.0,171626.0,173529.0,176069.0,177586.0,179785.0,181458.0,182254.0


In [87]:
# Sanity check on the long table: the row count must equal
# (number of distinct years) x (number of distinct metro-area FIPS codes).
unique_geo_fips = population_ma['GeoFIPS'].nunique()
unique_years = population_ma['Year'].nunique()
total_rows = len(population_ma)

assert total_rows == (unique_years * unique_geo_fips)

In [84]:
# The population table and the metro list should contain the same number of distinct metro FIPS codes.
population_ma['GeoFIPS'].nunique() == metro_areas['GeoFIPS'].nunique() # 384 when this cell was last run

True

In [85]:
# Population rows whose FIPS code does not occur in metro_areas.
# This frame should be empty.
ma_not_in_population = population_ma.loc[
    lambda frame: ~frame['GeoFIPS'].isin(metro_areas['GeoFIPS'])
]
ma_not_in_population

Unnamed: 0,GeoFIPS,GeoName,Year,Value


In [None]:
def clean_variable(variable_name, path_to_raw_csv, YearOrCategory="Year", region_type: str = "county"):
    """Clean a generic wide-format csv and save it in four processed formats.

    The raw csv is expected to be wide, with the columns
    ``GeoFIPS, GeoName, 2001, 2002, 2003, ...``.

    For ``region_type="county"`` the function:

    1. loads the csv, drops rows containing NaNs and casts ``GeoFIPS`` to int;
    2. loads the wide GDP table through ``DataGrabber``;
    3. if the GDP table contains FIPS codes missing from the variable, appends
       them to ``data/raw/exclusions.csv``, calls ``clean_gdp()`` and re-runs
       itself with the same arguments;
    4. otherwise restricts the variable to the FIPS codes shared with GDP,
       casts every value column to float and writes
       ``<variable_name>_wide.csv``, ``_long.csv``, ``_std_wide.csv`` and
       ``_std_long.csv`` into ``data/processed``.

    For ``region_type="MA"`` only the metro list is loaded; the actual
    metro-area cleaning is still a TODO.

    Args:
        variable_name: name used as the file-name prefix of the outputs and as
            the ``dataset`` value of new exclusion rows.
        path_to_raw_csv: path of the raw wide-format csv.
        YearOrCategory: name of the column that holds the former column
            headers in the long (melted) outputs.
        region_type: ``"county"`` or ``"MA"``.

    Raises:
        ValueError: if ``region_type`` is neither ``"county"`` nor ``"MA"``.
    """
    if region_type not in ("county", "MA"):
        raise ValueError("region_type must be either 'county' or 'MA'")

    # Build paths from relative components. os.path.join(root, "/data/...")
    # silently discards `root`, because an absolute component resets the join.
    raw_dir = os.path.join(root, "data", "raw")
    processed_dir = os.path.join(root, "data", "processed")

    # Load the raw csv; drop incomplete rows first so the int cast below
    # cannot fail on a NaN in GeoFIPS.
    variable_db = pd.read_csv(path_to_raw_csv).dropna()
    variable_db["GeoFIPS"] = variable_db["GeoFIPS"].astype(int)

    if region_type == "MA":
        # TODO: metro-area cleaning is not implemented yet. Only the metro
        # list is loaded so far and nothing is written.
        metrolist = pd.read_csv(os.path.join(raw_dir, "metrolist.csv"))
        return

    # Load GDP to get the list of current, non-excluded FIPS codes.
    grabber = DataGrabber()
    grabber.get_features_wide(["gdp"])
    gdp = grabber.wide["gdp"]

    # FIPS codes present in GDP but missing from this variable must be excluded
    # everywhere, so record them and regenerate GDP before continuing.
    new_exclusions = np.setdiff1d(
        gdp["GeoFIPS"].unique(), variable_db["GeoFIPS"].unique()
    )
    if len(new_exclusions) > 0:
        # Imported here because no cell above this definition imports it.
        from cities.utils.clean_gdp import clean_gdp

        print("Adding new exclusions to exclusions.csv: " + str(new_exclusions))

        exclusions_path = os.path.join(raw_dir, "exclusions.csv")
        new_rows = pd.DataFrame(
            {
                "dataset": [variable_name] * len(new_exclusions),
                "exclusions": new_exclusions,
            }
        )
        exclusions = (
            pd.concat([pd.read_csv(exclusions_path), new_rows], ignore_index=True)
            .drop_duplicates()
            .sort_values(by=["dataset", "exclusions"])
            .reset_index(drop=True)
        )
        exclusions.to_csv(exclusions_path, index=False)

        print("Rerunning gdp cleaning with new exclusions")
        clean_gdp()

        # Re-run with the caller's options; previously YearOrCategory and
        # region_type were dropped and silently fell back to their defaults.
        # NOTE(review): this recurses until the GDP table no longer contains
        # the missing codes; confirm clean_gdp() honours exclusions.csv.
        return clean_variable(
            variable_name, path_to_raw_csv, YearOrCategory, region_type
        )

    # Restrict to the FIPS codes shared with GDP.
    common_fips = np.intersect1d(
        gdp["GeoFIPS"].unique(), variable_db["GeoFIPS"].unique()
    )
    variable_db = variable_db[variable_db["GeoFIPS"].isin(common_fips)]
    variable_db = variable_db.merge(
        gdp[["GeoFIPS", "GeoName"]], on=["GeoFIPS", "GeoName"], how="left"
    )
    variable_db = variable_db.sort_values(by=["GeoFIPS", "GeoName"])

    # Make sure it passes the test data.wide[feature][column].dtype == float
    id_columns = ["GeoFIPS", "GeoName"]
    variable_db = variable_db.astype(
        {column: float for column in variable_db.columns if column not in id_columns}
    )

    # Build the four output tables: raw/standardized x wide/long.
    variable_db_wide = variable_db.copy()
    variable_db_long = pd.melt(
        variable_db,
        id_vars=id_columns,
        var_name=YearOrCategory,
        value_name="Value",
    )
    variable_db_std_wide = standardize_and_scale(variable_db)
    variable_db_std_long = pd.melt(
        variable_db_std_wide.copy(),
        id_vars=id_columns,
        var_name=YearOrCategory,
        value_name="Value",
    )

    outputs = {
        "_wide.csv": variable_db_wide,
        "_long.csv": variable_db_long,
        "_std_wide.csv": variable_db_std_wide,
        "_std_long.csv": variable_db_std_long,
    }
    for suffix, frame in outputs.items():
        frame.to_csv(os.path.join(processed_dir, variable_name + suffix), index=False)

In [None]:
import os
import pandas as pd
from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber
from cities.utils.clean_gdp import clean_gdp

class VariableCleaner:
    """Clean one raw wide-format variable csv and save it in four formats.

    The raw csv is expected to have the columns ``GeoFIPS, GeoName`` followed
    by one value column per year/category. ``clean_variable()`` runs the whole
    pipeline; the other methods are its individual steps.

    Args:
        variable_name: output file-name prefix and ``dataset`` label used for
            new rows in ``data/raw/exclusions.csv``.
        path_to_raw_csv: path of the raw csv to clean.
        YearOrCategory: name of the column holding the former column headers
            in the long (melted) outputs.
        region_type: ``"county"`` or ``"MA"`` (metro areas).
    """

    # Column names for metrolist.csv, which has no header row. They match the
    # names the exploratory cells of this notebook give to the same columns.
    METRO_COLUMNS = ["GeoFIPS", "GeoName", "CountyFIPS", "CountyName"]

    def __init__(self, variable_name, path_to_raw_csv, YearOrCategory="Year", region_type="county"):
        self.variable_name = variable_name
        self.path_to_raw_csv = path_to_raw_csv
        self.YearOrCategory = YearOrCategory
        self.region_type = region_type
        self.root = find_repo_root()
        self.data_grabber = DataGrabber()
        self.gdp = None
        self.variable_db = None
        self.metro = None

    def _data_path(self, *parts):
        """Return ``<root>/data/<parts...>``.

        Components are joined without a leading slash on purpose:
        ``os.path.join(root, "/data/...")`` would discard ``root``.
        """
        return os.path.join(self.root, "data", *parts)

    def clean_variable(self):
        """Run the full cleaning pipeline for ``self.region_type``.

        Raises:
            ValueError: if ``region_type`` is neither ``"county"`` nor ``"MA"``.
        """
        if self.region_type not in ("county", "MA"):
            raise ValueError("region_type must be either 'county' or 'MA'")

        self.load_raw_csv()
        self.drop_nans()
        if self.region_type == "county":
            self.load_gdp_data()
            if self.check_exclusions():
                # check_exclusions() already re-ran the whole pipeline, which
                # restricted and saved the data; doing it again here would
                # only repeat that work on stale state.
                return
            self.restrict_common_fips()
        else:
            self.process_MA_data()
            # self.check_exclusions('MA') functionality to implement in the future
        self.save_csv_files(self.region_type)

    def load_raw_csv(self):
        """Read the raw csv into ``self.variable_db`` and cast GeoFIPS to int."""
        self.variable_db = pd.read_csv(self.path_to_raw_csv)
        self.variable_db["GeoFIPS"] = self.variable_db["GeoFIPS"].astype(int)

    def drop_nans(self):
        """Drop every row of ``self.variable_db`` that contains a NaN."""
        self.variable_db = self.variable_db.dropna()

    def load_metro_areas(self):
        """Load ``data/raw/metrolist.csv`` into ``self.metro`` and return it.

        The file has no header row, so the column names are supplied here.
        """
        self.metro = pd.read_csv(
            self._data_path("raw", "metrolist.csv"),
            header=None,
            names=self.METRO_COLUMNS,
        )
        return self.metro

    def load_gdp_data(self):
        """Load the wide GDP table into ``self.gdp`` through the DataGrabber."""
        self.data_grabber.get_features_wide(["gdp"])
        self.gdp = self.data_grabber.wide["gdp"]

    def _missing_fips(self):
        """Return the FIPS codes present in GDP but absent from the variable."""
        return np.setdiff1d(self.gdp["GeoFIPS"].unique(), self.variable_db["GeoFIPS"].unique())

    def check_exclusions(self):
        """Record newly missing FIPS codes and re-run the pipeline if needed.

        When GDP contains FIPS codes that the variable lacks, they are added
        to exclusions.csv, ``clean_gdp()`` is called and ``clean_variable()``
        is run again from the start.

        Returns:
            True if the pipeline was re-run (the caller must then stop),
            False if nothing was missing.
        """
        if len(self._missing_fips()) == 0:
            return False
        self.add_new_exclusions()
        clean_gdp()
        # NOTE(review): this recurses until GDP no longer contains the missing
        # codes; confirm clean_gdp() honours exclusions.csv.
        self.clean_variable()
        return True

    def add_new_exclusions(self, common_fips=None):
        """Append the FIPS codes missing from the variable to exclusions.csv.

        Args:
            common_fips: unused; accepted only so existing calls keep working.
        """
        new_exclusions = self._missing_fips()
        print("Adding new exclusions to exclusions.csv: " + str(new_exclusions))
        exclusions_path = self._data_path("raw", "exclusions.csv")
        new_rows = pd.DataFrame({"dataset": [self.variable_name] * len(new_exclusions), "exclusions": new_exclusions})
        exclusions = (
            pd.concat([pd.read_csv(exclusions_path), new_rows], ignore_index=True)
            .drop_duplicates()
            .sort_values(by=["dataset", "exclusions"])
            .reset_index(drop=True)
        )
        exclusions.to_csv(exclusions_path, index=False)
        print("Rerunning gdp cleaning with new exclusions")

    def restrict_common_fips(self):
        """Keep only FIPS codes shared with GDP and cast value columns to float."""
        common_fips = np.intersect1d(self.gdp["GeoFIPS"].unique(), self.variable_db["GeoFIPS"].unique())
        self.variable_db = self.variable_db[self.variable_db["GeoFIPS"].isin(common_fips)]
        self.variable_db = self.variable_db.merge(self.gdp[["GeoFIPS", "GeoName"]], on=["GeoFIPS", "GeoName"], how="left")
        self.variable_db = self.variable_db.sort_values(by=["GeoFIPS", "GeoName"])
        for column in self.variable_db.columns:
            if column not in ["GeoFIPS", "GeoName"]:
                self.variable_db[column] = self.variable_db[column].astype(float)

    def process_MA_data(self):
        """Check the variable against the metro list and fix the FIPS dtype.

        Raises:
            AssertionError: if the variable and the metro list do not contain
                the same number of distinct GeoFIPS codes.
        """
        metro_areas = self.load_metro_areas()
        assert metro_areas['GeoFIPS'].nunique() == self.variable_db['GeoFIPS'].nunique(), \
            "variable and metrolist.csv disagree on the number of metro areas"
        self.variable_db['GeoFIPS'] = self.variable_db['GeoFIPS'].astype(np.int64)

    def save_csv_files(self, regions):
        """Write the wide, long, std_wide and std_long csv files.

        Args:
            regions: ``"county"`` or ``"MA"``; selects the file-name prefix.

        Raises:
            ValueError: for any other value of ``regions``.
        """
        if regions == 'county':
            prefix = self.variable_name + "_"
        elif regions == 'MA':
            # NOTE(review): this yields e.g. "<name>ma_wide.csv"; an underscore
            # before "ma" may have been intended. Kept unchanged.
            prefix = self.variable_name + "ma_"
        else:
            raise ValueError("region_type must be either 'county' or 'MA'")

        id_columns = ["GeoFIPS", "GeoName"]
        variable_db_wide = self.variable_db.copy()
        variable_db_long = pd.melt(self.variable_db, id_vars=id_columns, var_name=self.YearOrCategory, value_name="Value")
        variable_db_std_wide = standardize_and_scale(self.variable_db)
        variable_db_std_long = pd.melt(variable_db_std_wide.copy(), id_vars=id_columns, var_name=self.YearOrCategory, value_name="Value")

        outputs = {
            "wide.csv": variable_db_wide,
            "long.csv": variable_db_long,
            "std_wide.csv": variable_db_std_wide,
            "std_long.csv": variable_db_std_long,
        }
        for suffix, frame in outputs.items():
            frame.to_csv(self._data_path("processed", prefix + suffix), index=False)

    


In [None]:
# Usage example. The path below is a placeholder: point path_to_raw_csv at a
# real wide-format csv (GeoFIPS, GeoName, then the value columns) before running.

cleaner = VariableCleaner(variable_name="example_variable",
                          path_to_raw_csv="path/to/raw/csv/file.csv", YearOrCategory="Year", region_type="county")
cleaner.clean_variable()
