In [1]:
import sqlite3
import csv
import os
from cities.utils.cleaning_utils import find_repo_root, standardize_and_scale
import pandas as pd
from cities.utils.data_grabber import list_available_features, DataGrabber
from typing import List
import time
import numpy as np
# Base directory used to build every data path in this notebook
# (presumably the repository's top-level folder -- confirm against find_repo_root).
repo_root = find_repo_root()


In [16]:
# Functions to read and write to a SQLite database
def add_variable_table_to_db(variable_name, database_path):
    """(Re)create the SQLite table for one variable from its long-format CSV.

    Reads ``<repo_root>/data/processed/<variable_name>_long.csv``, validates
    its layout, drops any existing table called ``variable_name`` and creates
    a fresh one with a composite primary key on ``(GeoFIPS, <category column>)``
    before inserting the rows.

    Parameters
    ----------
    variable_name : str
        Name of the variable; used both for the CSV file name and the table name.
    database_path : str
        Path of the SQLite database file (created by sqlite3 if it does not exist).

    Raises
    ------
    AssertionError
        If the CSV does not have exactly four columns starting with
        ``GeoFIPS`` and ``GeoName``.
    sqlite3.IntegrityError
        If the CSV contains two rows with the same ``(GeoFIPS, category)`` pair,
        since that pair is the table's primary key.
    """
    csv_path = os.path.join(repo_root, 'data', 'processed', variable_name + '_long.csv')
    df = pd.read_csv(csv_path)

    # check data format
    cols = df.columns
    assert len(cols) == 4, 'Data should have 4 columns: GeoFIPS, GeoName, Year/Category, Value'
    assert cols[0] == 'GeoFIPS' , 'First column should be GeoFIPS'
    assert cols[1] == 'GeoName' , 'Second column should be GeoName'
    col_name_category = cols[2] # sometimes this is year, sometimes category
    col_name_value = cols[3] # usually this is value, sometimes more specific

    def quote_identifier(identifier):
        # Double-quote an SQL identifier so unusual names (spaces, dashes,
        # keywords) cannot break or alter the statement.
        return '"' + str(identifier).replace('"', '""') + '"'

    def sqlite_type(series):
        # Derive the column type from the pandas dtype so values come back
        # from the database with the dtype they had in the CSV.
        # is_integer_dtype is used instead of `dtype == 'int'`, whose meaning
        # depends on the platform's default integer width.
        if pd.api.types.is_integer_dtype(series):
            return 'INTEGER'
        if pd.api.types.is_float_dtype(series):
            return 'REAL'
        return 'TEXT'

    table = quote_identifier(variable_name)
    category_col = quote_identifier(col_name_category)
    value_col = quote_identifier(col_name_value)
    column_defs = [
        'GeoFIPS INTEGER',
        'GeoName TEXT',
        f'{category_col} {sqlite_type(df[col_name_category])}',
        f'{value_col} {sqlite_type(df[col_name_value])}',
        f'PRIMARY KEY (GeoFIPS, {category_col})',
    ]

    conn = sqlite3.connect(database_path)
    try:
        cursor = conn.cursor()

        # Start from a clean slate so re-running the cell is idempotent.
        cursor.execute('DROP TABLE IF EXISTS ' + table)
        cursor.execute(f'CREATE TABLE {table} ({", ".join(column_defs)})')

        # 'append' keeps the schema created above; 'replace' would drop the
        # table again and discard the primary key.
        df.to_sql(variable_name, conn, if_exists='append', index=False)

        conn.commit()
    finally:
        # Always release the database handle, even if validation of the
        # rows or the insert fails.
        conn.close()

def read_variable_table_from_db(variable_name, database_path):
    """Load an entire variable table from the SQLite database.

    Parameters
    ----------
    variable_name : str
        Name of the table to read.
    database_path : str
        Path of the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        Every row and column of the table, as returned by ``SELECT *``.
    """
    # Quote the table name so names with spaces, dashes or SQL keywords work.
    table = '"' + str(variable_name).replace('"', '""') + '"'

    conn = sqlite3.connect(database_path)
    try:
        return pd.read_sql_query('SELECT * FROM ' + table, conn)
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

In [17]:
# Smoke test: write one variable (GDP) into the database, then load it straight back.
variable_name = 'gdp'
database_path = os.path.join(repo_root, 'data', 'us_counties.db')

add_variable_table_to_db(variable_name=variable_name, database_path=database_path)
df = read_variable_table_from_db(variable_name=variable_name, database_path=database_path)
df.head()


Unnamed: 0,GeoFIPS,GeoName,Year,Value
0,1001,"Autauga, AL",2001,59.839
1,1003,"Baldwin, AL",2001,73.853
2,1005,"Barbour, AL",2001,113.864
3,1007,"Bibb, AL",2001,80.443
4,1009,"Blount, AL",2001,92.104


In [18]:
# Populate the database: one table per available feature.
# Each table is read back immediately and its first rows printed as a sanity check.
feature_list = list_available_features()
database_path = os.path.join(repo_root, 'data', 'us_counties.db')
for variable_name in feature_list:
    print(variable_name)
    add_variable_table_to_db(variable_name=variable_name, database_path=database_path)
    df = read_variable_table_from_db(variable_name=variable_name, database_path=database_path)
    print(df.head())


# Note, data/processed/industry_long.csv has a slight bug: 'Year' is a column name, when it should be 'Category'

unemployment_rate
   GeoFIPS             GeoName  Year  Value
0     1001  Autauga County, AL  1990    6.5
1     1003  Baldwin County, AL  1990    5.3
2     1005  Barbour County, AL  1990    7.9
3     1007     Bibb County, AL  1990    9.2
4     1009   Blount County, AL  1990    6.4
spending_transportation
   GeoFIPS      GeoName  Year  total_obligated_amount
0     1001  Autauga, AL  2011               1608527.0
1     1001  Autauga, AL  2012                134913.0
2     1001  Autauga, AL  2013                408157.0
3     1001  Autauga, AL  2014                218474.0
4     1001  Autauga, AL  2015                570771.0
transport
   GeoFIPS      GeoName     Category     Value
0     1001  Autauga, AL  roadDensity  1.981182
1     1003  Baldwin, AL  roadDensity  2.484343
2     1005  Barbour, AL  roadDensity  1.457892
3     1007     Bibb, AL  roadDensity  1.926318
4     1009   Blount, AL  roadDensity  2.808981
gdp
   GeoFIPS      GeoName  Year    Value
0     1001  Autauga, AL  2001   59.

In [44]:
# DATAGRABBER FROM DB
# Write an alternative datagrabber from the database. 
# There's a tradeoff between how much we store in the db, vs speed (we can compute wide from long, and standardize)
# Next step, testing it
class DataGrabber_FROM_DB:
    """Database-backed counterpart of ``DataGrabber``.

    Long-format tables are read from ``<repo_root>/data/us_counties.db`` and
    cached per feature; wide and standardized variants are derived from the
    cached long frames on request.

    Results are stored on the instance rather than returned:
    ``long``, ``wide``, ``std_long`` and ``std_wide`` are dicts keyed by
    feature name.
    """

    def __init__(self):
        self.repo_root = find_repo_root()
        self.database_path = os.path.join(self.repo_root, 'data', 'us_counties.db')
        # Per-feature caches, keyed by feature (table) name.
        self.wide = {}
        self.std_wide = {}
        self.long = {}
        self.std_long = {}

    def _fetch_data(self, table_name: str) -> pd.DataFrame:
        """Return the full contents of ``table_name`` as a DataFrame."""
        # Quote the identifier so unusual table names cannot break the query.
        quoted_table = '"' + str(table_name).replace('"', '""') + '"'

        conn = sqlite3.connect(self.database_path)
        try:
            return pd.read_sql_query(f"SELECT * FROM {quoted_table}", conn)
        finally:
            # Release the connection even if the query fails (e.g. missing table).
            conn.close()

    def _first_pull_long(self, feature) -> None:
        """Make sure the long frame for ``feature`` is cached."""
        if feature not in self.long:
            self.get_features_long([feature])

    def _convert_to_wide(self, df: pd.DataFrame, index_col: str, columns_col: str, values_col: str) -> pd.DataFrame:
        """Pivot a long frame to wide: one column per distinct ``columns_col`` value.

        ``index_col`` may be a single column name or a list of names; the index
        columns are restored as ordinary columns in the result.
        """
        df_wide = df.pivot(index=index_col, columns=columns_col, values=values_col).reset_index()
        # Drop the leftover axis label so the columns look like a plain table.
        df_wide.columns.name = None
        return df_wide

    def get_features_wide(self, features: List[str]) -> None:
        """Build ``self.wide[feature]`` for each feature from its long frame."""
        for feature in features:
            self._first_pull_long(feature)
            long_df = self.long[feature]
            # Extract the last two column names for the pivot operation
            columns_col = long_df.columns[-2]
            values_col = long_df.columns[-1]
            self.wide[feature] = self._convert_to_wide(long_df, ['GeoFIPS', 'GeoName'], columns_col, values_col)

    def get_features_std_wide(self, features: List[str]) -> None:
        """Build ``self.std_wide[feature]`` by applying ``standardize_and_scale`` to the wide frame."""
        for feature in features:
            if feature not in self.wide: # first pull wide
                self.get_features_wide([feature])
            # NOTE(review): assumes standardize_and_scale does not modify its
            # argument in place -- confirm, otherwise the cached wide frame changes.
            self.std_wide[feature] = standardize_and_scale(self.wide[feature])

    def get_features_long(self, features: List[str]) -> None:
        """Load ``self.long[feature]`` from the database for features not cached yet."""
        for feature in features:
            if feature not in self.long:
                self.long[feature] = self._fetch_data(feature)

    def get_features_std_long(self, features: List[str]) -> None:
        """Build ``self.std_long[feature]`` by applying ``standardize_and_scale`` to the long frame."""
        for feature in features:
            self._first_pull_long(feature)
            # NOTE(review): same in-place assumption as get_features_std_wide -- confirm.
            self.std_long[feature] = standardize_and_scale(self.long[feature])


In [78]:
# Test that the two DataGrabbers give the same data, and compare speed.
# Each grabber is timed over the same four pulls (long, wide, std_wide, std_long)
# for every available feature; construction of the grabber is included in the timing.
features = list_available_features()

# Original DataGrabber (CSV)
start_time_csv = time.time()
data_grabber_csv = DataGrabber()
data_grabber_csv.get_features_long(features)  
data_grabber_csv.get_features_wide(features)  
data_grabber_csv.get_features_std_wide(features)  
data_grabber_csv.get_features_std_long(features)  
end_time_csv = time.time()

# New DataGrabber (DB)
start_time_db = time.time()
data_grabber_db = DataGrabber_FROM_DB()
data_grabber_db.get_features_long(features)  
data_grabber_db.get_features_wide(features)
data_grabber_db.get_features_std_wide(features)
data_grabber_db.get_features_std_long(features)
end_time_db = time.time()

# Compare results for every feature (long and std_long frames only)
for feature in features:
      print("Feature:", feature)
      print("Are the results identical?:")
      print('long: ', data_grabber_csv.long[feature].equals(data_grabber_db.long[feature]))
      # in the existing dataset, the years are inconsistent types (Str vs int). convert all to str
      # (this rewrites the column labels of both grabbers' cached wide frames in place)
      data_grabber_db.wide[feature].columns = data_grabber_db.wide[feature].columns.map(str)
      data_grabber_csv.wide[feature].columns = data_grabber_csv.wide[feature].columns.map(str)
      # make the columns the same order
      data_grabber_db.wide[feature] = data_grabber_db.wide[feature][data_grabber_csv.wide[feature].columns]
      # NOTE(review): the aligned wide frames are never compared below, and the
      # std_wide alignment is still disabled:
      # data_grabber_db.std_wide[feature] = data_grabber_db.std_wide[feature][data_grabber_csv.std_wide[feature].columns]
      print('std_long: ', data_grabber_csv.std_long[feature].equals(data_grabber_db.std_long[feature]))
      # Show the first rows of both std_long frames so mismatches can be inspected by eye
      print(data_grabber_csv.std_long[feature].head())
      print(data_grabber_db.std_long[feature].head())
      print()

# Print performance (elapsed wall-clock seconds for each grabber)
print("CSV DataGrabber Time:", end_time_csv - start_time_csv)
print("DB DataGrabber Time:", end_time_db - start_time_db)

Feature: unemployment_rate
Are the results identical?:
long:  True
std_long:  False
   GeoFIPS             GeoName  Year     Value
0     1001  Autauga County, AL  1990  0.008721
1     1003  Baldwin County, AL  1990 -0.140351
2     1005  Barbour County, AL  1990  0.049419
3     1007     Bibb County, AL  1990  0.087209
4     1009   Blount County, AL  1990  0.005814
   GeoFIPS             GeoName  Year     Value
0     1001  Autauga County, AL  1990  0.014451
1     1003  Baldwin County, AL  1990 -0.109091
2     1005  Barbour County, AL  1990  0.054913
3     1007     Bibb County, AL  1990  0.092486
4     1009   Blount County, AL  1990  0.011561

Feature: spending_transportation
Are the results identical?:
long:  True
std_long:  False
   GeoFIPS      GeoName  Year  total_obligated_amount
0     1001  Autauga, AL  2011               -0.988778
1     1001  Autauga, AL  2012               -0.999059
2     1001  Autauga, AL  2013               -0.997152
3     1001  Autauga, AL  2014               -

In [81]:
print(data_grabber_csv.std_long['unemployment_rate'])
print(data_grabber_db.std_long['unemployment_rate'])

        GeoFIPS                GeoName  Year     Value
0          1001     Autauga County, AL  1990  0.008721
1          1003     Baldwin County, AL  1990 -0.140351
2          1005     Barbour County, AL  1990  0.049419
3          1007        Bibb County, AL  1990  0.087209
4          1009      Blount County, AL  1990  0.005814
...         ...                    ...   ...       ...
101437    56037  Sweetwater County, WY  2022  0.054054
101438    56039       Teton County, WY  2022 -0.275862
101439    56041       Uinta County, WY  2022  0.018018
101440    56043    Washakie County, WY  2022  0.036036
101441    56045      Weston County, WY  2022 -0.275862

[101442 rows x 4 columns]
        GeoFIPS                GeoName  Year     Value
0          1001     Autauga County, AL  1990  0.014451
1          1003     Baldwin County, AL  1990 -0.109091
2          1005     Barbour County, AL  1990  0.054913
3          1007        Bibb County, AL  1990  0.092486
4          1009      Blount County, AL