In [2]:
from pathlib import Path

from cities.utils.data_grabber import DataGrabber
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import pickle
import os
import sys


current_dir =  os.getcwd()

grandparent_dir = os.path.dirname(os.path.dirname(current_dir))
sys.path.insert(0, grandparent_dir)

In [21]:
data = DataGrabber()
data.get_features_wide(["gdp", "population"])
gdp = data.wide
gdp = gdp.get("gdp")

population = data.wide
population = population.get("population")

In [30]:
def standardize_and_scale(data: pd.DataFrame, variable_name: str) -> pd.DataFrame:
    """
    Standardizes and scales float columns in a DataFrame to [-1,1], copying other columns.
    Returns a new DataFrame and updates the global transformation_info_dict.
    """
    transformation_info_dict_path = f"{grandparent_dir}/data/raw/std_info.pkl"
    if os.path.exists(transformation_info_dict_path):
        with open(transformation_info_dict_path, "rb") as file:
            transformation_info_dict = pickle.load(file)
    else:
        transformation_info_dict = {}
    
    standard_scaler = StandardScaler()  # Standardize to mean 0, std 1

    def sigmoid(x, scale=1 / 3):
        return 2 / (1 + np.exp(-x * scale)) - 1


    new_data = data.copy()


    float_cols = data.select_dtypes(include=["float64"])


    transformation_info = {}

    for column in float_cols.columns:
        # Get mean and std for each column
        mean_value = float_cols[column].mean()
        std_value = float_cols[column].std()

        # Save information in the dictionary
        transformation_info[column] = {"mean": mean_value, "std": std_value}

        # Standardize float columns to mean 0, std 1
        standardized_values = standard_scaler.fit_transform(float_cols[[column]])

        # Apply sigmoid transformation, [-3std, 3std] to [-1, 1]
        new_data[column] = sigmoid(standardized_values, scale=1 / 3)

    transformation_info_dict[variable_name] = transformation_info

    with open(f"{grandparent_dir}/data/raw/std_info.pkl", "wb") as file:
        pickle.dump(transformation_info_dict, file)


    return new_data


def revert_standardize_and_scale(data: pd.DataFrame, variable_name: str) -> pd.DataFrame:
    """
    Inverts the standardization and scaling operation using the provided transformation information from a file.
    Returns the original DataFrame.
    """
    def inverse_sigmoid(y, scale=1 / 3):
        return -np.log((2 / (y + 1)) - 1) / scale


    inverted_data = data.copy()


    file_path = f"{grandparent_dir}/data/raw/std_info.pkl"
    with open(file_path, "rb") as file:
        transformation_info_dict = pickle.load(file)
        

    transformation_info = transformation_info_dict.get(variable_name, {})

    for column, info in transformation_info.items():

        inverted_values = inverse_sigmoid(data[column])


        inverted_values = inverted_values * info["std"] + info["mean"]

        inverted_data[column] = inverted_values

    return inverted_data

In [31]:
gdp_stand = standardize_and_scale(gdp, "gdp")

In [32]:
gdp_restored = revert_standardize_and_scale(gdp_stand, "gdp")

gdp_restored

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,1001,"Autauga, AL",59.834763,61.991753,63.503627,73.726927,75.303761,80.456395,81.833542,73.866173,...,86.676848,93.058381,93.153106,99.929887,104.089509,99.796658,100.852571,97.230688,96.112770,94.635037
1,1003,"Baldwin, AL",73.851043,77.271238,81.568565,90.522659,101.403006,104.554315,107.841773,102.635853,...,98.999852,104.651267,106.431266,110.434595,115.477361,118.499701,125.070510,131.434252,131.617546,144.299116
2,1005,"Barbour, AL",113.868552,111.856864,114.631944,124.478183,125.008846,122.615253,118.400491,110.697164,...,103.918652,113.336680,106.760319,103.701500,101.968164,100.507774,101.802725,102.051472,98.042084,99.390811
3,1007,"Bibb, AL",80.442115,81.525931,85.123144,89.316463,88.780953,89.595882,95.307734,94.744569,...,102.559431,99.536435,97.931883,94.592018,95.810162,96.876183,96.985942,104.618890,109.486946,107.877191
4,1009,"Blount, AL",92.105012,92.593731,95.469827,98.129897,100.918928,97.428156,96.719964,97.076949,...,91.936703,99.317400,101.583477,106.505956,98.392582,104.330396,109.559987,106.564206,100.420471,113.455099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3069,56037,"Sweetwater, WY",92.341050,81.092860,83.235837,84.313649,89.538076,103.242102,109.913110,112.522461,...,103.317555,99.505430,96.654675,93.913908,91.540467,89.445975,86.263197,88.191217,80.159175,74.905828
3070,56039,"Teton, WY",81.930357,84.885477,85.315175,86.357982,88.882970,96.508006,98.646277,97.627038,...,95.809333,102.193868,110.461921,112.837986,114.847259,119.416850,121.269892,123.477957,125.414537,155.528943
3071,56041,"Uinta, WY",99.908281,110.553652,107.055711,108.931654,112.810862,123.556406,126.920877,127.788945,...,106.182021,96.672969,94.342299,95.300134,94.678978,93.021556,89.427712,86.754984,77.506744,76.141029
3072,56043,"Washakie, WY",92.188026,91.281517,95.083764,95.411455,91.316366,102.153925,95.986845,106.531487,...,107.935306,101.334728,104.042877,106.055883,94.448941,89.126923,89.156668,92.312888,91.536026,91.999609


In [18]:
gdp

Unnamed: 0,GeoFIPS,GeoName,2001,2002,2003,2004,2005,2006,2007,2008,...,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,1001,"Autauga, AL",59.839,61.996,63.508,73.730,75.307,80.459,81.836,73.870,...,86.679,93.060,93.155,99.931,104.090,99.798,100.854,97.233,96.115,94.638
1,1003,"Baldwin, AL",73.853,77.273,81.570,90.523,101.402,104.553,107.840,102.635,...,99.000,104.651,106.431,110.434,115.476,118.498,125.068,131.431,131.614,144.294
2,1005,"Barbour, AL",113.864,111.853,114.628,124.473,125.004,122.611,118.397,110.695,...,103.918,113.335,106.760,103.702,101.969,100.509,101.804,102.053,98.044,99.393
3,1007,"Bibb, AL",80.443,81.527,85.124,89.317,88.782,89.597,95.308,94.745,...,102.559,99.537,97.933,94.594,95.812,96.878,96.988,104.620,109.487,107.878
4,1009,"Blount, AL",92.104,92.593,95.469,98.129,100.918,97.428,96.720,97.077,...,91.938,99.318,101.584,106.506,98.394,104.331,109.560,106.565,100.422,113.455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3069,56037,"Sweetwater, WY",92.340,81.094,83.237,84.315,89.539,103.241,109.911,112.520,...,103.317,99.506,96.656,93.916,91.543,89.449,86.267,88.195,80.164,74.912
3070,56039,"Teton, WY",81.931,84.886,85.316,86.359,88.884,96.508,98.646,97.627,...,95.810,102.194,110.461,112.837,114.846,119.415,121.268,123.476,125.412,155.522
3071,56041,"Uinta, WY",99.906,110.550,107.053,108.929,112.808,123.552,126.916,127.784,...,106.181,96.674,94.344,95.302,94.681,93.024,89.431,86.759,77.512,76.147
3072,56043,"Washakie, WY",92.187,91.281,95.083,95.411,91.317,102.153,95.987,106.530,...,107.934,101.335,104.043,106.056,94.451,89.130,89.160,92.316,91.539,92.003


In [20]:
# testing for differences

tolerance = 0.5

float_cols_gdp = gdp.select_dtypes(include=["float64"])
float_cols_restored = gdp_restored.select_dtypes(include=["float64"])

differences = (float_cols_gdp - float_cols_restored).abs() > tolerance

assert not differences.any().any(), "Differences found!"


In [22]:
population_stand = standardize_and_scale(population, "population")
population_restored = revert_standardize_and_scale(population_stand, "population")


# the differences for the population are much larger than for the gdp

In [27]:
population_restored

Unnamed: 0,GeoFIPS,GeoName,1993,1994,1995,1996,1997,1998,1999,2000,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,1001,"Autauga, AL",36945.470240,38178.503688,39104.489799,40199.505352,41230.503131,42098.476499,42955.449805,44013.459498,...,55550.658214,55586.540125,56026.479313,56270.385547,56946.362797,57363.312784,57720.270088,58237.265716,58869.301829,59087.316036
1,1003,"Baldwin, AL",111420.584939,116570.255498,120901.795583,125418.367713,130170.970877,134451.499358,137562.839376,141350.293062,...,190955.684245,196082.394270,200775.025234,204941.569555,210021.263147,215383.017495,221150.854974,227098.734031,233160.653411,239315.633371
2,1005,"Barbour, AL",27361.911304,27741.805974,27844.658187,28288.567827,28405.416904,28831.318360,28856.156304,29005.018108,...,27308.064055,27145.913753,27030.762645,26620.562477,26204.362065,25595.145128,25367.007256,25191.890299,25166.819522,24950.763120
3,1007,"Bibb, AL",17746.347162,18042.228165,18496.137485,18874.036386,19376.948263,19839.855739,20548.804966,19901.537266,...,22669.309479,22531.163081,22573.037517,22593.907470,22640.782385,22592.656720,22369.519661,22391.434755,22209.338435,22463.358499
4,1009,"Blount, AL",41859.269557,42513.208805,44053.294810,45337.341112,47048.449524,48817.569479,50230.633243,51100.612349,...,57850.032248,58016.935472,58040.806980,58160.693039,58230.571696,58668.525100,58757.438802,58948.381391,59073.335019,59033.307251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3069,56037,"Sweetwater, WY",40058.976707,39952.792307,39841.609705,39249.350793,38720.094769,38556.900399,38133.665456,37543.407029,...,44899.925683,45000.818170,44753.645587,44482.468033,43969.251839,43267.019772,42744.834068,42423.693357,42147.581744,41603.471979
3070,56039,"Teton, WY",13721.692480,14308.620828,14895.551785,15482.484690,16170.426665,16871.372863,17660.335105,18369.288018,...,21599.135396,22274.121269,22724.062084,22988.971735,23143.864220,23256.764749,23120.641845,23228.570930,23333.521304,23561.537138
3071,56041,"Uinta, WY",19570.643916,19916.533054,19873.361515,19925.207378,19924.037257,20017.884699,19889.697751,19654.497080,...,21045.045264,21026.918389,20914.767770,20877.628287,20832.488234,20590.331006,20461.209241,20379.107414,20427.048514,20621.058817
3072,56043,"Washakie, WY",8439.833292,8471.671344,8498.511193,8458.342089,8518.181892,8572.022826,8411.830672,8238.640088,...,8370.983603,8365.858843,8215.702042,8208.567439,8096.416486,7920.269996,7784.147092,7712.046892,7641.968794,7688.955180


In [28]:
population

Unnamed: 0,GeoFIPS,GeoName,1993,1994,1995,1996,1997,1998,1999,2000,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,1001,"Autauga, AL",36953.0,38186.0,39112.0,40207.0,41238.0,42106.0,42963.0,44021.0,...,55558.0,55594.0,56034.0,56278.0,56954.0,57371.0,57728.0,58245.0,58877.0,59095.0
1,1003,"Baldwin, AL",111416.0,116565.0,120896.0,125412.0,130164.0,134444.0,137555.0,141342.0,...,190941.0,196067.0,200759.0,204925.0,210004.0,215365.0,221132.0,227079.0,233140.0,239294.0
2,1005,"Barbour, AL",27371.0,27751.0,27854.0,28298.0,28415.0,28841.0,28866.0,29015.0,...,27320.0,27158.0,27043.0,26633.0,26217.0,25608.0,25380.0,25205.0,25180.0,24964.0
3,1007,"Bibb, AL",17757.0,18053.0,18507.0,18885.0,19388.0,19851.0,20560.0,19913.0,...,22682.0,22544.0,22586.0,22607.0,22654.0,22606.0,22383.0,22405.0,22223.0,22477.0
4,1009,"Blount, AL",41866.0,42520.0,44060.0,45344.0,47055.0,48824.0,50237.0,51107.0,...,57857.0,58024.0,58048.0,58168.0,58238.0,58676.0,58765.0,58956.0,59081.0,59041.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3069,56037,"Sweetwater, WY",40066.0,39960.0,39849.0,39257.0,38728.0,38565.0,38142.0,37552.0,...,44909.0,45010.0,44763.0,44492.0,43979.0,43277.0,42755.0,42434.0,42158.0,41614.0
3070,56039,"Teton, WY",13733.0,14320.0,14907.0,15494.0,16182.0,16883.0,17672.0,18381.0,...,21612.0,22287.0,22737.0,23002.0,23157.0,23270.0,23134.0,23242.0,23347.0,23575.0
3071,56041,"Uinta, WY",19581.0,19927.0,19884.0,19936.0,19935.0,20029.0,19901.0,19666.0,...,21058.0,21040.0,20928.0,20891.0,20846.0,20604.0,20475.0,20393.0,20441.0,20635.0
3072,56043,"Washakie, WY",8452.0,8484.0,8511.0,8471.0,8531.0,8585.0,8425.0,8252.0,...,8386.0,8381.0,8231.0,8224.0,8112.0,7936.0,7800.0,7728.0,7658.0,7705.0


In [36]:
with open(f"{grandparent_dir}/data/raw/std_info.pkl", "rb") as file:
    loaded_data = pickle.load(file)

print(loaded_data.keys())
print(loaded_data)

dict_keys(['population', 'gdp'])
{'population': {'1993': {'mean': 83234.6701366298, 'std': 275725.1910719795}, '1994': {'mean': 84262.08197787899, 'std': 277644.2767566897}, '1995': {'mean': 85273.45022771633, 'std': 279510.03573437105}, '1996': {'mean': 86272.85230969421, 'std': 282014.73171763204}, '1997': {'mean': 87317.50195185427, 'std': 285132.8225671796}, '1998': {'mean': 88349.1977878985, 'std': 288672.8169758182}, '1999': {'mean': 89370.2745608328, 'std': 292335.234841532}, '2000': {'mean': 90368.69583604424, 'std': 295559.1292853688}, '2001': {'mean': 91263.97625243982, 'std': 298459.51732106647}, '2002': {'mean': 92098.6245933637, 'std': 300789.90069377265}, '2003': {'mean': 92890.91021470397, 'std': 302710.45906455605}, '2004': {'mean': 93749.98698763826, 'std': 304318.88879457273}, '2005': {'mean': 94613.13402732596, 'std': 305723.72453276993}, '2006': {'mean': 95645.20201691607, 'std': 307124.2710225948}, '2007': {'mean': 96541.34970722186, 'std': 308411.451972312}, '2008