# Import

In [980]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.optimize import curve_fit

In [981]:
raw_data = pd.read_csv('HATCH_dataset/HATCH_v1.5_Clean.csv')
year_columns = [col for col in raw_data.columns if col.isdigit()]
data = pd.melt(raw_data, id_vars=[col for col in raw_data.columns if col not in year_columns], 
                   value_vars=year_columns, var_name='Year', value_name='Value')
data['Year'] = data['Year'].astype(int)

# grouped_metrics = data.groupby('Region')['Country Name'].unique()
# for region, country in grouped_metrics.items():
#     print(f"{region}: {list(country)}")

len(data['Technology Name'].unique())

202

In [982]:
oecd_countries = [  # as per https://www.oecd.org/en/about/members-partners.html 
    'AUS',  # Australia
    'AUT',  # Austria
    'BEL',  # Belgium
    'CAN',  # Canada
    'CHL',  # Chile
    'COL',  # Colombia
    'CRI',  # Costa Rica
    'CZE',  # Czech Republic / Czechia
    'DNK',  # Denmark
    'EST',  # Estonia
    'FIN',  # Finland
    'FRA',  # France
    'DEU',  # Germany
    'GRC',  # Greece
    'HUN',  # Hungary
    'ISL',  # Iceland
    'IRL',  # Ireland
    'ISR',  # Israel
    'ITA',  # Italy
    'JPN',  # Japan
    'KOR',  # Korea
    'LVA',  # Latvia
    'LTU',  # Lithuania
    'LUX',  # Luxembourg
    'MEX',  # Mexico
    'NLD',  # Netherlands
    'NZL',  # New Zealand
    'NOR',  # Norway
    'POL',  # Poland
    'PRT',  # Portugal
    'SVK',  # Slovak Republic
    'SVN',  # Slovenia
    'ESP',  # Spain
    'SWE',  # Sweden
    'CHE',  # Switzerland
    'TUR',  # Turkey
    'GBR',  # United Kingdom
    'USA',  # United States
    'World'
]

data_oecd = data[data['Region'].isin(oecd_countries)]
len(data_oecd['Technology Name'].unique())

# [tech for tech in data_oecd['Technology Name'].unique() if 'Offshore'.lower() in tech.lower()]

202

In [983]:
null_years = data_oecd.groupby('Year').sum('Value')
null_years = null_years[null_years['Value'] == 0].index
data_oecd = data_oecd[~data_oecd['Year'].isin(null_years)]
data_oecd = data_oecd.replace(0, np.nan)

def gap_filter(group):
    group = group.sort_values('Year')
    i0 = group['Value'].first_valid_index()
    il = group['Value'].last_valid_index()

    if i0 is not None:
        group = group.loc[i0:il]
        is_nan = group['Value'].isnull()
        nan_gaps = (is_nan != is_nan.shift()).cumsum()[is_nan].value_counts()

        return nan_gaps.max() if not nan_gaps.empty else 0
    return None
# print(data_oecd.groupby('ID').apply(gap_filter).sort_values().iloc[-10:])

# data_filter = (
#     data_oecd.groupby('ID').filter(lambda x: x['Value'].notnull().sum() >= 20)
# )
data_filter = data_oecd.groupby('ID').filter(
    lambda group: (
        group['Value'].notnull().sum() >= 20 and    # series must have at least 20 entries
        (gap := gap_filter(group)) is not None and 
        gap <= 5                                    # series with gaps of more than 5 years are removed
    )
)
len(data_filter['Technology Name'].unique())

# print(data_filter.groupby('ID').apply(gap_filter).sort_values().iloc[-10:])
# data_filter[data_filter['ID'] == 'Passenger Cars_Total Number_PT'].plot(x='Year', y='Value')

118

In [984]:
data_filter['Metric'] = data_filter['Metric'].replace(to_replace=['Annual production', 'Cumulative total capacity', 'Net Total Capacity', 'Installed electricity capacity', 'Total Length'], 
                                        value=['Annual Production', 'Cumulative Total Capacity', 'Installed Capacity', 'Installed Capacity', 'Cumulative Length'])

grouped_metrics = data_filter.groupby('Metric')['Technology Name'].unique()
print('##### metric: techs #####')
for metric, techs in grouped_metrics.items():
    print(f"{metric}: {list(techs)}")

grouped_metrics = data_filter.groupby('Metric')['Unit'].unique()
print('\n##### metric: units #####')
for metric, units in grouped_metrics.items():
    print(f"{metric}: {list(units)}")

keep_metrics = ['Total Number',
                'Annual Production',
                'Cumulative Total Capacity',
                'Cumulative Length',            # TODO - need to reconsider
                'Installed Capacity']

data_filter.loc[(data_filter['Metric'] == 'Cumulative Total Capacity') & (data_filter['Unit'] == 'kilometers'), 'Metric'] = 'Cumulative Length'
data_filter.loc[(data_filter['Metric'] == 'Cumulative Total Capacity') & (data_filter['Unit'] == '-'), 'Metric'] = 'Total Number'
data_metrics = data_filter[data_filter['Metric'].isin(keep_metrics)]
len(data_metrics['Technology Name'].unique())

##### metric: techs #####
Annual Production: ['Beer Production', 'Gold', 'Zinc', 'Crude Oil', 'Silver', 'Sulphuric Acid', 'Primary Bauxite Production', 'Nickel Production', 'Cement', 'Rare Earth Mine Production', 'Primary Aluminum Production', 'Raw Steel Production', 'Graphite', 'Cadmium Refining', 'Iron Ore', 'Primary Copper', 'Cane Sugar', 'Cobalt', 'Lead', 'Sand and Gravel|Industrial', 'Sand and Gravel|Construction', 'Salt Production', 'Tin', 'Milk Production', 'Lithium Mine Production', 'Caustic Soda', 'Synthetic Filaments', 'Nitric Acid', 'Hydrochloric Acid', 'Aquaculture Production', 'Capture Fisheries', 'Potash Fertilizer', 'Phosphate Fertilizer', 'Nitrogen Fertilizer', 'Renewable Power', 'Hydroelectricity', 'Nuclear Energy', 'Oil Production', 'Natural Gas Production', 'Coal Production', 'Primary Magnesium', 'Liquefied Natural Gas', 'Electricity', 'All Biofuels', 'Polystyrene', 'Copper|Refining', 'Motor Gasoline', 'Copper|Mining', 'Polyvinylchloride', 'Pentaerythritol', 'Air-Sou

89

In [985]:
# data_metrics[(data_metrics['Technology Name'].isin(['Crude Oil', 'Oil Production', 'Oil Refining Capacity'])) & 
#           (data_metrics['Country Name'] == 'Mexico') & (data_metrics['Year'].between(2000, 2010))]

# grouped_metrics = data_metrics.groupby('Technology Name')['Metric'].unique()
# for techs, metrics in grouped_metrics.items():
#     # if len(metrics) > 1:
#         print(f"{techs} - {metrics}")

remove_ids = ['Cellphones_Cumulative Total Capacity_World', 'Passenger Cars_Cumulative Total Capacity_World',   # repetitive or with data quality issues
              'Passenger Cars_Cumulative Total Capacity_US', 'Passenger Cars_Total Number_PT']

remove_techs = [    # nuclear weapons, space launches, oil production and nitrogen fertilizer techs were removed as per Edwards et al. 2023 (SI)
    'Aquaculture Production',
    'Beer Production',
    'Cane Sugar',
    'Capture Fisheries',
    'Caustic Soda',
    'Copper|Mining',
    'Copper|Refining'
    'Crude Oil',
    'Milk Production',
    'Nitrogen Fertilizer',
    'Nuclear Weapons',
    'Oil Production',
    'Postal Traffic',
    'Primary Bauxite Production',
    'Sand and Gravel|Industrial',
    'Space Launches'  
]

data_metrics = data_metrics[~data_metrics['ID'].isin(remove_ids)]
data_clean = data_metrics[~data_metrics['Technology Name'].isin(remove_techs)].dropna()
data_clean['Region'] = data_clean['Region'].replace({
    'AUS': 'Australia',
    'AUT': 'Austria',
    'BEL': 'Belgium',
    'CAN': 'Canada',
    'CHL': 'Chile',
    'COL': 'Colombia',
    'CRI': 'Costa Rica',
    'CZE': 'Czech Republic',
    'DNK': 'Denmark',
    'EST': 'Estonia',
    'FIN': 'Finland',
    'FRA': 'France',
    'DEU': 'Germany',
    'GRC': 'Greece',
    'HUN': 'Hungary',
    'ISL': 'Iceland',
    'IRL': 'Ireland',
    'ISR': 'Israel',
    'ITA': 'Italy',
    'JPN': 'Japan',
    'KOR': 'Korea',
    'LVA': 'Latvia',
    'LTU': 'Lithuania',
    'LUX': 'Luxembourg',
    'MEX': 'Mexico',
    'NLD': 'Netherlands',
    'NZL': 'New Zealand',
    'NOR': 'Norway',
    'POL': 'Poland',
    'PRT': 'Portugal',
    'SVK': 'Slovak Republic',
    'SVN': 'Slovenia',
    'ESP': 'Spain',
    'SWE': 'Sweden',
    'CHE': 'Switzerland',
    'TUR': 'Turkey',
    'GBR': 'United Kingdom',
    'USA': 'United States'
})
data_clean = data_clean.drop(['Country Name', 'Spatial Scale'], axis=1)
data_clean = data_clean.rename(columns={'ID': 'id', 'Region':'country', 'Technology Name': 'tech', 
                                        'Metric': 'metric', 'Unit': 'unit', 'Data Source': 'source',
                                        'Variable': 'variable', 'Year': 'year', 'Value': 'value'})

# data_clean.to_csv('hatch_clean.csv', index=False)

len(data_clean['tech'].unique())

75

In [None]:
def get_growth_function(function):
    if function == 'logistic':
        return lambda x, L, k, x0: L / (1 + np.exp(-k * (x - x0)))
    elif function == 'gompertz':
        return lambda x, L, k, x0: L * np.exp(-np.exp(-k * (x - x0)))
    elif function == 'softplus':
        return lambda x, L, k, x0: L / k * np.log(1 + np.exp(k * (x - x0)))     # from https://doi.org/10.1038/s43247-023-01056-1 - relatively untested
    
def growth_rate_fitting(data=data_clean, error_metric='rmse', country=None, tech=None, sort_by='rate'):
    if error_metric not in ['rmse', 'mae']:
        raise ValueError(f"Unsupported error metric: {error_metric}")
    if sort_by not in ['rate', 'r2']:
        raise ValueError(f"Unsupported sorting param: {sort_by}")
    
    formulations = ['logistic', 'gompertz', 'softplus']
    growth_rates = data[['tech', 'country', 'metric', 'unit']].drop_duplicates().reset_index(drop=True)
    growth_rates = growth_rates.assign(rate=np.nan, asymptote=np.nan, inflection=np.nan, 
                                       r2=np.nan, rmse=np.nan, mae=np.nan, formulation=pd.Series(dtype='string'))

    if country is not None:
        growth_rates = growth_rates[growth_rates['country'] == country]

    if tech is not None:
        growth_rates = growth_rates[growth_rates['tech'] == tech]

    for i, row in growth_rates.iterrows():
        tech = row['tech']
        country = row['country']
        hist_growth = data[(data['tech'] == tech) & (data['country'] == country)]
        x = hist_growth['year']
        y = hist_growth['value']

        best_metric, best_params, best_formulation, best_r2 = float('inf'), None, None, None
        for func in formulations:
            f = get_growth_function(func)
            L0 = y.max()                                # initial guesses
            k0 = 1.0
            x0 = np.median(x) if func in ['logistic', 'softplus'] else x.min()

            try:
                popt, _ = curve_fit(f, x.values, y.values, [L0, k0, x0], 
                                    # maxfev=17000      # does not converge in less iterations for some techs
                                    maxfev=10000
                                    )
                
                y_pred = f(x.values, *popt)             # error metrics
                ss_res = np.sum((y.values - y_pred) ** 2)
                ss_tot = np.sum((y.values - y.values.mean()) ** 2)
                r2 = 1 - (ss_res / ss_tot)
                rmse = np.sqrt(ss_res / len(y))         # len(y), not len(y) - 1
                mae = np.mean(np.abs(y.values - y_pred))

                current_metric = rmse if error_metric == 'rmse' else mae        # check for formulation with the lowest error_metric
                if current_metric < best_metric:
                    best_metric, best_params, best_formulation, best_r2 = current_metric, popt, func, r2

            except RuntimeError:
                print(f"Optimal {func} params not found in {tech} from {country}, removed row")    
                continue        # if nan, will be removed

        # Update the DataFrame with the best formulation's results
        if best_params is not None:
            growth_rates.at[i, 'rate'] = best_params[1]                 # k, target parameter
            growth_rates.at[i, 'asymptote'] = best_params[0]            # L, for demonstration
            growth_rates.at[i, 'inflection'] = best_params[2]           # x0, for demonstration
            growth_rates.at[i, 'r2'] = best_r2                          # Coefficient of determination
            growth_rates.at[i, 'rmse'] = rmse                           # Root mean squared error
            growth_rates.at[i, 'mae'] = mae                             # Mean absolute error
            growth_rates.at[i, 'formulation'] = best_formulation        # Formulation with the lowest error_metric

    growth_rates = growth_rates[growth_rates['rate'] >= 0]      

    if tech is not None or country is not None:
        max_rate_i = growth_rates.groupby('tech')['rate'].idxmax()      # max growth rate by country if no specific country
        growth_rates = growth_rates.loc[max_rate_i]
        return growth_rates.sort_values(sort_by, ascending=False).reset_index(drop=True)
    else:
        return growth_rates.sort_values(sort_by, ascending=False).reset_index(drop=True)

In [1049]:
def plot_growth_rates(data=data_clean, error_metric='rmse', tech=None, country=None, sort_by='rate'):
    if tech is None and country is None:
        print("Too many plots to show. Define either a specific technology or a country.")
        return
    
    rates = growth_rate_fitting(data=data, error_metric=error_metric, tech=tech, country=country, sort_by=sort_by)

    # Determine grid size for subplots
    rows = len(rates)
    cols = 4 if len(rates) >= 4 else len(rates)
    rows = -(-rows // cols)  # Ceiling division for rows

    fig = make_subplots(rows=rows, cols=cols, 
                        vertical_spacing=0.026, 
                        horizontal_spacing=0.05,
                        subplot_titles=[f"<b>{row['tech']}</b> in {row['country']}<br>({row['formulation']}'s growth rate={row['rate']:.3f})" for _, row in rates.iterrows()])

    annotations = []
    for i, row in rates.iterrows():
        tech, country, formulation = row['tech'], row['country'], row['formulation']
        f = get_growth_function(formulation)
        
        yaxis_title = f"{row['metric']} [{row['unit']}]"
        annotation_text = f"R²: {row['r2']:.3f}"

        L, k, x0 = row['asymptote'], row['rate'], row['inflection']
        hist = data[(data['tech'] == tech) & (data['country'] == country)]
        y, x = hist['value'].values, hist['year'].values        
        y_fit = f(x, L, k, x0)

        # Add traces to subplot
        row_i = i // cols + 1
        col_i = i % cols + 1
        fig.add_trace(go.Scatter(x=x, y=y, mode='markers', name=f"{tech} - {country}",
                                 marker=dict(size=4, opacity=0.8, color='black')),
                      row=row_i, col=col_i)
        fig.add_trace(go.Scatter(x=x, y=y_fit, mode='lines', name=f"{tech} - {country} Curve Fit",
                                 line=dict(width=2.5, color='red')),
                      row=row_i, col=col_i)
        fig.update_yaxes(title_text=yaxis_title, row=row_i, col=col_i, title_standoff=5)
        annotations.append(
            dict(
                xref=f"x{(i+1)}",  # Use subplot-specific axis
                yref=f"y{(i+1)}",
                x=x.min(),  # Place at the minimum x-value
                y=y.max(),  # Place near the maximum y-value
                text=annotation_text,
                showarrow=False,
                xanchor="left",
                yanchor="top"
            )
        )
    existing_annotations = list(fig['layout']['annotations'])
    fig.update_layout(
        annotations=existing_annotations + annotations
    )
    fig.update_layout(
        title='Historical growth rates',
        font=dict(size=11),
        showlegend=False,
        height=350 * rows,
        width=400 * cols,
        template='plotly_white',
        annotations=[
        dict(font=dict(size=13)) for annotation in fig['layout']['annotations']
        ]
    )
    fig.show()

In [1050]:
plot_growth_rates(
    # data=data_clean[data_clean['metric'] != 'Annual Production'],
    tech='Solar Photovoltaic', 
    # country='World',
    # sort_by='r2'
    )


Covariance of the parameters could not be estimated



Optimal gompertz params not found in Solar Photovoltaic from Spain, removed row
Optimal gompertz params not found in Solar Photovoltaic from Norway, removed row
Optimal gompertz params not found in Solar Photovoltaic from Portugal, removed row
Optimal gompertz params not found in Solar Photovoltaic from Costa Rica, removed row


In [1043]:
k_rates = growth_rate_fitting(
    data=data_clean[data_clean['country'] != 'World'],
    # tech='Passenger Cars',
    # country='United States',
    # sort_by='r2'
    )

Optimal gompertz params not found in Steamships from Austria, removed row
Optimal gompertz params not found in Steamships from Italy, removed row



overflow encountered in exp


Covariance of the parameters could not be estimated


overflow encountered in exp



Optimal gompertz params not found in Silver from Australia, removed row
Optimal gompertz params not found in Steamships from Mexico, removed row
Optimal gompertz params not found in Telephones from France, removed row
Optimal gompertz params not found in Telephones from Denmark, removed row
Optimal gompertz params not found in Telephones from Norway, removed row
Optimal gompertz params not found in Telephones from Hungary, removed row
Optimal gompertz params not found in Zinc from Mexico, removed row
Optimal gompertz params not found in Railroad from Korea, removed row
Optimal gompertz params not found in Silver from Korea, removed row
Optimal gompertz params not found in Radio from Austria, removed row
Optimal logistic params not found in Crude Oil from Israel, removed row
Optimal gompertz params not found in Television from Mexico, removed row
Optimal gompertz params not found in Hydroelectricity from Czech Republic, removed row
Optimal gompertz params not found in Renewable Power fr


divide by zero encountered in scalar divide



Optimal gompertz params not found in Biogas from Turkey, removed row
Optimal gompertz params not found in Solar Photovoltaic from Norway, removed row
Optimal gompertz params not found in Solar Photovoltaic from Portugal, removed row
Optimal gompertz params not found in Biogas from Greece, removed row
Optimal gompertz params not found in Solar Photovoltaic from Costa Rica, removed row
Optimal gompertz params not found in All Biofuels from Sweden, removed row
Optimal gompertz params not found in Oil Pipeline from United States, removed row
Optimal gompertz params not found in Air-Source Heat Pumps from United States, removed row


In [None]:
k_rates

Unnamed: 0,tech,country,metric,unit,rate,asymptote,inflection,r2,rmse,mae,formulation
0,Hydroelectricity,Israel,Annual Production,TWh,30.924812,2.441250e-02,1991.036256,0.697137,0.005994,0.004601,logistic
1,Crude Oil,Israel,Annual Production,thousand metric tons,17.238686,9.855000e+02,1966.029093,0.041023,1746.890953,1185.865951,gompertz
2,Crop Harvester,Iceland,Total Number,-,16.862575,7.275000e+00,1960.902349,0.001791,4.643314,3.164636,gompertz
3,Salt Production,Slovenia,Annual Production,metric tons,5.501519,1.071854e+05,1999.547019,0.079255,167695.002129,75646.556950,logistic
4,Biogas,Austria,Installed Capacity,MW,4.490653,2.333201e+02,2004.426764,0.683139,58.686948,48.595129,logistic
...,...,...,...,...,...,...,...,...,...,...,...
654,Salt Production,Poland,Annual Production,metric tons,0.002847,5.076097e+08,3716.982958,0.008357,763701.311876,458718.336484,logistic
655,Hydroelectricity,Italy,Annual Production,TWh,0.001514,1.108712e+04,5667.882246,0.047352,4.770844,3.385238,logistic
656,Copper|Refining,Norway,Annual Production,metric tons,0.001462,3.655077e+06,5173.495168,0.004774,4500.176764,3900.000002,logistic
657,Salt Production,Colombia,Annual Production,metric tons,0.001284,3.489761e+07,5269.188281,0.001369,114142.533521,93183.364841,logistic


In [1045]:
k_rates[k_rates['r2'] > 0.85]

Unnamed: 0,tech,country,metric,unit,rate,asymptote,inflection,r2,rmse,mae,formulation
5,Solid Biofuels,Korea,Installed Capacity,MW,2.880267,1.848380e+03,2015.803219,0.994275,6.090139e+01,4.402762e+01,logistic
9,Nuclear Energy,Netherlands,Annual Production,TWh,2.140699,3.817618e+00,1973.337191,0.910473,3.359744e-01,2.473990e-01,logistic
10,Solar Photovoltaic,Czech Republic,Installed Capacity,MW,1.985505,2.070707e+03,2009.195776,0.999073,3.029841e+01,1.685521e+01,gompertz
13,Solar Photovoltaic,Greece,Installed Capacity,MW,1.455683,2.874756e+03,2011.865580,0.976334,1.349821e+03,1.297931e+03,logistic
14,Solid Biofuels,Estonia,Installed Capacity,MW,1.234132,1.676717e+02,2008.974214,0.980300,1.067940e+01,8.320328e+00,gompertz
...,...,...,...,...,...,...,...,...,...,...,...
629,Electricity,Colombia,Annual Production,TWh,0.012392,4.483880e+02,2063.293853,0.983961,1.932710e+00,1.489316e+00,gompertz
636,Passenger Cars,United States,Total Number,-,0.009224,3.316030e+09,2110.515849,0.989313,6.484290e+06,4.808807e+06,gompertz
648,Radio,Finland,Total Number,-,0.005118,4.891644e+07,2421.716328,0.974391,3.591524e+02,2.222121e+02,gompertz
651,Television,Ireland,Total Number,-,0.004115,1.020657e+11,2581.918449,0.909346,1.560097e+05,9.259112e+04,gompertz


In [1051]:
plot_growth_rates(country='Korea', tech='Solid Biofuels')

In [1047]:
k_rates[k_rates['r2'] > 0.85]['rate'].quantile(q=[0, 0.25, 0.5, 0.75, 1])

0.00    0.003920
0.25    0.067671
0.50    0.111304
0.75    0.240873
1.00    2.880267
Name: rate, dtype: float64

In [990]:
cars = data_clean[data_clean['variable'] == 'Number of Units|Passenger Cars']
cars = cars.reset_index(drop=True).sort_values(['country', 'year'])

def interpolate_up_to_last_valid_year(group):
    # Find the last year with a non-null 'Value' in the group
    last_valid_year = group['year'][group['value'].last_valid_index()]
    # Interpolate only up to this last valid year
    group.loc[group['year'] <= last_valid_year, 'value'] = group['value'].interpolate()
    return group

# Apply interpolation function to each country's group
cars = cars.groupby('country', group_keys=False).apply(interpolate_up_to_last_valid_year)
cars.isna().sum()





id          0
country     0
tech        0
metric      0
unit        0
source      0
variable    0
year        0
value       0
dtype: int64

In [991]:
fig = px.line(cars, x='year', y='value', color='country', 
              template='plotly_white', log_y=True,
              title='Historical adoption of passenger vehicles by country')

# fig.update_layout(xaxis=dict(range=[1920, 2003]))
fig.show()