# Import

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy.optimize import curve_fit

In [79]:
raw_data = pd.read_csv('HATCH_dataset/HATCH_v1.5_Clean.csv')
year_columns = [col for col in raw_data.columns if col.isdigit()]
data = pd.melt(raw_data, id_vars=[col for col in raw_data.columns if col not in year_columns], 
                   value_vars=year_columns, var_name='Year', value_name='Value')
data['Year'] = data['Year'].astype(int)
data.shape

(2398708, 11)

In [80]:
null_years = data.groupby('Year').sum('Value')
null_years = null_years[null_years['Value'] == 0].index
data = data[~data['Year'].isin(null_years)]

data = (
    data.groupby(['Country Name', 'Variable'])
    .filter(lambda x: x['Value'].notnull().sum() >= 20)
)
data.shape

(1354332, 11)

In [81]:
# data[~data['Value'].isna()].sample(n=10).to_clipboard()
[tech for tech in data['Technology Name'].unique() if 'Photovoltaic'.lower() in tech.lower()]

['Solar Photovoltaic']

In [None]:
data['Metric'] = data['Metric'].replace(to_replace='Annual production', value='Annual Production')

grouped_metrics = data.groupby('Metric')['Technology Name'].unique()
for metric, techs in grouped_metrics.items():
    print(f"{metric}: {list(techs)}")

keep_metrics = ['Total Number',
                'Annual Production',
                # 'Annual production',
                'Cumulative Total Capacity',
                'Installed electricity capacity']

data_metrics = data[data['Metric'].isin(keep_metrics)]
data_metrics.shape

Annual Production: ['Cane Sugar', 'Beer Production', 'Gold', 'Zinc', 'Crude Oil', 'Silver', 'Sulphuric Acid', 'Primary Bauxite Production', 'Nickel Production', 'Cement', 'Rare Earth Mine Production', 'Primary Aluminum Production', 'Raw Steel Production', 'Graphite', 'Cadmium Refining', 'Iron Ore', 'Primary Copper', 'Cobalt', 'Lead', 'Ammonia Synthesis', 'Sand and Gravel|Industrial', 'Sand and Gravel|Construction', 'Salt Production', 'Tin', 'Milk Production', 'Lithium Mine Production', 'Caustic Soda', 'Synthetic Filaments', 'Nitric Acid', 'Hydrochloric Acid', 'Aquaculture Production', 'Capture Fisheries', 'Potash Fertilizer', 'Phosphate Fertilizer', 'Nitrogen Fertilizer', 'Nuclear Energy', 'Renewable Power', 'Hydroelectricity', 'Oil Production', 'Natural Gas Production', 'Coal Production', 'Primary Magnesium', 'Electricity', 'Liquefied Natural Gas', 'All Biofuels', 'Polystyrene', 'Copper|Refining', 'Motor Gasoline', 'Copper|Mining', 'Polyvinylchloride', 'Pentaerythritol', 'Air-Source H

(927912, 11)

In [110]:
oecd_countries = [
    'Australia', 'Austria', 'Belgium', 
    'Canada', 'Chile', 'Colombia', 'Costa Rica' 'Czech Republic', 'Czechia', 
    'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 
    'Iceland', 'Ireland', 'Israel', 'Italy', 'Japan', 'Korea', 'Latvia', 'Lithuania', 'Luxembourg', 'Mexico', 
    'Netherlands', 'New Zealand', 'Norway', 'Poland', 'Portugal', 
    'Slovak Republic', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'United Kingdom', 'United States'
]

data_oecd = data_metrics[data_metrics['Country Name'].isin(oecd_countries)]
data_oecd.shape

(282348, 11)

In [None]:
# data_oecd[(data_oecd['Technology Name'].isin(['Crude Oil', 'Oil Production', 'Oil Refining Capacity'])) & 
#           (data_oecd['Country Name'] == 'Mexico') & (data_oecd['Year'].between(2000, 2010))]

grouped_metrics = data_oecd.groupby('Technology Name')['Metric'].unique()
for techs, metrics in grouped_metrics.items():
    print(f"{techs} - {metrics[0]}")

remove_techs = [
    'Aquaculture Production',
    'Beer Production',
    'Cane Sugar',
    'Capture Fisheries',
    'Caustic Soda',
    'Copper|Mining',
    'Copper|Refining'
    'Crude Oil',
    'Milk Production',
    'Primary Bauxite Production',
    'Sand and Gravel|Industrial'  
]

data_clean = data_oecd[~data_oecd['Technology Name'].isin(remove_techs)].dropna()
data_clean = data_clean.rename(columns={'Country Name': 'country', 'Technology Name': 'tech', 'Metric': 'metric', 
                                        'Variable': 'variable', 'Year': 'year', 'Value': 'value'})

data_clean.shape

Air-Source Heat Pumps - Annual Production
All Biofuels - Annual Production
Ammonia Synthesis - Annual Production
Aquaculture Production - Annual Production
Beer Production - Annual Production
Biogas - Installed electricity capacity
Cadmium Refining - Annual Production
Cane Sugar - Annual Production
Capture Fisheries - Annual Production
Caustic Soda - Annual Production
Cellphones - Total Number
Cement - Annual Production
Coal Production - Annual Production
Cobalt - Annual Production
Copper|Mining - Annual Production
Copper|Refining - Annual Production
Crop Harvester - Total Number
Crude Oil - Annual Production
Electricity - Annual Production
Geothermal Energy - Installed electricity capacity
Gold - Annual Production
Graphite - Annual Production
Hydrochloric Acid - Annual Production
Hydroelectricity - Annual Production
Iron Ore - Annual Production
Lead - Annual Production
Liquefied Natural Gas - Annual Production
Liquid Biofuels - Installed electricity capacity
Lithium Mine Production - 

(46786, 11)

In [114]:
data_clean[data_clean['ID'] == 'Radio_Total Number_US']

Unnamed: 0,ID,Spatial Scale,Region,Country Name,Technology Name,Metric,Unit,Data Source,Variable,Year,Value
846214,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1815,0.0
853572,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1816,0.0
860930,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1817,0.0
868288,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1818,0.0
875646,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1819,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2170654,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1995,559000.0
2178012,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1996,570000.0
2185370,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1997,575000.0
2192728,Radio_Total Number_US,National,USA,United States,Radio,Total Number,-,CHAT,Number of Units|Radios,1998,580000.0


In [None]:
# def growth_rate_fitting(function, data):
#     if function == 'logistic':
#         def f(x, L, k, x0):
#             return L / (1 + np.exp(-k * (x - x0)))
#     elif function == 'gompertz':
#         def f(x, L, k, x0):
#             return L * np.exp(-np.exp(-k * (x - x0)))
#     elif function == 'softplus':
#         def f(x, L, k, x0):
#             return L/k * np.log(1 + np.exp(k * (x - x0)))
        
growth_rates = data_clean[['tech']].drop_duplicates().reset_index(drop=True)
growth_rates['rate'] = np.nan

for _, row in growth_rates.iterrows():
    tech = row['tech']
    hist_growth = data_clean[data['tech'] == tech]
    x = hist_growth['year']
    y = hist_growth['value']
    
    # Initial guess
    L0 = y.max()
    k0 = 1.0
    if function == 'logistic':
        x0 = np.median(x)
    elif function == 'gompertz':
        x0 = x.min()
    elif function == 'softplus':
        x0 = np.median(x)   # TODO - need to confirm from literature
    p0 = [L0, k0, x0]

    popt, _ = curve_fit(function, x.values, y.values, p0, 
                        # maxfev = 5000     seems deprecated
                        )
    row['rate'] = popt[1]

growth_rates = growth_rates[growth_rates['rate'] >= 0]  # remove techs with negative k parameter


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57


In [None]:
cars = data[(data['Variable'] == 'Number of Units|Passenger Cars') & (data['Year'].between(1895, 2003)) & (data['Country Name'].isin(oecd_countries))]
cars = cars.reset_index(drop=True).sort_values(['Country Name', 'Year'])
# cars['Value'] = cars.groupby('Country Name', group_keys=False)['Value'].apply(lambda group: group.interpolate(method='linear'))

def interpolate_up_to_last_valid_year(group):
    # Find the last year with a non-null 'Value' in the group
    last_valid_year = group['Year'][group['Value'].last_valid_index()]
    # Interpolate only up to this last valid year
    group.loc[group['Year'] <= last_valid_year, 'Value'] = group['Value'].interpolate()
    return group

# Apply interpolation function to each country's group
cars = cars.groupby('Country Name', group_keys=False).apply(interpolate_up_to_last_valid_year)
cars.isna().sum()





ID                    0
Spatial Scale         0
Region                0
Country Name          0
Technology Name       0
Metric                0
Unit                  0
Data Source           0
Variable              0
Year                  0
Value              1590
dtype: int64

In [96]:
min_max_periods = (
    cars.groupby('Country Name')['Value']
    .agg(min_value='min', max_value='max', periods='count')
    .dropna()  # Drop countries without any non-null values
)

def first_last_years(group):
    # Find the first and last year with a non-null 'Value'
    first_year = group['Year'][group['Value'].first_valid_index()]
    last_year = group['Year'][group['Value'].last_valid_index()]
    return pd.Series({'first_year': first_year, 'last_year': last_year})

# Apply the function and merge with the min, max, and periods summary
first_last_years_df = cars.groupby('Country Name').apply(first_last_years)

# Combine the results into a single dataframe
min_max_periods_years = min_max_periods.join(first_last_years_df)
filtered_countries = min_max_periods_years[min_max_periods_years['periods'] > 70].index

def quinquennial_cagr(group):
    # Filter group to include only years from first to last valid data points
    group = group.dropna(subset=['Value'])
    first_year = group['Year'].iloc[0]
    last_year = group['Year'].iloc[-1]
    
    # Calculate CAGR if we have at least a five-year span
    if last_year - first_year >= 5:
        start_value = group['Value'].iloc[0]
        end_value = group['Value'].iloc[-1]
        periods = (last_year - first_year) / 5  # Quinquennial basis
        
        if start_value > 0 and end_value > 0:
            cagr = (end_value / start_value) ** (1 / periods) - 1
        else:
            cagr = np.nan  # If start or end values are zero, CAGR is undefined
    else:
        cagr = np.nan  # Not enough time span for quinquennial CAGR
    
    return pd.Series({'quinquennial_cagr': cagr})

# Calculate the quinquennial CAGR for each country
cqgr = cars.groupby('Country Name').apply(quinquennial_cagr)

def calculate_cagr(group):
    # Filter group to include only years from first to last valid data points
    group = group.dropna(subset=['Value'])
    first_year = group['Year'].iloc[0]
    last_year = group['Year'].iloc[-1]
    
    # Calculate CAGR if we have a valid span
    if last_year > first_year:
        start_value = group['Value'].iloc[0]
        end_value = group['Value'].iloc[-1]
        years = last_year - first_year  # Total number of years
        
        if start_value > 0 and end_value > 0:
            cagr = (end_value / start_value) ** (1 / years) - 1
        else:
            cagr = np.nan  # If start or end values are zero, CAGR is undefined
    else:
        cagr = np.nan  # Not enough time span for CAGR calculation
    
    return pd.Series({'cagr': cagr})

cagr = cars.groupby('Country Name').apply(calculate_cagr)
cars_summary = min_max_periods_years.join([cagr, cqgr])

cars_summary.sort_values('cagr')
# min_max_periods_years.sort_values('periods')









Unnamed: 0_level_0,min_value,max_value,periods,first_year,last_year,cagr,quinquennial_cagr
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Portugal,3532.0,3295000.0,68,1931,1998,-0.025049,-0.119124
Iceland,116242.0,157080.0,11,1990,2000,0.027443,0.144953
Luxembourg,183451.7,245952.0,9,1990,1998,0.037328,0.201103
Czech Republic,2366702.0,3490005.0,11,1990,2000,0.038149,0.205867
Slovenia,578241.2,847314.0,11,1990,2000,0.038947,0.210508
Slovak Republic,860019.6,1272276.0,11,1990,2000,0.039938,0.216288
New Zealand,108000.0,2229924.0,74,1927,2000,0.042346,0.230437
Belgium,9200.0,4752594.0,74,1928,2001,0.057013,0.319474
Australia,117000.0,9240000.0,75,1923,1997,0.06082,0.34341
Sweden,31900.0,3991050.0,78,1923,2000,0.062381,0.353326


In [115]:
cars_filtered = cars[cars['Country Name'].isin(filtered_countries)]


fig = px.line(cars_filtered, x='Year', y='Value', color='Country Name', 
              template='plotly_white', 
              title='Historical adoption of passenger vehicles by country')

# fig.update_layout(xaxis=dict(range=[1920, 2003]))
fig.show()

In [114]:
# data[data['Technology Name'] == 'Lithium-Ion Battery Storage'].groupby(['Country Name', 'Variable'])['Value'].agg(min_value='min', max_value='max', periods='count').dropna()
batteries = data[(data['Technology Name'] == 'Lithium-Ion Battery Storage') & (data['Unit'] == 'kW')].dropna()

fig = px.line(batteries, x='Year', y='Value', color='Country Name', 
              template='plotly_white', 
            #   title='Historical adoption of passenger vehicles by country'
              )

# fig.update_layout(xaxis=dict(range=[1920, 2003]))
fig.show()