# Dataiku Exam
Done with programming

In [2]:
%matplotlib notebook

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact
from pathlib import Path

dataset_dir = Path('./dataset')


## Data
### Loading

We can load the dataframe directly into memory using pandas. We, could, hovewer load a portion of it like *Dataiku* does.

In [50]:
!ls 

[1m[36m__pycache__[m[m app.py      [1m[36mcheckpoints[m[m [1m[36mdataset[m[m     main.ipynb


In [11]:

c02_oil_df = pd.read_csv(dataset_dir / 'CO2_and_Oil.csv')
meat_egg_prod_df = pd.read_csv(dataset_dir / 'Meat_and_Egg_Production.csv')
urb_gdp_pop_df = pd.read_csv(dataset_dir / 'Urbanization_GDP_and_Population.csv')

### Merge

We can merge them all by Entity, Year and Code

In [12]:
c02_oil_df_meat_egg_prod_df = pd.merge(c02_oil_df, meat_egg_prod_df, how='left', 
                                       left_on=['Entity', 'Year', 'Code'],
                                       right_on=['Entity', 'Year', 'Code'])

In [13]:
c02_oil_df_meat_egg_prod_urb_gdp_pop_df = pd.merge(c02_oil_df_meat_egg_prod_df, urb_gdp_pop_df,  how='left', 
                                       left_on=['Entity', 'Year', 'Code'],
                                       right_on=['Entity', 'Year', 'Code'])

In [14]:
c02_oil_df_meat_egg_prod_urb_gdp_pop_df.to_csv(dataset_dir / 'merged.csv', index=False)

### Preparation

In [15]:
df = pd.read_csv(dataset_dir / 'merged.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41859 entries, 0 to 41858
Data columns (total 11 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Entity                                                        41859 non-null  object 
 1   Code                                                          41646 non-null  object 
 2   Year                                                          41859 non-null  int64  
 3   Per capita CO₂ emissions (tonnes per capita)                  41859 non-null  float64
 4   Oil production (Etemad & Luciana) (terawatt-hours)            6550 non-null   float64
 5   meat_prod_tonnes                                              8683 non-null   float64
 6   Meat_supply_kgperCap                                          7750 non-null   float64
 7   Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)  7646 

In [16]:
df.describe()

Unnamed: 0,Year,Per capita CO₂ emissions (tonnes per capita),Oil production (Etemad & Luciana) (terawatt-hours),meat_prod_tonnes,Meat_supply_kgperCap,Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes),Urban (%),GDP per capita (int.-$) ($),Population
count,41859.0,41859.0,6550.0,8683.0,7750.0,7646.0,12423.0,14301.0,41645.0
mean,1906.122578,1.322799,318.325651,1972130.0,37.712817,498416.2,45.174922,6968.483183,21773080.0
std,61.490126,4.52021,888.991999,14709580.0,29.605886,3655195.0,25.052487,10979.27688,209504700.0
min,1751.0,0.0,4.7e-05,0.0,2.51,0.0,1.1,134.0,905.0
25%,1853.0,0.0,1.410138,22775.0,13.5,4000.0,24.4195,1533.0,300482.0
50%,1906.0,0.0,18.810108,127235.0,27.915,23000.0,42.681,3066.0,1719628.0
75%,1959.0,0.377702,137.005995,494628.0,57.74,110000.0,65.326,7605.0,5758692.0
max,2012.0,252.645121,7325.22031,305671500.0,146.68,72033000.0,100.0,220717.0,7125828000.0


In [56]:
df.head(5)

Unnamed: 0,Entity,Code,Year,Per capita CO₂ emissions (tonnes per capita),Oil production (Etemad & Luciana) (terawatt-hours),meat_prod_tonnes,Meat_supply_kgperCap,Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes),Urban (%),GDP per capita (int.-$) ($),Population
0,Afghanistan,AFG,1800,0.0,,,,,,,3280000.0
1,Afghanistan,AFG,1801,0.0,,,,,,,3280000.0
2,Afghanistan,AFG,1802,0.0,,,,,,,3280000.0
3,Afghanistan,AFG,1803,0.0,,,,,,,3280000.0
4,Afghanistan,AFG,1804,0.0,,,,,,,3280000.0


#### Co2 and Oil

We need to first fix the columns for co2 and oil

Let's check the `Oil production` column and see if we can fix the `nan` values. Let's take some countries and see

In [57]:
codes = df.Code.unique()

def plot_oil_for_country(code):
    oil_per_code = df[df['Code'] == code].set_index('Year', drop=True)['Oil production (Etemad & Luciana) (terawatt-hours)']
    
    if(oil_per_code.isnull().all()):
        print('No values!')
    else:
        fig = plt.figure()
        oil_per_code.plot()
    
    
interact(plot_oil_for_country, code=codes)

interactive(children=(Dropdown(description='code', options=('AFG', 'ALB', 'DZA', 'AND', 'AGO', 'ATG', 'ARG', '…

<function __main__.plot_oil_for_country(code)>

In this case we can just fill the nan with zeros!

In [58]:
df['Oil production (Etemad & Luciana) (terawatt-hours)'].fillna(0, inplace=True)

We have some entry in `Code` without a name! Let's just take the first three letters of Entity as code

In [59]:
df[df.Code.isnull()]

Unnamed: 0,Entity,Code,Year,Per capita CO₂ emissions (tonnes per capita),Oil production (Etemad & Luciana) (terawatt-hours),meat_prod_tonnes,Meat_supply_kgperCap,Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes),Urban (%),GDP per capita (int.-$) ($),Population
23643,Micronesia,,1800,0.0,0.0,,,,,,
23644,Micronesia,,1801,0.0,0.0,,,,,,
23645,Micronesia,,1802,0.0,0.0,,,,,,
23646,Micronesia,,1803,0.0,0.0,,,,,,
23647,Micronesia,,1804,0.0,0.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
23851,Micronesia,,2008,0.0,0.0,,,,,,
23852,Micronesia,,2009,0.0,0.0,,,,,,
23853,Micronesia,,2010,0.0,0.0,,,,,,
23854,Micronesia,,2011,0.0,0.0,,,,,,


In [60]:
def create_code_from_entity(row):

    return row['Entity'][:3].upper()

df.loc[df.Code.isnull(),'Code'] = df[df.Code.isnull()].apply(create_code_from_entity, axis=1)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41859 entries, 0 to 41858
Data columns (total 11 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Entity                                                        41859 non-null  object 
 1   Code                                                          41859 non-null  object 
 2   Year                                                          41859 non-null  int64  
 3   Per capita CO₂ emissions (tonnes per capita)                  41859 non-null  float64
 4   Oil production (Etemad & Luciana) (terawatt-hours)            41859 non-null  float64
 5   meat_prod_tonnes                                              8683 non-null   float64
 6   Meat_supply_kgperCap                                          7750 non-null   float64
 7   Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)  7646 

Fixed!

### Meat and Eggs

In [62]:
df

Unnamed: 0,Entity,Code,Year,Per capita CO₂ emissions (tonnes per capita),Oil production (Etemad & Luciana) (terawatt-hours),meat_prod_tonnes,Meat_supply_kgperCap,Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes),Urban (%),GDP per capita (int.-$) ($),Population
0,Afghanistan,AFG,1800,0.000000,0.000000,,,,,,3280000.0
1,Afghanistan,AFG,1801,0.000000,0.000000,,,,,,3280000.0
2,Afghanistan,AFG,1802,0.000000,0.000000,,,,,,3280000.0
3,Afghanistan,AFG,1803,0.000000,0.000000,,,,,,3280000.0
4,Afghanistan,AFG,1804,0.000000,0.000000,,,,,,3280000.0
...,...,...,...,...,...,...,...,...,...,...,...
41854,Zimbabwe,ZWE,2008,0.569303,0.023583,245066.0,18.93,29000.0,33.560,1260.0,12380000.0
41855,Zimbabwe,ZWE,2009,0.399657,0.035375,243059.0,20.57,30000.0,33.378,1329.0,12527000.0
41856,Zimbabwe,ZWE,2010,0.545282,0.070749,255823.0,20.06,30000.0,33.196,1425.0,12698000.0
41857,Zimbabwe,ZWE,2011,0.656154,0.070749,263281.0,20.76,30000.0,33.015,1515.0,12894000.0


We can just replace the nan values with zero

In [63]:
df.loc[:, ['meat_prod_tonnes', 'Meat_supply_kgperCap', 'Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)']] = df.loc[:, ['meat_prod_tonnes', 'Meat_supply_kgperCap', 'Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)']] = df.loc[:, ['meat_prod_tonnes', 'Meat_supply_kgperCap', 'Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)']] = df.loc[:, ['meat_prod_tonnes', 'Meat_supply_kgperCap', 'Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)']].fillna(0)



In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41859 entries, 0 to 41858
Data columns (total 11 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Entity                                                        41859 non-null  object 
 1   Code                                                          41859 non-null  object 
 2   Year                                                          41859 non-null  int64  
 3   Per capita CO₂ emissions (tonnes per capita)                  41859 non-null  float64
 4   Oil production (Etemad & Luciana) (terawatt-hours)            41859 non-null  float64
 5   meat_prod_tonnes                                              41859 non-null  float64
 6   Meat_supply_kgperCap                                          41859 non-null  float64
 7   Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)  41859

### Gdp and population

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41859 entries, 0 to 41858
Data columns (total 11 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Entity                                                        41859 non-null  object 
 1   Code                                                          41859 non-null  object 
 2   Year                                                          41859 non-null  int64  
 3   Per capita CO₂ emissions (tonnes per capita)                  41859 non-null  float64
 4   Oil production (Etemad & Luciana) (terawatt-hours)            41859 non-null  float64
 5   meat_prod_tonnes                                              41859 non-null  float64
 6   Meat_supply_kgperCap                                          41859 non-null  float64
 7   Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)  41859

Interesting, we have almost always the population. We could predict the old gdp and urban by using the new data. For now we will just will values with nan

In [66]:
df = df.fillna(0)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41859 entries, 0 to 41858
Data columns (total 11 columns):
 #   Column                                                        Non-Null Count  Dtype  
---  ------                                                        --------------  -----  
 0   Entity                                                        41859 non-null  object 
 1   Code                                                          41859 non-null  object 
 2   Year                                                          41859 non-null  int64  
 3   Per capita CO₂ emissions (tonnes per capita)                  41859 non-null  float64
 4   Oil production (Etemad & Luciana) (terawatt-hours)            41859 non-null  float64
 5   meat_prod_tonnes                                              41859 non-null  float64
 6   Meat_supply_kgperCap                                          41859 non-null  float64
 7   Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)  41859

Convert to capita

In [67]:
df.loc[:, 'Oil production_perCap'] = df.loc[:, 'Oil production (Etemad & Luciana) (terawatt-hours)'] / df.Population

In [68]:
df.loc[:, 'meat_prod_tonnes_perCap'] = df.loc[:, 'meat_prod_tonnes'] / df.Population

In [69]:
df.loc[:, 'eggs_prod_perCap'] = df.loc[:, 'Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes)'] / df.Population


In [70]:
codes_idx = { code : i for i,code in enumerate(df.Code.unique())}

In [71]:
codes_idx

{'AFG': 0,
 'ALB': 1,
 'DZA': 2,
 'AND': 3,
 'AGO': 4,
 'ATG': 5,
 'ARG': 6,
 'ARM': 7,
 'AUS': 8,
 'AUT': 9,
 'AZE': 10,
 'BHS': 11,
 'BHR': 12,
 'BGD': 13,
 'BRB': 14,
 'BLR': 15,
 'BEL': 16,
 'BLZ': 17,
 'BEN': 18,
 'BTN': 19,
 'BOL': 20,
 'BIH': 21,
 'BWA': 22,
 'BRA': 23,
 'BRN': 24,
 'BGR': 25,
 'BFA': 26,
 'BDI': 27,
 'KHM': 28,
 'CMR': 29,
 'CAN': 30,
 'CPV': 31,
 'CAF': 32,
 'TCD': 33,
 'CHL': 34,
 'CHN': 35,
 'COL': 36,
 'COM': 37,
 'COG': 38,
 'CRI': 39,
 'CIV': 40,
 'HRV': 41,
 'CUB': 42,
 'CYP': 43,
 'CZE': 44,
 'COD': 45,
 'DNK': 46,
 'DJI': 47,
 'DMA': 48,
 'DOM': 49,
 'ECU': 50,
 'EGY': 51,
 'SLV': 52,
 'GNQ': 53,
 'ERI': 54,
 'EST': 55,
 'ETH': 56,
 'FJI': 57,
 'FIN': 58,
 'FRA': 59,
 'GAB': 60,
 'GMB': 61,
 'GEO': 62,
 'DEU': 63,
 'GHA': 64,
 'GRC': 65,
 'GRD': 66,
 'GTM': 67,
 'GIN': 68,
 'GNB': 69,
 'GUY': 70,
 'HTI': 71,
 'HND': 72,
 'HKG': 73,
 'HUN': 74,
 'ISL': 75,
 'IND': 76,
 'IDN': 77,
 'IRN': 78,
 'IRQ': 79,
 'IRL': 80,
 'ISR': 81,
 'ITA': 82,
 'JAM': 83,
 '

In [72]:
df['CodeId'] = df.Code.apply(lambda x: codes_idx[x])

In [73]:
df

Unnamed: 0,Entity,Code,Year,Per capita CO₂ emissions (tonnes per capita),Oil production (Etemad & Luciana) (terawatt-hours),meat_prod_tonnes,Meat_supply_kgperCap,Food Balance Sheets: Eggs - Production (FAO (2017)) (tonnes),Urban (%),GDP per capita (int.-$) ($),Population,Oil production_perCap,meat_prod_tonnes_perCap,eggs_prod_perCap,CodeId
0,Afghanistan,AFG,1800,0.000000,0.000000,0.0,0.00,0.0,0.000,0.0,3280000.0,0.000000e+00,0.000000,0.000000,0
1,Afghanistan,AFG,1801,0.000000,0.000000,0.0,0.00,0.0,0.000,0.0,3280000.0,0.000000e+00,0.000000,0.000000,0
2,Afghanistan,AFG,1802,0.000000,0.000000,0.0,0.00,0.0,0.000,0.0,3280000.0,0.000000e+00,0.000000,0.000000,0
3,Afghanistan,AFG,1803,0.000000,0.000000,0.0,0.00,0.0,0.000,0.0,3280000.0,0.000000e+00,0.000000,0.000000,0
4,Afghanistan,AFG,1804,0.000000,0.000000,0.0,0.00,0.0,0.000,0.0,3280000.0,0.000000e+00,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41854,Zimbabwe,ZWE,2008,0.569303,0.023583,245066.0,18.93,29000.0,33.560,1260.0,12380000.0,1.904932e-09,0.019795,0.002342,196
41855,Zimbabwe,ZWE,2009,0.399657,0.035375,243059.0,20.57,30000.0,33.378,1329.0,12527000.0,2.823867e-09,0.019403,0.002395,196
41856,Zimbabwe,ZWE,2010,0.545282,0.070749,255823.0,20.06,30000.0,33.196,1425.0,12698000.0,5.571678e-09,0.020147,0.002363,196
41857,Zimbabwe,ZWE,2011,0.656154,0.070749,263281.0,20.76,30000.0,33.015,1515.0,12894000.0,5.486984e-09,0.020419,0.002327,196


In [74]:
df = df[['Year', 'Per capita CO₂ emissions (tonnes per capita)', 
         'Oil production_perCap', 'meat_prod_tonnes_perCap', 'Urban (%)',
         'eggs_prod_perCap', 'CodeId', 'GDP per capita (int.-$) ($)'
        ]]

In [75]:
# we may have new nan if we divided 0 by 0
df = df.fillna(0)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41859 entries, 0 to 41858
Data columns (total 8 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Year                                          41859 non-null  int64  
 1   Per capita CO₂ emissions (tonnes per capita)  41859 non-null  float64
 2   Oil production_perCap                         41859 non-null  float64
 3   meat_prod_tonnes_perCap                       41859 non-null  float64
 4   Urban (%)                                     41859 non-null  float64
 5   eggs_prod_perCap                              41859 non-null  float64
 6   CodeId                                        41859 non-null  int64  
 7   GDP per capita (int.-$) ($)                   41859 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 2.6 MB


In [78]:
df.to_csv(dataset_dir / 'prepared.csv', index=False)

## Prediction
Let's predict the GDP

In [110]:
df = pd.read_csv(dataset_dir / 'prepared.csv')

df = df[df['GDP per capita (int.-$) ($)'] != 0]
df = df.reset_index(drop=True)
df.Year = df.Year - df.Year.min()

In [111]:
df.describe()

Unnamed: 0,Year,Per capita CO₂ emissions (tonnes per capita),Oil production_perCap,meat_prod_tonnes_perCap,Urban (%),eggs_prod_perCap,CodeId,GDP per capita (int.-$) ($)
count,14301.0,14301.0,14301.0,14301.0,14301.0,14301.0,14301.0,14301.0
mean,154.887351,3.051506,1.215438e-05,0.020306,31.845678,0.003016,97.239634,6968.483183
std,48.006048,5.315968,7.152424e-05,0.041179,29.678768,0.005321,56.626188,10979.27688
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,134.0
25%,131.0,0.113904,0.0,0.0,0.0,0.0,47.0,1533.0
50%,168.0,0.834149,0.0,0.004191,27.741,0.0,94.0,3066.0
75%,191.0,3.946702,4.408243e-07,0.022832,55.919,0.00393,144.0,7605.0
max,212.0,69.945693,0.00186518,0.402853,100.0,0.046128,196.0,220717.0


In [187]:
4.408243e-07 * 1000000

0.4408243

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14301 entries, 0 to 14300
Data columns (total 8 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Year                                          14301 non-null  int64  
 1   Per capita CO₂ emissions (tonnes per capita)  14301 non-null  float64
 2   Oil production_perCap                         14301 non-null  float64
 3   meat_prod_tonnes_perCap                       14301 non-null  float64
 4   Urban (%)                                     14301 non-null  float64
 5   eggs_prod_perCap                              14301 non-null  float64
 6   CodeId                                        14301 non-null  int64  
 7   GDP per capita (int.-$) ($)                   14301 non-null  float64
dtypes: float64(6), int64(2)
memory usage: 893.9 KB


In [113]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump, load
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [114]:
checkpoints = Path('./checkpoints')
checkpoints.mkdir(exist_ok = True)

In [115]:
df

Unnamed: 0,Year,Per capita CO₂ emissions (tonnes per capita),Oil production_perCap,meat_prod_tonnes_perCap,Urban (%),eggs_prod_perCap,CodeId,GDP per capita (int.-$) ($)
0,150,0.010871,0.000000e+00,0.000000,6.000,0.000000,0,2392.0
1,151,0.011684,0.000000e+00,0.000000,6.208,0.000000,0,2422.0
2,152,0.011544,0.000000e+00,0.000000,6.422,0.000000,0,2462.0
3,153,0.013218,0.000000e+00,0.000000,6.643,0.000000,0,2568.0
4,154,0.013037,0.000000e+00,0.000000,6.872,0.000000,0,2576.0
...,...,...,...,...,...,...,...,...
14296,208,0.569303,1.904932e-09,0.019795,33.560,0.002342,196,1260.0
14297,209,0.399657,2.823867e-09,0.019403,33.378,0.002395,196,1329.0
14298,210,0.545282,5.571678e-09,0.020147,33.196,0.002363,196,1425.0
14299,211,0.656154,5.486984e-09,0.020419,33.015,0.002327,196,1515.0


**Review** we can remove the CodeId

Train test split

In [158]:
df_train = df.drop(['CodeId'], axis=1)

X = df_train[df_train.columns[:-1]]
Y = df_train[df_train.columns[-1]]

scale the data and store the scaler

In [183]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
dump(scaler, checkpoints / f"scaler.joblib")

['checkpoints/scaler.joblib']

In [176]:
scaled_features_df = pd.DataFrame(X_scaled, index=df.index, columns=df.columns[:-2])

In [177]:
scaled_features_df

Unnamed: 0,Year,Per capita CO₂ emissions (tonnes per capita),Oil production_perCap,meat_prod_tonnes_perCap,Urban (%),eggs_prod_perCap
0,-0.101811,-0.572001,-0.169940,-0.493138,-0.870878,-0.566867
1,-0.080979,-0.571848,-0.169940,-0.493138,-0.863869,-0.566867
2,-0.060148,-0.571875,-0.169940,-0.493138,-0.856658,-0.566867
3,-0.039316,-0.571560,-0.169940,-0.493138,-0.849212,-0.566867
4,-0.018485,-0.571594,-0.169940,-0.493138,-0.841496,-0.566867
...,...,...,...,...,...,...
14296,1.106413,-0.466950,-0.169913,-0.012411,0.057765,-0.126580
14297,1.127244,-0.498863,-0.169900,-0.021943,0.051632,-0.116742
14298,1.148076,-0.471469,-0.169862,-0.003878,0.045499,-0.122804
14299,1.168907,-0.450611,-0.169863,0.002732,0.039401,-0.129554


divide the data in train and test

In [164]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2)

### Linear Regression

In [165]:
def run_model(model, seed=0, random_grid={}):
    clf = RandomizedSearchCV(model, param_distributions = random_grid, 
                             n_iter = 100, cv = 3, 
                             verbose=2, random_state=42, n_jobs = -1)
    
    clf.fit(X_train, Y_train)
    
    res_df = pd.DataFrame(clf.cv_results_)
    best_model = clf.best_estimator_
    Y_pred = best_model.predict(X_test)
    print(f"R^2 is = {best_model.score(X_test, Y_test):.4f}")
    print(f"MSE = {mean_squared_error(Y_pred, Y_test):.4f}")
    save_path = checkpoints / f"{model.__class__.__name__}-best.joblib"
    
    dump(clf.best_estimator_, save_path)
    return res_df
    


In [166]:
model = LinearRegression()

run_model(model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
R^2 is = 0.7097
MSE = 33739947.2525


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001768,5.9e-05,0.000519,7.5e-05,{},0.71725,0.719835,0.701904,0.712996,0.007914,1


In [167]:
model = RandomForestRegressor()
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 80, stop = 100, num = 50)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 50)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
run_model(model, random_grid)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.5s finished


R^2 is = 0.9664
MSE = 3909955.6683


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2.081588,0.025411,0.054625,0.000584,{},0.955469,0.951338,0.946827,0.951211,0.003529,1


In [168]:
!ls checkpoints/

 LinearRegression-best.joblib	   RandomForestRegressor-best.joblib
'LinearRegression-seed=0.joblib'  'RandomForestRegressor-seed=0.joblib'


In [169]:
codes = df.CodeId.unique()

def plot_pred_for_code(code, model):
    model = load(checkpoints / f"{model}-best.joblib")
    X_code = X[df['CodeId'] == code]
    X_code = X_code.sort_values('Year')
    
    Y_pred = model.predict(scaler.transform(X_code))
    Y_code = Y.iloc[X_code.index]
    
#     X_code_test = X_test[df['CodeId'] == code].sort_values('Year')
    fig = plt.figure()
    plt.plot(X_code.Year, Y_code, label='true')
    plt.plot(X_code.Year, Y_pred, label='pred')
    
#     plt.scatter(X_code_test.Year, Y_code[X_code_test.index], label='test')
    plt.legend()
    
interact(plot_pred_for_code, code=codes, model=['RandomForestRegressor', 'LinearRegression'], seed=[0])

interactive(children=(Dropdown(description='code', options=(0, 1, 2, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18…

<function __main__.plot_pred_for_code(code, model)>

In [170]:
model = load(checkpoints / f"RandomForestRegressor-best.joblib")


def predict_gdp(year: int, co2: float, oil: float, meat: float, urban: float, eggs: float):
    x = np.array([[year, co2, oil, meat, urban, eggs]])
    x = scaler.transform(x)
    
    pred = model.predict(x)
    
    print(pred)
    


In [181]:
predict_gdp(2000, co2=0.1, oil=0.001, meat=0, urban=20, eggs=0.01)

[4374.84]
