### Setup our notebook

In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Loading data
co2_df = pd.read_csv("./Resources/GlobalCo2TempByCountry.csv")
co2_df

Unnamed: 0,country,year,co2,co2_growth_prct,co2_growth_abs,co2_per_capita,share_global_co2,cumulative_co2,share_global_cumulative_co2,oil_co2,oil_co2_per_capita,total_ghg,population,AverageTemperature
0,Afghanistan,1990,2.602,-5.877,-0.162,0.210,0.012,59.205,0.010,1.850,0.149,15.14,12412000.0,14.993333
1,Afghanistan,1991,2.426,-6.760,-0.176,0.182,0.011,61.632,0.010,1.718,0.129,15.06,13299000.0,14.370750
2,Afghanistan,1992,1.382,-43.065,-1.045,0.095,0.006,63.013,0.010,0.927,0.064,13.60,14486000.0,14.056083
3,Afghanistan,1993,1.334,-3.452,-0.048,0.084,0.006,64.347,0.010,0.894,0.057,13.43,15817000.0,14.439250
4,Afghanistan,1994,1.282,-3.852,-0.051,0.075,0.006,65.630,0.010,0.860,0.050,13.24,17076000.0,14.754750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5130,Zimbabwe,2012,7.695,-18.985,-1.803,0.587,0.022,647.512,0.050,3.693,0.282,67.63,13115000.0,21.521333
5131,Zimbabwe,2013,11.632,51.153,3.936,0.871,0.033,659.144,0.050,4.110,0.308,67.55,13350000.0,20.710750
5132,Zimbabwe,2014,11.962,2.838,0.330,0.880,0.034,671.105,0.049,3.726,0.274,66.10,13587000.0,
5133,Zimbabwe,2015,12.163,1.685,0.202,0.880,0.034,683.268,0.049,3.624,0.262,67.49,13815000.0,


### Preprocess the data for modeling

In [3]:
# drop NAs from the dataset
co2_df = co2_df.dropna()
co2_df

Unnamed: 0,country,year,co2,co2_growth_prct,co2_growth_abs,co2_per_capita,share_global_co2,cumulative_co2,share_global_cumulative_co2,oil_co2,oil_co2_per_capita,total_ghg,population,AverageTemperature
0,Afghanistan,1990,2.602,-5.877,-0.162,0.210,0.012,59.205,0.010,1.850,0.149,15.14,12412000.0,14.993333
1,Afghanistan,1991,2.426,-6.760,-0.176,0.182,0.011,61.632,0.010,1.718,0.129,15.06,13299000.0,14.370750
2,Afghanistan,1992,1.382,-43.065,-1.045,0.095,0.006,63.013,0.010,0.927,0.064,13.60,14486000.0,14.056083
3,Afghanistan,1993,1.334,-3.452,-0.048,0.084,0.006,64.347,0.010,0.894,0.057,13.43,15817000.0,14.439250
4,Afghanistan,1994,1.282,-3.852,-0.051,0.075,0.006,65.630,0.010,0.860,0.050,13.24,17076000.0,14.754750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5127,Zimbabwe,2009,5.519,-28.494,-2.199,0.441,0.018,622.612,0.053,1.668,0.133,63.54,12527000.0,21.377250
5128,Zimbabwe,2010,7.707,39.628,2.187,0.607,0.023,630.318,0.052,1.762,0.139,66.54,12698000.0,21.986250
5129,Zimbabwe,2011,9.498,23.250,1.792,0.737,0.028,639.817,0.051,3.261,0.253,67.42,12894000.0,21.602417
5130,Zimbabwe,2012,7.695,-18.985,-1.803,0.587,0.022,647.512,0.050,3.693,0.282,67.63,13115000.0,21.521333


In [4]:
# encode categorical data
from sklearn.preprocessing import LabelEncoder

#After importing the module, an instance of the label encoder object is created and assigned the variable le. 
le = LabelEncoder()

# The label encoder's fit_transform() method is used to first train the label encoder, then convert the text data into numerical data.
co2_df['country'] = le.fit_transform(co2_df['country']) 
co2_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,country,year,co2,co2_growth_prct,co2_growth_abs,co2_per_capita,share_global_co2,cumulative_co2,share_global_cumulative_co2,oil_co2,oil_co2_per_capita,total_ghg,population,AverageTemperature
0,0,1990,2.602,-5.877,-0.162,0.210,0.012,59.205,0.010,1.850,0.149,15.14,12412000.0,14.993333
1,0,1991,2.426,-6.760,-0.176,0.182,0.011,61.632,0.010,1.718,0.129,15.06,13299000.0,14.370750
2,0,1992,1.382,-43.065,-1.045,0.095,0.006,63.013,0.010,0.927,0.064,13.60,14486000.0,14.056083
3,0,1993,1.334,-3.452,-0.048,0.084,0.006,64.347,0.010,0.894,0.057,13.43,15817000.0,14.439250
4,0,1994,1.282,-3.852,-0.051,0.075,0.006,65.630,0.010,0.860,0.050,13.24,17076000.0,14.754750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5127,169,2009,5.519,-28.494,-2.199,0.441,0.018,622.612,0.053,1.668,0.133,63.54,12527000.0,21.377250
5128,169,2010,7.707,39.628,2.187,0.607,0.023,630.318,0.052,1.762,0.139,66.54,12698000.0,21.986250
5129,169,2011,9.498,23.250,1.792,0.737,0.028,639.817,0.051,3.261,0.253,67.42,12894000.0,21.602417
5130,169,2012,7.695,-18.985,-1.803,0.587,0.022,647.512,0.050,3.693,0.282,67.63,13115000.0,21.521333


In [9]:
# # scale the data for modeling

# #create scaler instance
# from sklearn.preprocessing import StandardScaler
# data_scaler = StandardScaler()

# # train scaler instance and transform data
# scaled_co2 = data_scaler.fit_transform(co2_df)
# scaled_co2

In [11]:
# Define the feature set

X = co2_df.drop('co2_per_capita', axis=1)
X.head()

Unnamed: 0,country,year,co2,co2_growth_prct,co2_growth_abs,share_global_co2,cumulative_co2,share_global_cumulative_co2,oil_co2,oil_co2_per_capita,total_ghg,population,AverageTemperature
0,0,1990,2.602,-5.877,-0.162,0.012,59.205,0.01,1.85,0.149,15.14,12412000.0,14.993333
1,0,1991,2.426,-6.76,-0.176,0.011,61.632,0.01,1.718,0.129,15.06,13299000.0,14.37075
2,0,1992,1.382,-43.065,-1.045,0.006,63.013,0.01,0.927,0.064,13.6,14486000.0,14.056083
3,0,1993,1.334,-3.452,-0.048,0.006,64.347,0.01,0.894,0.057,13.43,15817000.0,14.43925
4,0,1994,1.282,-3.852,-0.051,0.006,65.63,0.01,0.86,0.05,13.24,17076000.0,14.75475


In [12]:
#Define the target set

y= co2_df['co2_per_capita']

### Building a random forest model

In [13]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [14]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [16]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

ValueError: Unknown label type: 'continuous'