In [3]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error

In [4]:
# Load the data
url = "https://raw.githubusercontent.com/Ekenc/Project4/main/Data/Merged_Target_Glaciers.csv"
df = pd.read_csv(url)

# Show DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Mean cumulative mass balance of glaciers,Precipitation Anomaly,Mean Adjusted Sea Level (inches),Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,0,1964,291.2,319.62,1260.3,-2.545,-0.041776,4.169291,-0.546,-0.252
1,1,1967,291.5,322.18,1284.03,-2.662,-0.096894,4.452756,-0.42,0.0
2,2,1970,293.8,325.620315,1351.7,-3.519,-0.070516,4.677165,-0.294,0.108
3,3,1971,294.0,326.32,1357.2,-3.758,0.03224,4.88189,-0.51,-0.126
4,4,1972,295.6,328.74211,1380.1,-4.016,-0.772485,5.240157,-0.186,0.072


In [5]:
# Define features set
X = df.drop("Mean cumulative mass balance of glaciers", axis = 1)
X = X.drop("Unnamed: 0", axis = 1)
X.head()


Unnamed: 0,Year,Mean Nitrous Oxide Concentration,Mean Carbon Dioxide Concentration,Mean Methane Concentration,Precipitation Anomaly,Mean Adjusted Sea Level (inches),Sea Temperature Anomaly,Earth Surface Temperature Anomaly (land and ocean)
0,1964,291.2,319.62,1260.3,-0.041776,4.169291,-0.546,-0.252
1,1967,291.5,322.18,1284.03,-0.096894,4.452756,-0.42,0.0
2,1970,293.8,325.620315,1351.7,-0.070516,4.677165,-0.294,0.108
3,1971,294.0,326.32,1357.2,0.03224,4.88189,-0.51,-0.126
4,1972,295.6,328.74211,1380.1,-0.772485,5.240157,-0.186,0.072


In [6]:
# Define target vector
y = df["Mean cumulative mass balance of glaciers"]
y[:48]

0     -2.545
1     -2.662
2     -3.519
3     -3.758
4     -4.016
5     -4.147
6     -4.339
7     -4.534
8     -5.140
9     -5.919
10    -6.726
11    -7.009
12    -7.586
13    -7.475
14    -7.559
15    -7.892
16    -8.399
17    -8.832
18    -8.935
19    -9.242
20    -9.917
21   -10.384
22   -10.819
23   -11.611
24   -12.134
25   -12.861
26   -13.088
27   -13.317
28   -13.755
29   -14.326
30   -15.018
31   -15.758
32   -16.591
33   -17.202
34   -17.619
35   -18.169
36   -19.011
37   -19.920
38   -20.657
39   -21.502
40   -22.285
41   -23.402
42   -24.383
43   -25.152
44   -26.043
45   -27.174
Name: Mean cumulative mass balance of glaciers, dtype: float64

In [7]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [8]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [9]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create the random forest classifier instance
random_model = RandomForestRegressor(n_estimators=100, random_state=78)

In [11]:
# Fit the model 
random_model = random_model.fit(X_train_scaled, y_train)

In [12]:
# Making predictions using the testing data
predictions = random_model.predict(X_test_scaled)

In [13]:
# Calculate the mean squared error, model evaluation
print(
  'mean_squared_error : ', mean_squared_error(y_test, predictions))
print(
  'mean_absolute_error : ', mean_absolute_error(y_test, predictions))

mean_squared_error :  0.11983690525000192
mean_absolute_error :  0.2650366666666689


In [14]:
# Compute the model score with training data
random_model.score(X_train_scaled, y_train)

0.9984769666595956

In [15]:
# Compute the model score with testing data
random_model.score(X_test_scaled, y_test)

0.995715673630794