In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /opt/anaconda3/lib/python3.7/site-packages (0.0)


In [2]:
# install joblib for saving
# Restart kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
# import missingno as msno
import seaborn as sns

In [4]:
df = pd.read_csv("../data/ml_compile.csv")

In [5]:
df.describe()

Unnamed: 0,land_avg_temp,land_max_temp,land_min_temp,land_ocean_avg_temp,north_min_temp_anomoly,north_max_temp_anomoly,north_mean_temp_anomoly,global_avg_co2,seaice_extent,bear_rsf_mean,bear_rsf_var,num_bears,avg_distance_traveled,avg_num_land_dens,avg_num_ice_dens,avg_num_active_dens
count,1392.0,1392.0,1392.0,1392.0,1428.0,1434.0,1446.0,752.0,506.0,334.0,334.0,84.0,84.0,84.0,84.0,84.0
mean,8.762454,14.516995,3.000047,15.31703,0.122602,0.213536,0.21362,355.582926,11.405146,15.820979,3.045457,6.690476,591633200.0,41.553384,504.157876,40.258267
std,4.199958,4.294911,4.114758,1.259405,0.709268,0.620087,0.630979,28.949294,3.25291,2.430975,1.087362,3.951441,676993700.0,76.629269,481.543071,70.866545
min,1.395,7.082,-4.298,12.839,-1.951,-1.772,-1.647,312.43,3.5656,6.093212,0.0,1.0,178423.0,0.0,0.0,0.0
25%,4.65575,10.42625,-1.10325,14.15075,-0.35425,-0.18475,-0.198,329.1125,8.571152,14.87223,2.420035,3.0,129620600.0,0.0,171.946429,0.0
50%,9.0875,14.899,3.2135,15.373,0.0225,0.1405,0.098,352.88,12.095933,16.388356,2.949145,7.0,258839700.0,10.083333,270.75,0.0
75%,12.953,18.867,7.048,16.47775,0.58325,0.5635,0.5765,378.8575,14.318582,17.471275,3.629055,8.0,928648800.0,55.9375,820.386905,68.392857
max,15.482,21.32,9.715,17.611,2.97,2.805,2.877,417.07,16.341938,20.0,6.027518,17.0,2617030000.0,422.333333,2012.5,333.0


In [6]:
# df = df[['land_avg_temp', 'land_max_temp', 'land_min_temp',
#        'land_ocean_avg_temp', 'north_min_temp_anomoly',
#        'north_max_temp_anomoly', 'north_mean_temp_anomoly', 'global_avg_co2',
#        'seaice_extent', 'bear_rsf_mean']]
df = df[['land_avg_temp', 'global_avg_co2',
       'seaice_extent', 'bear_rsf_mean']]

In [7]:
df = df.dropna(axis=0, how="any")
df.describe()

Unnamed: 0,land_avg_temp,global_avg_co2,seaice_extent,bear_rsf_mean
count,331.0,331.0,331.0,331.0
mean,9.385082,372.225619,11.357481,15.851405
std,4.121994,16.649756,3.22025,2.392539
min,2.71,343.13,3.5656,6.093212
25%,5.552,356.145,8.513177,14.928223
50%,9.618,372.25,12.0164,16.39683
75%,13.443,386.235,14.326682,17.470912
max,15.482,403.96,16.050143,20.0


In [45]:
from statistics import median 
print(median(df["bear_rsf_mean"]))

16.39683045


## Categorical Algorithms
### Preprocessing - Transform RSF values to Categorical

In [91]:
conditions = [(df["bear_rsf_mean"] < 15), (df["bear_rsf_mean"] > 15) & (df["bear_rsf_mean"] < 18), (df["bear_rsf_mean"]>18)]
values = ['low', 'mid', 'high']
df["bear_mean_cat"] = np.select(conditions,values)

In [92]:
#Change Threshold to 9
df_cat = df [['land_avg_temp', 'global_avg_co2',
       'seaice_extent','bear_mean_cat']]

### Assign new variables

In [93]:
X = df_cat[['land_avg_temp', 'global_avg_co2',
       'seaice_extent']]
y_cat = df_cat["bear_mean_cat"]
print(X.shape, y_cat.shape)

(331, 3) (331,)


### Split the data into testing and training dataset

In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_cat_train, y_cat_test = train_test_split(X, y_cat, test_size=0.5, random_state=42)

### MinMaxScalar to fit and transform X features and y target

In [95]:
#Fit Transform using MinMaxScalar for X features
from sklearn.preprocessing import MinMaxScaler
X_minmax = MinMaxScaler().fit(X_train)

X_train_minmax = X_minmax.transform(X_train)
X_test_minmax = X_minmax.transform(X_test)

# Random Forest Analysis

In [96]:
from sklearn.ensemble import RandomForestClassifier
y_cat_rf = df_cat["bear_mean_cat"]
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_minmax, y_cat_train)
score = rf.score(X_test_minmax, y_cat_test)

In [97]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y_cat_train)
encoded_y_train = label_encoder.transform(y_cat_train)
encoded_y_test = label_encoder.transform(y_cat_test)

In [98]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(y_cat_train)
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'high': 0, 'low': 1, 'mid': 2}


In [99]:
rf2 = RandomForestClassifier(n_estimators=200, min_samples_split =5, min_samples_leaf = 4, max_features='auto', max_depth=60, bootstrap='True')
rf2 = rf2.fit(X_train_minmax, encoded_y_train)
score = rf2.score(X_test_minmax, encoded_y_test)
score

0.6506024096385542

In [1]:
import pickle
pickle.dump(rf2, open("/../../rf_rsf2.pkl", "wb"))

NameError: name 'rf2' is not defined

In [101]:
#load_file = open("rf_rsf.pkl", "rb")
with open("/../..mod/rf_rsf2.pkl", "rb") as f:
    model_object = pickle.load(f)
    f.close()

In [102]:
model_object

RandomForestClassifier(bootstrap='True', ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=60, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [103]:
 #print(model_object)
from sklearn.preprocessing import MinMaxScaler
seaice = 6
temp = 2
co2 = 440
X_test = [[temp, co2, seaice]]
X_minmax = MinMaxScaler().fit(X_test)
X_test_minmax = X_minmax.transform(X_test)
Ypredict = model_object.predict(X_test_minmax)  
print(Ypredict)

[2]
