In [3]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /opt/anaconda3/envs/PythonAdv/lib/python3.6/site-packages (0.0)


In [4]:
# install joblib for saving
# Restart kernel after installing 
!pip install joblib



In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
import missingno as msno
import seaborn as sns

In [6]:
# Load the compiled dataset (path is relative to the notebook's working directory).
csv_path = "../data/ml_compile.csv"
df = pd.read_csv(csv_path)

## Categorical Algorithms
### Preprocessing - Transform RSF values to Categorical

In [7]:
# Replace missing bear RSF means with 0.
# Assign the result back rather than calling fillna(inplace=True) on df[...]:
# the in-place form mutates an intermediate Series, which pandas flags as
# chained assignment and which does not update df under copy-on-write.
df["bear_rsf_mean"] = df["bear_rsf_mean"].fillna(0)

In [8]:
# Bin the mean bear RSF value into categories.
# Bins are closed on the left so every value matches exactly one condition.
# The previous strict comparisons (< and > only) left values exactly equal to
# 6, 10 or 15 unmatched, and np.select then fell back to its numeric default 0,
# which does not share a dtype with the string labels.
# NOTE(review): assigning each boundary value to the upper bin is an assumption — confirm.
rsf_mean = df["bear_rsf_mean"]
conditions = [
    rsf_mean < 6,
    (rsf_mean >= 6) & (rsf_mean < 10),
    (rsf_mean >= 10) & (rsf_mean < 15),
    rsf_mean >= 15,
]
values = ['unknown', 'low', 'mid', 'high']
# Explicit string default: rows matching no condition (e.g. NaN) become
# 'unknown' and the resulting column stays purely string-typed.
df["bear_mean_cat"] = np.select(conditions, values, default='unknown')

In [11]:
# Keep only the nine predictor columns plus the categorical target.
# NOTE(review): the original comment here read "Change Threshold to 9"; what
# the threshold refers to is not visible in this cell — confirm or remove.
selected_columns = [
    'land_avg_temp',
    'land_max_temp',
    'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly',
    'north_max_temp_anomoly',
    'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
    'bear_mean_cat',
]
df_cat_null = df[selected_columns]

In [12]:
# Discard every row that has a missing value in any column of df_cat_null.
df_cat_null = df_cat_null.dropna(axis="index", how="any")

### Assign new variables

In [14]:
# Feature matrix: the nine climate / sea-ice predictor columns.
feature_columns = [
    'land_avg_temp',
    'land_max_temp',
    'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly',
    'north_max_temp_anomoly',
    'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
]
X = df_cat_null[feature_columns]

# Target: the categorical bear RSF label.
y_cat = df_cat_null["bear_mean_cat"]

# Sanity check: both must have the same number of rows.
print(X.shape, y_cat.shape)

(447, 9) (447,)


### Split the data into testing and training datasets

In [15]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows for testing; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_cat_train, y_cat_test = train_test_split(
    X,
    y_cat,
    test_size=0.5,
    random_state=42,
)

### MinMaxScaler to fit and transform X features (the y target is label-encoded below)

In [16]:
# Scale the X features with MinMaxScaler.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test data never influences the fitted parameters.
from sklearn.preprocessing import MinMaxScaler

# Note: X_minmax holds the fitted scaler object, not scaled data.
X_minmax = MinMaxScaler()
X_minmax.fit(X_train)

X_train_minmax, X_test_minmax = (
    X_minmax.transform(split) for split in (X_train, X_test)
)

# Random Forest Analysis

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest trained on the scaled features and string class labels.

# Full label column; not used in this cell — kept in case a later cell reads it.
y_cat_rf = df_cat_null["bear_mean_cat"]

# random_state pins the forest's internal randomness so the reported score is
# reproducible on Restart & Run All (42 matches the seed used for the split).
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_minmax, y_cat_train)

# Mean accuracy on the held-out test split; the bare last expression displays
# it (previously the value was computed but never shown).
score = rf.score(X_test_minmax, y_cat_test)
score


In [18]:
from sklearn.preprocessing import LabelEncoder

# Map the string categories to integer codes. The mapping is learned from the
# training labels only and then applied to both label sets.
label_encoder = LabelEncoder().fit(y_cat_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_cat_train, y_cat_test)
)

In [19]:
# Tuned random forest trained on the integer-encoded labels.
# - bootstrap=True is a real boolean. The original passed the string 'True',
#   which only behaved like True because non-empty strings are truthy, and is
#   rejected by parameter validation in newer scikit-learn releases.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
#   deprecated and later removed from scikit-learn.
# - random_state makes the fit, and therefore the score, reproducible.
rf2 = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=60,
    bootstrap=True,
    random_state=42,
)
rf2 = rf2.fit(X_train_minmax, encoded_y_train)

# Mean accuracy on the held-out test split; displayed as the cell output.
score = rf2.score(X_test_minmax, encoded_y_test)
score

In [20]:
import joblib

# Persist the fitted model to disk.
# The first argument to joblib.dump must be the object to save. The original
# passed the string "rf_rsf", so the file held only that text, not a model.
# NOTE(review): saving rf2 (the last model trained) is an assumption — use `rf`
# if the baseline model is the one intended. rf2 predicts integer codes, so
# label_encoder is needed to map predictions back to category names.
filename = 'rf_rsf.sav'
joblib.dump(rf2, filename)



['rf_rsf.sav']