In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib for saving
# Restart kernel after installing 
!pip install joblib

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
import missingno as msno
import seaborn as sns

In [None]:
# Apply seaborn's default styling to the figures drawn below
sns.set()

In [None]:
# Load the compiled dataset. The raw-content URL is required: the
# github.com/.../blob/... address serves an HTML page, not the CSV file itself.
df = pd.read_csv("https://raw.githubusercontent.com/CodeReber/dataviz_final/main/data/ml_compile.csv")

In [None]:
# List the columns of the loaded dataset
df.columns

In [None]:
# Visualize missing values in df with missingno (msno) as a bar chart
msno.bar(df)

## Machine Learning: Climate x Resource Selection Factor (RSF)

In [None]:
# Subset for the RSF models: the date key, the climate predictors and
# both RSF targets (mean and variance).
df_rsf = df.loc[:, [
    'year_month',
    'land_avg_temp', 'land_max_temp', 'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly', 'north_max_temp_anomoly', 'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
    'bear_rsf_mean', 'bear_rsf_var',
]]

In [None]:
# Keep only the rows that have no missing value in any selected column
df_rsf_null = df_rsf.dropna(axis="index", how="any")

In [None]:
# Summary statistics of the complete (null-free) RSF rows
df_rsf_null.describe()

In [None]:
# Histogram of each climate predictor, arranged in a 3x3 grid.
# (pandas and seaborn are already imported in the import cell at the top of the
# notebook, so they are not re-imported here.)
numerical = ['land_avg_temp', 'land_max_temp', 'land_min_temp',
       'land_ocean_avg_temp', 'north_min_temp_anomoly',
       'north_max_temp_anomoly', 'north_mean_temp_anomoly', 'global_avg_co2',
       'seaice_extent']
# Trailing semicolon suppresses the array-of-axes repr in the cell output
df_rsf_null[numerical].hist(bins=30, figsize=(15, 10), layout=(3, 3));

# Preprocessing the Data

## Assign X(features) and y(target) data

In [None]:
# Features: the nine climate predictors
X = df_rsf_null.loc[:, [
    'land_avg_temp', 'land_max_temp', 'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly', 'north_max_temp_anomoly', 'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
]]

# Targets, reshaped into column vectors of shape (n_samples, 1)
y_rsf = df_rsf_null["bear_rsf_mean"].values.reshape((-1, 1))
y_var = df_rsf_null["bear_rsf_var"].values.reshape((-1, 1))

print(X.shape, y_rsf.shape)

## Split the data into testing and training dataset

In [None]:
from sklearn.model_selection import train_test_split

# Split the features and BOTH targets in a single call so that all arrays share
# the same row partition. y_var_train / y_var_test are required by the scaling
# cell that follows; their split used to be commented out, which made that cell
# fail with a NameError.
(X_train, X_test,
 y_rsf_train, y_rsf_test,
 y_var_train, y_var_test) = train_test_split(
    X, y_rsf, y_var, test_size=0.2, random_state=42
)

## MinMaxScaler to fit and transform X features and y target

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit one MinMaxScaler per array, using the training split only
X_minmax = MinMaxScaler().fit(X_train)
y_rsf_minmax = MinMaxScaler().fit(y_rsf_train)
y_var_minmax = MinMaxScaler().fit(y_var_train)

# Features
X_train_minmax, X_test_minmax = (
    X_minmax.transform(part) for part in (X_train, X_test)
)

# Target value #1 (RSF mean)
y_rsf_train_minmax, y_rsf_test_minmax = (
    y_rsf_minmax.transform(part) for part in (y_rsf_train, y_rsf_test)
)

# Target value #2 (RSF variance)
y_var_train_minmax, y_var_test_minmax = (
    y_var_minmax.transform(part) for part in (y_var_train, y_var_test)
)

# Linear Regression Analysis

## RSF

In [None]:
# Create the model and fit it to the scaled training data (target: scaled RSF mean)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_minmax, y_rsf_train_minmax)

In [None]:
# Predict with the model fitted in the previous cell and plot the residuals.
# The model is already fitted, so it is not fitted a second time here, and each
# set of predictions is computed once and reused.
train_predictions = model.predict(X_train_minmax)
predictions = model.predict(X_test_minmax)  # test-set predictions, reused by the validation cell

plt.scatter(train_predictions, train_predictions - y_rsf_train_minmax, c="blue", label="Training Data")
plt.scatter(predictions, predictions - y_rsf_test_minmax, c="orange", label="Testing Data")
plt.legend()
# The x-axis shows predicted values, so the zero line must span the prediction range
plt.hlines(
    y=0,
    xmin=min(train_predictions.min(), predictions.min()),
    xmax=max(train_predictions.max(), predictions.max()),
)
plt.xlabel("Predicted value (scaled)")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate on the test split: R^2 from the model itself, MSE from the
# test-set predictions computed in the previous cell
r2 = model.score(X_test_minmax, y_rsf_test_minmax)
MSE = mean_squared_error(y_rsf_test_minmax, predictions)

print(f"MSE: {MSE}, R2: {r2}")

## RSF variability

In [None]:
# Create the model and fit it to the scaled training data (target: scaled RSF variance)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_minmax, y_var_train_minmax)

In [None]:
# Predict with the model fitted in the previous cell and plot the residuals.
# The model is already fitted, so it is not fitted a second time here, and each
# set of predictions is computed once and reused.
train_predictions = model.predict(X_train_minmax)
predictions = model.predict(X_test_minmax)  # test-set predictions, reused by the validation cell

plt.scatter(train_predictions, train_predictions - y_var_train_minmax, c="blue", label="Training Data")
plt.scatter(predictions, predictions - y_var_test_minmax, c="orange", label="Testing Data")
plt.legend()
# The x-axis shows predicted values, so the zero line must span the prediction range
plt.hlines(
    y=0,
    xmin=min(train_predictions.min(), predictions.min()),
    xmax=max(train_predictions.max(), predictions.max()),
)
plt.xlabel("Predicted value (scaled)")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate on the test split: R^2 from the model itself, MSE from the
# test-set predictions computed in the previous cell
r2 = model.score(X_test_minmax, y_var_test_minmax)
MSE = mean_squared_error(y_var_test_minmax, predictions)

print(f"MSE: {MSE}, R2: {r2}")

## Preprocessing - Transform RSF values to Categorical

In [None]:
# Replace missing RSF means with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the chained in-place form is deprecated
# and leaves df unchanged when pandas copy-on-write is active.
df["bear_rsf_mean"] = df["bear_rsf_mean"].fillna(0)

In [None]:
# Bin the mean RSF into categories: <6 'unknown', [6, 10) 'low', [10, 15) 'mid',
# >=15 'high'. Lower bounds are inclusive so that values of exactly 6, 10 or 15
# get a category instead of matching no condition.
rsf_mean = df["bear_rsf_mean"]
conditions = [
    rsf_mean < 6,
    (rsf_mean >= 6) & (rsf_mean < 10),
    (rsf_mean >= 10) & (rsf_mean < 15),
    rsf_mean >= 15,
]
values = ['unknown', 'low', 'mid', 'high']
# A string default keeps the result dtype consistent with the labels (np.select's
# built-in default is the integer 0); it only applies to rows matching no
# condition, i.e. values that are still NaN.
df["bear_mean_cat"] = np.select(conditions, values, default='unknown')

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# Show which category labels actually occur in the new column
df["bear_mean_cat"].unique()

In [None]:
# TODO (original note): "Change Threshold to 9" — presumably refers to the RSF
# category thresholds; confirm before acting on it.
# Subset for classification: the climate predictors plus the RSF category label.
df_cat_null = df.loc[:, [
    'land_avg_temp', 'land_max_temp', 'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly', 'north_max_temp_anomoly', 'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
    'bear_mean_cat',
]]

In [None]:
# Drop every row that has a missing value in any of the selected columns
df_cat_null = df_cat_null.dropna(axis=0, how="any")

In [None]:
# Summary statistics of the null-free classification subset
df_cat_null.describe()

## Assign new variables

In [None]:
# Features: the nine climate predictors
X = df_cat_null.loc[:, [
    'land_avg_temp', 'land_max_temp', 'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly', 'north_max_temp_anomoly', 'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
]]

# Target: the RSF category label
y_cat = df_cat_null["bear_mean_cat"]

print(X.shape, y_cat.shape)

## Split the data into testing and training dataset

In [None]:
from sklearn.model_selection import train_test_split

# 50/50 train/test split with a fixed seed
X_train, X_test, y_cat_train, y_cat_test = train_test_split(
    X,
    y_cat,
    test_size=0.5,
    random_state=42,
)

## MinMaxScaler to fit and transform X features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the feature scaler on the training split, then scale both splits with it
X_minmax = MinMaxScaler()
X_minmax.fit(X_train)

X_train_minmax, X_test_minmax = (
    X_minmax.transform(part) for part in (X_train, X_test)
)

# Logistic Regression Analysis

## RSF Category

In [None]:
# Create the classifier and fit it to the scaled training data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_minmax, y_cat_train)
# Score on the SCALED features: the classifier was trained on min-max scaled
# inputs, so scoring it on the raw X_train / X_test gives misleading accuracies.
print(f"Training Data Score: {classifier.score(X_train_minmax, y_cat_train)}")
print(f"Testing Data Score: {classifier.score(X_test_minmax, y_cat_test)}")

# Random Forest Analysis

## RSF Category

In [None]:
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): y_cat_rf is not used by any cell visible here; kept in case
# other cells rely on it.
y_cat_rf = df_cat_null["bear_mean_cat"]

# Seed the forest so its score and feature importances are reproducible,
# consistent with random_state=42 used for the train/test splits.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train_minmax, y_cat_train)
# Accuracy on the held-out split; printed in a later cell
score = rf.score(X_test_minmax, y_cat_test)


In [None]:
# Pair each importance with its feature name and list them, largest first
importances = rf.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

In [None]:
# Report the test-split accuracy stored in `score`
print(f"Random Forest Testing Score: {score}")

## Hyperparameter tuning of the model

In [None]:
# List the names of the estimator's parameters
rf.get_params().keys()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, so only
# currently supported options are listed.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

### Label encoding y_cat

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then encode both splits with it
label_encoder = LabelEncoder().fit(y_cat_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_cat_train, y_cat_test)
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Use the random grid to search for the best hyperparameters.
# The target is a label-encoded category, so the base model must be a
# classifier: a regressor would treat the arbitrary integer codes as ordered
# quantities, and hyperparameters tuned for it would not carry over to the
# RandomForestClassifier that is fitted afterwards.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train_minmax, encoded_y_train)

In [None]:
# Show the parameter combination selected by the randomized search
rf_random.best_params_

In [None]:
# Fit a classifier with the tuned hyperparameters and score it on the test split.
# NOTE(review): the values are hard-coded — presumably copied from
# rf_random.best_params_ of an earlier run; confirm they still match.
# bootstrap must be the boolean True (the string 'True' is rejected by
# scikit-learn's parameter validation), and max_features='auto' (removed in
# scikit-learn 1.3) is replaced by 'sqrt', its equivalent for classifiers.
rf2 = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=60,
    bootstrap=True,
    random_state=42,  # reproducible score
)
rf2 = rf2.fit(X_train_minmax, encoded_y_train)
score = rf2.score(X_test_minmax, encoded_y_test)

print(f"Random Forest Testing Score: {score}")

# Machine Learning Prediction: Den location

df.columns

In [None]:
# Subset for the den models: the date key, the climate predictors and
# the three den counts (land, ice, active).
df_dens = df.loc[:, [
    'year_month',
    'land_avg_temp', 'land_max_temp', 'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly', 'north_max_temp_anomoly', 'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
    'num_land_dens', 'num_ice_dens', 'num_active_dens',
]]

In [None]:
# Summary statistics of the den subset before dropping rows with missing values
df_dens.describe()

In [None]:
# Drop the rows that contain missing values
df_dens_null = df_dens.dropna(axis=0)

In [None]:
# Summary statistics of the den subset after dropping rows with missing values
df_dens_null.describe()

In [None]:
# Features: the nine climate predictors plus the number of active dens.
# NOTE(review): if num_active_dens is derived from the land/ice den counts it
# would leak the target into the features — confirm how it is computed.
X_den = df_dens_null.loc[:, [
    'land_avg_temp', 'land_max_temp', 'land_min_temp',
    'land_ocean_avg_temp',
    'north_min_temp_anomoly', 'north_max_temp_anomoly', 'north_mean_temp_anomoly',
    'global_avg_co2',
    'seaice_extent',
    "num_active_dens",
]]

# Targets, reshaped into column vectors of shape (n_samples, 1)
y_den_land = df_dens_null["num_land_dens"].values.reshape((-1, 1))
y_den_ice = df_dens_null["num_ice_dens"].values.reshape((-1, 1))

print(X_den.shape, y_den_land.shape, y_den_ice.shape)

## Land Dens Model

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split of the den features and the land-den target, fixed seed
X_den_train, X_den_test, y_den_land_train, y_den_land_test = train_test_split(
    X_den,
    y_den_land,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Features: fit the scaler on the training split, then scale both splits
X_den_minmax = MinMaxScaler().fit(X_den_train)
X_den_train_minmax, X_den_test_minmax = (
    X_den_minmax.transform(part) for part in (X_den_train, X_den_test)
)

# Target value #1 (number of land dens): same procedure with its own scaler
y_den_minmax = MinMaxScaler().fit(y_den_land_train)
y_den_land_train_minmax, y_den_land_test_minmax = (
    y_den_minmax.transform(part) for part in (y_den_land_train, y_den_land_test)
)


# Multiple Linear Regression Analysis

## Number of Land Dens

In [None]:
# Create the model and fit it to the scaled training data (target: scaled number of land dens)
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_den_train_minmax, y_den_land_train_minmax)

In [None]:
# Predict with the model fitted in the previous cell and plot the residuals.
# The model is already fitted, so it is not fitted a second time here, and each
# set of predictions is computed once and reused.
train_predictions = model.predict(X_den_train_minmax)
predictions = model.predict(X_den_test_minmax)  # test-set predictions, reused by the validation cell

plt.scatter(train_predictions, train_predictions - y_den_land_train_minmax, c="blue", label="Training Data")
plt.scatter(predictions, predictions - y_den_land_test_minmax, c="orange", label="Testing Data")
plt.legend()
# The x-axis shows predicted values, so the zero line must span the prediction range
plt.hlines(
    y=0,
    xmin=min(train_predictions.min(), predictions.min()),
    xmax=max(train_predictions.max(), predictions.max()),
)
plt.xlabel("Predicted value (scaled)")
plt.ylabel("Residual (predicted - actual)")
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate on the test split: R^2 from the model itself, MSE from the
# test-set predictions computed in the previous cell
r2 = model.score(X_den_test_minmax, y_den_land_test_minmax)
MSE = mean_squared_error(y_den_land_test_minmax, predictions)

print(f"MSE: {MSE}, R2: {r2}")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): this treats the land-den count as a class label; confirm that
# classification (rather than regression) is intended for this target.
# KNeighborsClassifier expects 1-D labels, so the (n, 1) column vectors are
# flattened once up front (passing them as-is triggers a DataConversionWarning).
y_den_land_train_1d = y_den_land_train.ravel()
y_den_land_test_1d = y_den_land_test.ravel()

# Odd k values from 1 to 19, shared by the loop and the plot
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_den_train_minmax, y_den_land_train_1d)
    train_score = knn.score(X_den_train_minmax, y_den_land_train_1d)
    test_score = knn.score(X_den_test_minmax, y_den_land_test_1d)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves are drawn, so label them and use a neutral y-axis title
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# STEP 5: re-fit the classifier with the chosen k value.
# NOTE(review): the model used k=13 while the printed message said k=15; the
# code value (13) is kept and the message now reports the k actually used —
# confirm which k was intended.
best_k = 13
knn = KNeighborsClassifier(n_neighbors=best_k)
# ravel(): the classifier expects 1-D labels, not (n, 1) column vectors
knn.fit(X_den_train_minmax, y_den_land_train.ravel())
print("k=%d Test Acc: %.3f" % (best_k, knn.score(X_den_test_minmax, y_den_land_test.ravel())))