In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /opt/anaconda3/envs/PythonAdv/lib/python3.6/site-packages (0.0)


In [2]:
# install joblib for saving
# Restart kernel after installing 
!pip install joblib



In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Load the ml_compile CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="../data/ml_compile.csv")

In [5]:
# Display the column labels of the loaded frame
df.columns

Index(['year_month', 'land_avg_temp', 'land_max_temp', 'land_min_temp',
       'land_ocean_avg_temp', 'north_min_temp_anomoly',
       'north_max_temp_anomoly', 'north_mean_temp_anomoly', 'global_avg_co2',
       'seaice_extent', 'bear_rsf_mean', 'bear_rsf_var', 'num_bears',
       'avg_distance_traveled', 'avg_num_land_dens', 'avg_num_ice_dens',
       'avg_num_active_dens'],
      dtype='object')

# Machine Learning: Climate x Resource Selection Factor (RSF)

In [9]:
# Keep only the rows that have at least 10 non-null values.
# (thresh counts non-null entries, so rows with some nulls can still survive.)
df = df.dropna(axis="index", thresh=10)

In [10]:
# Summary statistics (count, mean, std, quartiles) of the filtered frame
df.describe()

Unnamed: 0,land_avg_temp,land_max_temp,land_min_temp,land_ocean_avg_temp,north_min_temp_anomoly,north_max_temp_anomoly,north_mean_temp_anomoly,global_avg_co2,seaice_extent,avg_distance_traveled
count,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,447.0,81.0
mean,9.226651,14.930488,3.615477,15.641613,0.775022,0.659465,0.686273,366.108054,11.559153,601792800.0
std,4.134906,4.249913,4.010169,1.229699,0.528649,0.567605,0.540999,18.90857,3.185538,684015800.0
min,2.558,8.071,-2.853,13.566,-0.787,-1.256,-1.065,332.41,3.5656,178423.0
25%,5.189,10.696,-0.308,14.441,0.4135,0.253,0.289,350.32,8.842585,132456500.0
50%,9.394,15.174,3.759,15.683,0.766,0.653,0.671,364.31,12.199871,265070000.0
75%,13.1855,19.062,7.447,16.8015,1.1015,1.037,1.0295,382.25,14.409063,982286300.0
max,15.482,21.32,9.715,17.611,2.554,2.465,2.51,403.96,16.341938,2617030000.0


# Den location

In [None]:
# Subset used for the den-location / mobility analysis: all columns of df
# except the two RSF columns (bear_rsf_mean, bear_rsf_var).
df_loc = df.loc[
    :,
    [
        "year_month",
        "land_avg_temp",
        "land_max_temp",
        "land_min_temp",
        "land_ocean_avg_temp",
        "north_min_temp_anomoly",
        "north_max_temp_anomoly",
        "north_mean_temp_anomoly",
        "global_avg_co2",
        "seaice_extent",
        "num_bears",
        "avg_distance_traveled",
        "avg_num_land_dens",
        "avg_num_ice_dens",
        "avg_num_active_dens",
    ]
]

In [None]:
# Summary statistics of the den-location subset
df_loc.describe()

In [None]:
# Drop every row that has a missing value in any df_loc column.
# Despite its name, df_loc_null holds the rows WITHOUT nulls.
df_loc_null = df_loc.dropna(axis="index", how="any")

In [None]:
# Summary statistics of the null-free den-location subset
df_loc_null.describe()

## Climate X Bear Mobility 

In [None]:
# Features: year_month plus the climate measurements.
# NOTE(review): year_month is absent from the df.describe() output, so it is
# presumably non-numeric — confirm it can be scaled before modelling.
X_mob = df_loc_null.loc[
    :,
    [
        "year_month",
        "land_avg_temp",
        "land_max_temp",
        "land_min_temp",
        "land_ocean_avg_temp",
        "north_min_temp_anomoly",
        "north_max_temp_anomoly",
        "north_mean_temp_anomoly",
        "global_avg_co2",
        "seaice_extent",
    ]
]

# Target: average distance traveled, reshaped to a single-column 2-D array
y_mob = df_loc_null.loc[:, "avg_distance_traveled"].values.reshape(-1, 1)

print(f"{X_mob.shape} {y_mob.shape}")

## Bear Mobility Model: Train/Test Split and Scaling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
(
    X_mob_train,
    X_mob_test,
    y_mob_train,
    y_mob_test,
) = train_test_split(X_mob, y_mob, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Feature scaler: fitted on the training split only, then applied to both
# splits so the test data never influences the learned min/max.
X_mob_minmax = MinMaxScaler()
X_mob_train_minmax = X_mob_minmax.fit_transform(X_mob_train)
X_mob_test_minmax = X_mob_minmax.transform(X_mob_test)

# Target scaler, fitted the same way on the training targets
y_mob_minmax = MinMaxScaler()
y_mob_land_train_minmax = y_mob_minmax.fit_transform(y_mob_train)
y_mob_land_test_minmax = y_mob_minmax.transform(y_mob_test)


# Multiple Linear Regression Analysis

## Number of Land Dens

In [None]:
# Create the model and fit it to the scaled training data.
# Uses the scaled mobility splits (X_mob_train_minmax / y_mob_land_train_minmax),
# which are the scaled train arrays this notebook actually defines.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_mob_train_minmax, y_mob_land_train_minmax)

In [None]:
# Make predictions with the already-fitted model and plot the residuals.
# Each split is predicted exactly once and the result reused below.
train_predictions = model.predict(X_mob_train_minmax)
predictions = model.predict(X_mob_test_minmax)  # test-set predictions

# Residual = predicted - actual, both in min-max scaled target units
train_residuals = train_predictions - y_mob_land_train_minmax
test_residuals = predictions - y_mob_land_test_minmax

fig, ax = plt.subplots()
ax.scatter(train_predictions, train_residuals, c="blue", label="Training Data")
ax.scatter(predictions, test_residuals, c="orange", label="Testing Data")

# Zero-error reference line spanning the plotted x-range (the predicted values)
x_min = min(train_predictions.min(), predictions.min())
x_max = max(train_predictions.max(), predictions.max())
ax.hlines(y=0, xmin=x_min, xmax=x_max)

ax.set_xlabel("Predicted value (scaled)")
ax.set_ylabel("Residual (predicted - actual)")
ax.set_title("Residual Plot")
ax.legend()
plt.show()

In [None]:
# Validate the model on the held-out test split with MSE and R2
from sklearn.metrics import mean_squared_error

# Predict here so both metrics are computed from the same fitted model and
# the same test split, without relying on a variable left over from another cell.
predictions = model.predict(X_mob_test_minmax)

MSE = mean_squared_error(y_mob_land_test_minmax, predictions)  # in scaled target units
r2 = model.score(X_mob_test_minmax, y_mob_land_test_minmax)

print(f"MSE: {MSE}, R2: {r2}")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# The target (avg_distance_traveled) is continuous, so a KNN regressor is used;
# .score() therefore reports R2, not classification accuracy.
# ravel() turns the single-column targets into 1-D arrays.
y_knn_train = y_mob_land_train_minmax.ravel()
y_knn_test = y_mob_land_test_minmax.ravel()

# Odd k from 1 to 19, capped at the number of training samples
# (a k-neighbors query cannot ask for more neighbors than there are samples).
k_values = [k for k in range(1, 20, 2) if k <= len(y_knn_train)]

train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_mob_train_minmax, y_knn_train)
    train_score = knn.score(X_mob_train_minmax, y_knn_train)
    test_score = knn.score(X_mob_test_minmax, y_knn_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Train vs. test score for each k, used to pick the number of neighbors
fig, ax = plt.subplots()
ax.plot(k_values, train_scores, marker="o", label="Train")
ax.plot(k_values, test_scores, marker="x", label="Test")
ax.set_xlabel("k neighbors")
ax.set_ylabel("R2 score")
ax.legend()
plt.show()

In [None]:
# STEP 5: re-fit the KNN model with the chosen k value.
# The target is continuous, so this is a regressor and .score() returns R2.
from sklearn.neighbors import KNeighborsRegressor

# NOTE(review): confirm this k against the k-vs-score plot.
# The printed label is derived from best_k so it always matches the fitted model.
best_k = 13
knn = KNeighborsRegressor(n_neighbors=best_k)
knn.fit(X_mob_train_minmax, y_mob_land_train_minmax.ravel())
test_r2 = knn.score(X_mob_test_minmax, y_mob_land_test_minmax.ravel())
print(f"k={best_k} Test R2: {test_r2:.3f}")