# AI Launch Lab - Sea Ice Movement Challenge - Training

Do some more analysis on the data before choosing our final model.

In [10]:
import pandas as pd
import pickle
import os 
pd.options.mode.chained_assignment = None
import datetime
import numpy as np
import os
import zipfile
import modules.ml_pipeline.readdata as mlpp
import sys

Load the data from disk and display it

In [22]:
from os import listdir
from os.path import isfile, join

import glob
print(glob.glob("./data"))

['./data']


In [26]:

for root, dirs, files in os.walk("."):
    print(root)
    print(dir)
    for filename in files:
        print(filename)

.
<built-in function dir>
README.md
.gitignore
sea_ice_data_visualization.ipynb
sea_ice_data_PCA_analysis.ipynb
main.ipynb
sea_ice_data_ML_analysis.ipynb
requirements.txt
./modules
<built-in function dir>
utils.py
./modules/ml_pipeline
<built-in function dir>
README.md
NN_regression.py
soil_prediction.py
multilabelClassificationMetrics.py
NN_regression_pytorch.py
multilabelClassification.py
gcn.py
data_view.py
gcn_model.py
res_transcript_count_matrix_norm_lab.csv
multioutput_nn_regressor.py
spatial_graph_cons.py
refine.py
readdata.py
NN.py
soil_classification.py
main.py
analysis.py
./modules/ml_pipeline/__pycache__
<built-in function dir>
readdata.cpython-37.pyc
readdata.cpython-36.pyc
./.git
<built-in function dir>
index
description
config
packed-refs
FETCH_HEAD
HEAD
./.git/logs
<built-in function dir>
HEAD
./.git/logs/refs
<built-in function dir>
./.git/logs/refs/remotes
<built-in function dir>
./.git/logs/refs/remotes/origin
<built-in function dir>
density-vis
HEAD
master
./.git/log

In [24]:
# Load the raw data to disk

input_path = "./data/converted.csv"
df = pd.read_csv(input_path)

# Convert all column names to lower case and display the dataframe 
df = df.rename(str.lower, axis='columns')
print(df.columns)

Index(['index', 'year', 'month', 'day', 'doy', 'id_buoy', 'sic_cdr', 'd2c',
       'ice_thickness', 'buoy_lat', 'buoy_lon', 'buoy_vel_mag', 'buoy_vel_dir',
       'wind_vel_mag', 'wind_vel_dir'],
      dtype='object')


In [8]:
# Do some manipulations on the data and clean it (remove rows of all NaNs, remove duplicates, etc.)

# Remove any rows that have buoy velocity/mag =0 
df = df.drop(df[df["buoy_vel_mag"] == 0].index)

# Remove any rows that have very low ice_thickness  
"""
sea ice concentration of 0.5 means that the surroundings of the buoy is 50% sea ice and 50% open water. So ice concentration of 
1 is full ice cover, while a concentration of 0.1 means there’S barely any ice left at that location!
"""
df = df.drop(df[df["ice_thickness"] < 0.1].index)

# Print the dataframe dimensions
print("Dataframe final shape: ", df.shape)
display(df)

Dataframe final shape:  (327042, 15)


Unnamed: 0,index,year,month,day,doy,id_buoy,sic_cdr,d2c,ice_thickness,buoy_lat,buoy_lon,buoy_vel_mag,buoy_vel_dir,wind_vel_mag,wind_vel_dir
0,0,1979,2,18,49,1906,0.990195,522.523298,3.189743,78.007070,-128.549129,1.370671,2.191824,6.711849,3.189490
1,1,1979,2,18,49,1913,0.966372,412.767669,2.484009,74.498024,-119.750294,0.741408,0.520564,6.851881,3.240164
2,2,1979,2,18,49,1914,0.996022,362.547379,2.474106,74.003619,-134.786524,1.187695,2.934923,8.896751,3.014921
3,3,1979,2,18,49,1918,0.982681,381.025629,3.740522,81.019593,-145.578020,0.920127,0.028026,1.496117,3.905953
4,4,1979,2,19,50,1906,0.990302,521.535334,3.188522,78.002077,-128.560665,1.300527,1.273525,3.338513,2.278041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329883,339472,2019,12,30,364,25560,1.000000,470.197315,0.933125,75.686986,-96.341501,17.168005,3.811734,8.234026,3.676847
329884,339473,2019,12,30,364,44880,1.000000,702.312813,1.517084,86.513431,-29.153877,11.739478,4.628723,6.522125,4.314615
329885,339474,2019,12,30,364,53005,1.000000,360.491321,1.164462,79.286760,-53.579091,8.973035,3.772087,4.630856,3.224141
329886,339475,2019,12,30,364,95020,1.000000,393.799208,2.010032,79.025667,-135.924079,2.365742,1.967356,2.240471,0.523007


Define the train/test variables that we will use. 

In [None]:
print("Sea Ice Movement Datasets")
training_targets = df[["buoy_vel_mag","buoy_vel_dir"]]

# We will leave the month for now because it could be an indicator of weather/season 
print("Target Variables")
display(training_targets)

In [None]:
# Drop the time related columns 
training_data = df.drop(["index", "year","day","sic_cdr", "doy", "id_buoy","buoy_vel_mag","buoy_vel_dir"], axis = 1)
display(training_data)

In [None]:
# Evaluate multioutput regression models
%matplotlib inline
import numpy as np
import time 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

Split the dataset into training and validation sets. 

In [None]:
# test_size: what proportion of original data is used for test set
train_data, test_data, train_labels, test_labels = train_test_split(
    training_data, training_targets, test_size= 0.25, shuffle=False)

# show the sizes of the training and test sets
print("Training data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)

print("Training labels shape: ", train_labels.shape)
print("Test labels shape: ", test_labels.shape)

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual feature do not more or less look like standard normally distributed data

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

scaler = MinMaxScaler()

# Fit on training set only.
scaler.fit(train_data)

# Apply transform to both the training set and the test set.
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)

# display the normalized data
display(train_data)

Create a series of models and calculate the predictions.

In [None]:
# K-NEAREST NEIGHBOR MODEL 

# create model
regr_KNN = KNeighborsRegressor(n_neighbors=7)

# train the model 
print("Training KNN Regressor...")
start = time.time()
regr_KNN.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_KNN=regr_KNN.predict(test_data)
print("Regression Score: ", regr_KNN.score(test_data, test_labels))

In [None]:
# LINEAR REGRESSION MODEL 

# create model
regr_LR = LinearRegression()

# train the model 
print("Training Linear Regressor...")
start = time.time()
regr_LR.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_LR = regr_LR.predict(test_data)
print("Regression Score: ", regr_LR.score(test_data,test_labels))

In [None]:
# DECISION TREE MODEL 

# create model
regr_DT = DecisionTreeRegressor()

# train the model 
print("Training Decision Tree Regressor...")
start = time.time()
regr_DT.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_DT = regr_DT.predict(test_data)
print("Regression Score: ", regr_DT.score(test_data,test_labels))

In [None]:
# RANDOM FOREST  MODEL 

# create model
regr_rf = RandomForestRegressor(n_estimators=100)

# train the model 
print("Training Random Forest Regressor...")
start = time.time()
regr_rf.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_rf = regr_rf.predict(test_data)
print("Regression Score: ", regr_rf.score(test_data, test_labels))

In [None]:
# SUPPORT VECTOR REGRESSOR MODEL 

# create model
regr_svm = MultiOutputRegressor(LinearSVR())

# train the model 
print("Training SVM Regressor...")
start = time.time()
regr_svm.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_svm = regr_svm.predict(test_data)
print("Regression Score: ", regr_svm.score(test_data, test_labels))

""""
# define dataset
X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2, random_state=1, noise=0.5)
# define base model
model = LinearSVR()
# define the direct multioutput wrapper model
wrapper = MultiOutputRegressor(model)
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(wrapper, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
"""

In [None]:
""""This one is kinda long to run... Commenting for now. """

# MULTIOUPUT REGRESSOR MODEL

# create model
regr_multirf = MultiOutputRegressor(RandomForestRegressor())

# train the model 
print("Training Random Forest Multioutput Regressor...")
start = time.time()
regr_multirf.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_multirf = regr_multirf.predict(test_data)
print("Regression Score: ", regr_rf.score(test_data,test_labels))

In [None]:
# Random Forests 

plt.scatter(y_rf, test_labels, alpha=0.5)
plt.show()

# evaluate the model and collect the scores
n_scores = cross_val_score(regr_rf, train_data, train_labels,  scoring='neg_mean_absolute_error', n_jobs=-1)

# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
# x and y given as array_like objects
import matplotlib.pyplot as plt

plt.scatter(y_multirf, test_labels, alpha=0.5)
plt.show()


In [None]:
# x and y given as array_like objects
import matplotlib.pyplot as plt

plt.scatter(y_KNN, test_labels, alpha=0.5)
plt.show()

In [None]:
# x and y given as array_like objects
import matplotlib.pyplot as plt

plt.scatter(y_LR, test_labels, alpha=0.5)
plt.show()

In [None]:
# x and y given as array_like objects
import matplotlib.pyplot as plt

plt.scatter(y_DT, test_labels, alpha=0.5)
plt.show()

Normalize the data: 

In [None]:
# check scikit-learn version
import sklearn
print(sklearn.__version__)

Create a series of models and calculate the predictions

In [None]:
# Evaluate the model with K-Cross Validation

IMPORT K-FOLD
# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3)

# evaluate the model and collect the scores
n_scores = cross_val_score(model, train_data, train_labels,  scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

# force the scores to be positive
n_scores = absolute(n_scores)
# summarize performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

#model.predict(test_data, test_labels)
