# AI Launch Lab - Sea Ice Movement Challenge - Training

Do some more analysis on the data before choosing our final model.

In [1]:
import pandas as pd
import pickle
import os 
pd.options.mode.chained_assignment = None
import datetime
import numpy as np
import os
import zipfile
import modules.ml_pipeline.readdata as mlpp

Load the data from disk and display it

In [2]:
# Extract the zipped dataset into the data/ directory.
with zipfile.ZipFile('data/converted.zip', 'r') as archive:
    archive.extractall('data')

# Read the extracted CSV from disk and lower-case every column name.
input_path = "data/converted.csv"
df = pd.read_csv(input_path).rename(columns=str.lower)

# Preview the frame indexed by buoy id. The result is only displayed, not
# assigned, so `df` keeps its default integer index and its id_buoy column.
df.set_index("id_buoy")

Unnamed: 0_level_0,index,year,month,day,doy,sic_cdr,d2c,ice_thickness,buoy_lat,buoy_lon,buoy_vel_mag,buoy_vel_dir,wind_vel_mag,wind_vel_dir
id_buoy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1906,0,1979,2,18,49,0.990195,522.523298,3.189743,78.007070,-128.549129,1.370671,2.191824,6.711849,3.189490
1913,1,1979,2,18,49,0.966372,412.767669,2.484009,74.498024,-119.750294,0.741408,0.520564,6.851881,3.240164
1914,2,1979,2,18,49,0.996022,362.547379,2.474106,74.003619,-134.786524,1.187695,2.934923,8.896751,3.014921
1918,3,1979,2,18,49,0.982681,381.025629,3.740522,81.019593,-145.578020,0.920127,0.028026,1.496117,3.905953
1906,4,1979,2,19,50,0.990302,521.535334,3.188522,78.002077,-128.560665,1.300527,1.273525,3.338513,2.278041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25560,339472,2019,12,30,364,1.000000,470.197315,0.933125,75.686986,-96.341501,17.168005,3.811734,8.234026,3.676847
44880,339473,2019,12,30,364,1.000000,702.312813,1.517084,86.513431,-29.153877,11.739478,4.628723,6.522125,4.314615
53005,339474,2019,12,30,364,1.000000,360.491321,1.164462,79.286760,-53.579091,8.973035,3.772087,4.630856,3.224141
95020,339475,2019,12,30,364,1.000000,393.799208,2.010032,79.025667,-135.924079,2.365742,1.967356,2.240471,0.523007


In [3]:
# Clean the data by filtering out rows we do not want to train on.
# NOTE(review): this cell does not remove NaN rows or duplicates; add explicit
# dropna()/drop_duplicates() steps if that cleaning is required.

# Keep only rows whose buoy velocity magnitude is non-zero
# (rows where the value is NaN are kept, exactly as before).
df = df[df["buoy_vel_mag"] != 0]

# Remove any rows that have very low ice_thickness (< 0.1); NaN rows are kept.
#
# Background on sea ice concentration: a concentration of 0.5 means that the
# surroundings of the buoy are 50% sea ice and 50% open water. A concentration
# of 1 is full ice cover, while 0.1 means there is barely any ice left at that
# location.
# NOTE(review): the explanation above is about ice *concentration* (presumably
# the sic_cdr column), but the filter is applied to ice_thickness — confirm
# which column was intended.
df = df[~(df["ice_thickness"] < 0.1)]

# Print the dataframe dimensions
print("Dataframe final shape: ", df.shape)
display(df)

Dataframe final shape:  (327042, 15)


Unnamed: 0,index,year,month,day,doy,id_buoy,sic_cdr,d2c,ice_thickness,buoy_lat,buoy_lon,buoy_vel_mag,buoy_vel_dir,wind_vel_mag,wind_vel_dir
0,0,1979,2,18,49,1906,0.990195,522.523298,3.189743,78.007070,-128.549129,1.370671,2.191824,6.711849,3.189490
1,1,1979,2,18,49,1913,0.966372,412.767669,2.484009,74.498024,-119.750294,0.741408,0.520564,6.851881,3.240164
2,2,1979,2,18,49,1914,0.996022,362.547379,2.474106,74.003619,-134.786524,1.187695,2.934923,8.896751,3.014921
3,3,1979,2,18,49,1918,0.982681,381.025629,3.740522,81.019593,-145.578020,0.920127,0.028026,1.496117,3.905953
4,4,1979,2,19,50,1906,0.990302,521.535334,3.188522,78.002077,-128.560665,1.300527,1.273525,3.338513,2.278041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329883,339472,2019,12,30,364,25560,1.000000,470.197315,0.933125,75.686986,-96.341501,17.168005,3.811734,8.234026,3.676847
329884,339473,2019,12,30,364,44880,1.000000,702.312813,1.517084,86.513431,-29.153877,11.739478,4.628723,6.522125,4.314615
329885,339474,2019,12,30,364,53005,1.000000,360.491321,1.164462,79.286760,-53.579091,8.973035,3.772087,4.630856,3.224141
329886,339475,2019,12,30,364,95020,1.000000,393.799208,2.010032,79.025667,-135.924079,2.365742,1.967356,2.240471,0.523007


Define the train/test variables that we will use. 

In [4]:
print("Sea Ice Movement Datasets")

# The two quantities the models predict: buoy velocity magnitude and direction.
target_columns = ["buoy_vel_mag", "buoy_vel_dir"]
training_targets = df[target_columns]

# Note: the month column is kept in the data for now because it could be an
# indicator of weather/season.
print("Target Variables")
display(training_targets)

Sea Ice Movement Datasets
Target Variables


Unnamed: 0,buoy_vel_mag,buoy_vel_dir
0,1.370671,2.191824
1,0.741408,0.520564
2,1.187695,2.934923
3,0.920127,0.028026
4,1.300527,1.273525
...,...,...
329883,17.168005,3.811734
329884,11.739478,4.628723
329885,8.973035,3.772087
329886,2.365742,1.967356


In [5]:
# Build the feature matrix by removing everything that is not a model input:
# the row counter, most date columns (month is kept), sic_cdr, the buoy
# identifier, and the two target columns.
dropped_columns = [
    "index", "year", "day", "sic_cdr", "doy", "id_buoy",
    "buoy_vel_mag", "buoy_vel_dir",
]
training_data = df.drop(columns=dropped_columns)
display(training_data)

Unnamed: 0,month,d2c,ice_thickness,buoy_lat,buoy_lon,wind_vel_mag,wind_vel_dir
0,2,522.523298,3.189743,78.007070,-128.549129,6.711849,3.189490
1,2,412.767669,2.484009,74.498024,-119.750294,6.851881,3.240164
2,2,362.547379,2.474106,74.003619,-134.786524,8.896751,3.014921
3,2,381.025629,3.740522,81.019593,-145.578020,1.496117,3.905953
4,2,521.535334,3.188522,78.002077,-128.560665,3.338513,2.278041
...,...,...,...,...,...,...,...
329883,12,470.197315,0.933125,75.686986,-96.341501,8.234026,3.676847
329884,12,702.312813,1.517084,86.513431,-29.153877,6.522125,4.314615
329885,12,360.491321,1.164462,79.286760,-53.579091,4.630856,3.224141
329886,12,393.799208,2.010032,79.025667,-135.924079,2.240471,0.523007


In [6]:
# Evaluate multioutput regression models
%matplotlib inline
import numpy as np
import time 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

Split the dataset into training and test sets (75% / 25%).

In [7]:
# test_size: what proportion of original data is used for test set.
# random_state pins the shuffle so that the split, and every score computed
# from it, is reproducible after Restart & Run All.
train_data, test_data, train_labels, test_labels = train_test_split(
    training_data, training_targets, test_size=0.25, shuffle=True, random_state=42)

# show the sizes of the training and test sets
print("Training data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)

print("Training labels shape: ", train_labels.shape)
print("Test labels shape: ", test_labels.shape)

Training data shape:  (245281, 7)
Test data shape:  (81761, 7)
Training labels shape:  (245281, 2)
Test labels shape:  (81761, 2)


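Feature scaling is a common requirement for many machine learning estimators: they might behave badly if the individual features are on very different scales or do not more or less look like standard normally distributed data. Here we use `MinMaxScaler`, which rescales every feature to the [0, 1] range (normalization rather than standardization).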

In [8]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Learn the per-feature min/max from the training split only, rescale the
# training split with it, then apply the same fitted scaler to the test split
# so no information from the test data leaks into the scaling.
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# Show the rescaled training features (now a NumPy array).
display(train_data)

array([[0.81818182, 0.42663328, 0.26078594, ..., 0.70311537, 0.22514973,
        0.62747221],
       [0.54545455, 0.35291024, 0.17768981, ..., 0.12515426, 0.2712856 ,
        0.38507636],
       [0.27272727, 0.64608179, 0.41957298, ..., 0.14428325, 0.24669606,
        0.44852449],
       ...,
       [0.90909091, 0.19731674, 0.50370991, ..., 0.1023689 , 0.2326095 ,
        0.68150506],
       [0.09090909, 0.49267473, 0.39238209, ..., 0.67164096, 0.21386761,
        0.09066163],
       [0.90909091, 0.17331403, 0.13033766, ..., 0.54347088, 0.32884335,
        0.49199055]])

Create a series of models and calculate the predictions.

In [9]:
# K-NEAREST NEIGHBOR MODEL

# Regressor that predicts from the 7 nearest training samples.
regr_KNN = KNeighborsRegressor(n_neighbors=7)

# Fit on the training split and report the wall-clock fitting time.
print("Training KNN Regressor...")
start = time.time()
regr_KNN.fit(train_data, train_labels)
elapsed = time.time() - start
print("Model training time: ", elapsed)

# Predict on the held-out split and report the estimator's score() on it.
print("Predicting scores...")
y_KNN = regr_KNN.predict(test_data)
knn_score = regr_KNN.score(test_data, test_labels)
print("Regression Score: ", knn_score)

Training KNN Regressor...
Model training time:  0.5657415390014648
Predicting scores...
Regression Score:  0.6554243171553308




In [10]:
# LINEAR REGRESSION MODEL

# Ordinary least-squares baseline with default settings.
regr_LR = LinearRegression()

# Fit on the training split and report the wall-clock fitting time.
print("Training Linear Regressor...")
start = time.time()
regr_LR.fit(train_data, train_labels)
elapsed = time.time() - start
print("Model training time: ", elapsed)

# Predict on the held-out split and report the estimator's score() on it.
print("Predicting scores...")
y_LR = regr_LR.predict(test_data)
lr_score = regr_LR.score(test_data, test_labels)
print("Regression Score: ", lr_score)

Training Linear Regressor...
Model training time:  0.25689172744750977
Predicting scores...
Regression Score:  0.4706958928312219




In [11]:
# DECISION TREE MODEL

# create model (default hyper-parameters)
regr_DT = DecisionTreeRegressor()

# train the model and time the fit
# (the progress message previously mislabelled this model as a random forest)
print("Training Decision Tree Regressor...")
start = time.time()
regr_DT.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_DT = regr_DT.predict(test_data)
print("Regression Score: ", regr_DT.score(test_data,test_labels))

Training Random Forest Multioutput Regressor...
Model training time:  3.9193105697631836
Predicting scores...
Regression Score:  0.410190800095046




In [12]:
# RANDOM FOREST  MODEL

# Ensemble of 100 trees fitted on both targets at once.
regr_rf = RandomForestRegressor(n_estimators=100)

# Fit on the training split and report the wall-clock fitting time.
print("Training Random Forest Regressor...")
start = time.time()
regr_rf.fit(train_data, train_labels)
elapsed = time.time() - start
print("Model training time: ", elapsed)

# Predict on the held-out split and report the estimator's score() on it.
print("Predicting scores...")
y_rf = regr_rf.predict(test_data)
rf_score = regr_rf.score(test_data, test_labels)
print("Regression Score: ", rf_score)

Training Random Forest Regressor...
Model training time:  241.18301105499268
Predicting scores...
Regression Score:  0.7002731938567859




In [None]:
# SUPPORT VECTOR REGRESSOR MODEL

# LinearSVR fits a single target, so wrap it in MultiOutputRegressor, which
# fits one LinearSVR per target column.
svm = LinearSVR()
regr_svm = MultiOutputRegressor(svm)
# The wrapper used to be bound to `regr_wrapper` while the code below used the
# undefined name `regr_svm` (NameError); keep the old name as an alias.
regr_wrapper = regr_svm

# train the model and time the fit
print("Training Linear SVR Multioutput Regressor...")
start = time.time()
regr_svm.fit(train_data, train_labels)
end = time.time()
print("Model training time: ", end-start)

# Calculate the new predictions and print the regression score:
print("Predicting scores...")
y_svm = regr_svm.predict(test_data)
print("Regression Score: ", regr_svm.score(test_data, test_labels))

# Reference recipe for cross-validating a multi-output LinearSVR on synthetic
# data. It is kept as comments (not executed): it was previously a bare string
# literal that got echoed as the cell output, and it needs imports that this
# notebook does not provide (make_regression, RepeatedKFold, absolute, mean, std).
#
# # define dataset
# X, y = make_regression(n_samples=1000, n_features=10, n_informative=5, n_targets=2, random_state=1, noise=0.5)
# # define base model
# model = LinearSVR()
# # define the direct multioutput wrapper model
# wrapper = MultiOutputRegressor(model)
# # define the evaluation procedure
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate the model and collect the scores
# n_scores = cross_val_score(wrapper, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# # force the scores to be positive
# n_scores = absolute(n_scores)
# # summarize performance
# print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
# MULTIOUPUT REGRESSOR MODEL
#
# This one is kinda long to run, so it is disabled by default. Set the flag
# below to True to train it. (The code used to be wrapped in a string literal,
# which made the cell a no-op that only echoed its own source.)
RUN_MULTIOUTPUT_RF = False

if RUN_MULTIOUTPUT_RF:
    # create model: one independent random forest per target column
    regr_multirf = MultiOutputRegressor(RandomForestRegressor())

    # train the model and time the fit
    print("Training Random Forest Multioutput Regressor...")
    start = time.time()
    regr_multirf.fit(train_data, train_labels)
    end = time.time()
    print("Model training time: ", end-start)

    # Calculate the new predictions and print the regression score.
    # The score is taken from regr_multirf itself; the disabled code scored
    # regr_rf here, which would have reported the wrong model's result.
    print("Predicting scores...")
    y_multirf = regr_multirf.predict(test_data)
    print("Regression Score: ", regr_multirf.score(test_data, test_labels))

Training Random Forest Multioutput Regressor...


In [None]:
# Random Forests

# Predicted vs. actual target values on the held-out split. Both arrays hold
# the two target columns, which are drawn together in one scatter.
plt.scatter(y_rf, test_labels, alpha=0.5)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest: predicted vs. actual")
plt.show()

# Cross-validate the random forest on the training split. cross_val_score
# refits clones of the estimator, so this is expensive for a large forest.
n_scores = cross_val_score(regr_rf, train_data, train_labels,  scoring='neg_mean_absolute_error', n_jobs=-1)

# The scorer returns negated MAE; force the scores to be positive.
# (Uses the np.* functions: bare absolute/mean/std were never imported.)
n_scores = np.abs(n_scores)
# summarize performance as mean (standard deviation) over the folds
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
# Predicted vs. actual values for the multi-output random forest.
import matplotlib.pyplot as plt

# y_multirf only exists if the multi-output random forest has been trained,
# and that training is disabled by default in this notebook. Skip the plot
# instead of failing with a NameError.
if "y_multirf" in globals():
    plt.scatter(y_multirf, test_labels, alpha=0.5)
    plt.show()
else:
    print("Skipping plot: y_multirf is not defined (multi-output random forest was not trained).")


In [None]:
# KNN: predicted values (x) against the true test labels (y).
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_KNN, test_labels, alpha=0.5)
plt.show()

In [None]:
# Linear regression: predicted values (x) against the true test labels (y).
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_LR, test_labels, alpha=0.5)
plt.show()

In [None]:
# Decision tree: predicted values (x) against the true test labels (y).
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y_DT, test_labels, alpha=0.5)
plt.show()

Check the installed scikit-learn version:

In [None]:
# Report which scikit-learn release this notebook is running against.
import sklearn

sklearn_version = sklearn.__version__
print(sklearn_version)

Evaluate a model with repeated k-fold cross-validation

In [None]:
# Evaluate the model with repeated K-fold cross-validation
from sklearn.model_selection import RepeatedKFold

# Estimator to evaluate. `model` was previously undefined; the random forest is
# used because it had the best test-set score among the models fitted above.
# Swap in another fitted-or-unfitted estimator (e.g. regr_KNN) to compare.
model = regr_rf

# define the evaluation procedure: 10 splits x 3 repeats = 30 fits in total.
# This is slow for the random forest (a single fit took about 4 minutes above).
# random_state makes the fold assignment reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# evaluate the model and collect the scores (cross_val_score refits clones of
# the estimator on each fold, so the already-fitted state of `model` is unused)
n_scores = cross_val_score(model, train_data, train_labels,  scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

# The scorer returns negated MAE; force the scores to be positive.
# (Uses the np.* functions: bare absolute/mean/std were never imported.)
n_scores = np.abs(n_scores)
# summarize performance as mean (standard deviation) over all folds
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

# Final hold-out check once a model is chosen (predict() takes features only):
#model.score(test_data, test_labels)
