# AI Launch Lab - Sea Ice Movement Challenge

Import the necessary librairies needed to run the code 

In [1]:
%matplotlib inline
import pickle
import os 
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import numpy as np
import os
import zipfile
import modules.ml_pipeline.readdata as mlpp

Load the data from disk and display it 

In [2]:
# unzip the zip dataset
with zipfile.ZipFile('data/converted.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

# Load the raw data to disk
input_path = "data/converted.csv"
df = pd.read_csv(input_path)

# Convert all column names to lower case and display the dataframe 
df = df.rename(str.lower, axis='columns')
df.set_index("id_buoy")

# Print the dataframe dimensions
print("Dataframe shape: ", df.shape)

Dataframe shape:  (339478, 15)


In [3]:
# Do some manipulations on the data and clean it (remove rows of all NaNs, remove duplicates, etc.)

# Dropping the NA when there is no value on the current columns
print("Before dropping the NaNs :", df.shape)
df = df.dropna(how="all")
print("After dropping the NaNs :", df.shape)

# Remove Duplicates 
df = df.drop_duplicates() 

# Print size of the df after dropping duplicates
print("After dropping the duplicates:", df.shape)

Before dropping the NaNs : (339478, 15)
After dropping the NaNs : (339478, 15)
After dropping the duplicates: (339203, 15)


Do some simple analyses on the data. Distributions, correlation maps, PCA, etc. 


In [4]:
df_summary = mlpp.summary_of_data(df, list(df.columns), ['u_buoy', 'v_buoy'])

print("Summarized data set:")
display(df_summary)

print("Dataset Stats:")
df.describe()

original = df.copy()


number of features: 15
number of targets: 2
labels: ['u_buoy', 'v_buoy']
number of rows: 339203
Summarized data set:


Unnamed: 0,year,month,day,doy,id_buoy,sic_cdr,h_cs2smos,h_piomas,d2c,buoy_lat,buoy_lon,buoy_vel_mag,buoy_vel_dir,wind_vel_mag,wind_vel_dir
0,1979,2,18,49,1906,0.990195,,3.189743,522.523298,78.007070,-128.549129,1.370671,2.191824,6.711849,3.189490
1,1979,2,18,49,1913,0.966372,,2.484009,412.767669,74.498024,-119.750294,0.741408,0.520564,6.851881,3.240164
2,1979,2,18,49,1914,0.996022,,2.474106,362.547379,74.003619,-134.786524,1.187695,2.934923,8.896751,3.014921
3,1979,2,18,49,1918,0.982681,,3.740522,381.025629,81.019593,-145.578020,0.920127,0.028026,1.496117,3.905953
4,1979,2,19,50,1906,0.990302,,3.188522,521.535334,78.002077,-128.560665,1.300527,1.273525,3.338513,2.278041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339473,2019,12,30,364,44880,1.000000,1.414148,1.620020,702.312813,86.513431,-29.153877,11.739478,4.628723,6.522125,4.314615
339474,2019,12,30,364,53005,1.000000,1.039972,1.288953,360.491321,79.286760,-53.579091,8.973035,3.772087,4.630856,3.224141
339475,2019,12,30,364,95020,1.000000,2.059716,1.960349,393.799208,79.025667,-135.924079,2.365742,1.967356,2.240471,0.523007
339476,2019,12,30,364,7750,1.000000,1.411272,1.610893,680.057567,86.582037,-23.247546,11.792136,4.703807,5.904344,4.281085


Dataset Stats:


Visualize the data to analyze existing trends.

In [5]:
# import necessary libraries for dynamic plotting 
import chart_studio.plotly as py
import plotly.graph_objs as go # import plotly graph objects
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

# Jupyter Setup 
init_notebook_mode(connected=True) # allows us to visualize the code in a jupyter notebook

In [6]:
# Normalize the data using min-max Normalization 
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object and scale the data 
scaler = MinMaxScaler() 
columns_normalized = ["month", "buoy_lat", "buoy_lon", "buoy_vel_mag", "buoy_vel_dir","wind_vel_mag","wind_vel_dir","sic_cdr","h_cs2smos", "h_piomas","d2c"]

x = df[columns_normalized].values
x_scaled = scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=columns_normalized, index = df.index)
df[columns_normalized] = df_temp

# Display the normalized data
display(df)

Unnamed: 0,year,month,day,doy,id_buoy,sic_cdr,h_cs2smos,h_piomas,d2c,buoy_lat,buoy_lon,buoy_vel_mag,buoy_vel_dir,wind_vel_mag,wind_vel_dir
0,1979,0.090909,18,49,1906,0.990195,,0.507543,0.466216,0.719088,0.138990,0.018128,0.348841,0.264387,0.507625
1,1979,0.090909,18,49,1913,0.966371,,0.395249,0.368220,0.627196,0.163543,0.009806,0.082851,0.269912,0.515691
2,1979,0.090909,18,49,1914,0.996022,,0.393673,0.323381,0.614249,0.121584,0.015708,0.467109,0.350589,0.479842
3,1979,0.090909,18,49,1918,0.982681,,0.595182,0.339879,0.797977,0.091471,0.012169,0.004461,0.058608,0.621655
4,1979,0.090909,19,50,1906,0.990301,,0.507349,0.465334,0.718957,0.138958,0.017201,0.202688,0.131297,0.362562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339473,2019,1.000000,30,364,44880,1.000000,0.284390,0.257773,0.626741,0.941845,0.416354,0.155264,0.736686,0.256902,0.686697
339474,2019,1.000000,30,364,53005,1.000000,0.208248,0.205095,0.321545,0.752599,0.348195,0.118676,0.600348,0.182285,0.513140
339475,2019,1.000000,30,364,95020,1.000000,0.415758,0.311926,0.351284,0.745762,0.118410,0.031289,0.313115,0.087976,0.083237
339476,2019,1.000000,30,364,7750,1.000000,0.283805,0.256321,0.606871,0.943642,0.432836,0.155961,0.748636,0.232528,0.681360


In [7]:
# Box plots 

import plotly.express as px

# select a random subset of the main dataset
df_subset = df.sample(frac=0.01, replace=True, random_state=1)

# commented the following due to heavy performance cost
fig = px.box(df_subset[columns_normalized])
fig.show()

"""
fig = go.Figure()

for idx, value in enumerate(columns_normalized):
     fig.add_trace(go.Box(x=[]*len(value), y=value, name=color, marker=dict(color=color.lower())))
    
fig.update_layout(yaxis_title="Mean value", xaxis_title="Color channel",
                  title="Mean value vs. Color channel", template="plotly_white")
"""

'\nfig = go.Figure()\n\nfor idx, value in enumerate(columns_normalized):\n     fig.add_trace(go.Box(x=[]*len(value), y=value, name=color, marker=dict(color=color.lower())))\n    \nfig.update_layout(yaxis_title="Mean value", xaxis_title="Color channel",\n                  title="Mean value vs. Color channel", template="plotly_white")\n'

Create a correlation map to visualize any important correlations between the variables. 
"Correlation is defined as the strength of the linear relationship between two variables. A correlation coefficient closer to r=1 indicates that when one variable changes, the other variable changes in the same direction (positive or negative) by the same amount. A similar idea applies as the correlation coefficient approaches r=-1, but in this case, the variables have a nearly perfect inverse correlation, meaning that a change in one variable elicits an opposite change in the other variable."  https://medium.com/blockforce-capital/seeing-in-the-dark-creating-crypto-heat-map-correlations-with-plot-ly-7154cf362f92

In [8]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(z = df_subset[columns_normalized].corr(),
                               x = columns_normalized,
                               y = columns_normalized,
                                colorscale= "ylgn"))

fig.update_layout(title="Sea Ice Data Correlation Map")
                
fig.show()

In [9]:
# Heat Map with datetimes (Does not quite work yet)

"""
fig = go.Figure(data=go.Heatmap(
        z=df[columns_normalized],
        x=df[["year"]],
        y=columns_normalized,
        colorscale='Viridis'))

fig.update_layout(
    title='GitHub commits per day')

fig.show()
"""

'\nfig = go.Figure(data=go.Heatmap(\n        z=df[columns_normalized],\n        x=df[["year"]],\n        y=columns_normalized,\n        colorscale=\'Viridis\'))\n\nfig.update_layout(\n    title=\'GitHub commits per day\')\n\nfig.show()\n'

In [10]:
# Distribution Plots

fig = px.histogram(df_subset[columns_normalized], opacity=.6)
fig.show()

"""
fig = ff.create_distplot(df[columns_normalized], group_labels="day of year")
fig.update_layout(showlegend=False, template="simple_white")
fig.update_layout(title_text="Distributions of Variables in the Sea Ice Dataset")

fig.show()
"""

'\nfig = ff.create_distplot(df[columns_normalized], group_labels="day of year")\nfig.update_layout(showlegend=False, template="simple_white")\nfig.update_layout(title_text="Distributions of Variables in the Sea Ice Dataset")\n\nfig.show()\n'

In [11]:
"""
We could potentially add a backdrop image to a correlation plot in plotly! Leaving this here in case we ever need it...

layout = go.Layout(
   images=[dict( #add custom image to top right corner of graph
        source="main-logo-black.png",
        xref="paper", yref="paper",
        x=1.2, y=1.1,
        sizex=0.32, sizey=0.32,
        xanchor="right", yanchor="bottom")],
  
    title=f'Return Correlation',
    

"""

'\nWe could potentially add a backdrop image to a correlation plot in plotly! Leaving this here in case we ever need it...\n\nlayout = go.Layout(\n   images=[dict( #add custom image to top right corner of graph\n        source="main-logo-black.png",\n        xref="paper", yref="paper",\n        x=1.2, y=1.1,\n        sizex=0.32, sizey=0.32,\n        xanchor="right", yanchor="bottom")],\n  \n    title=f\'Return Correlation\',\n    \n\n'