# AI Launch Lab - Sea Ice Movement Challenge

Import the necessary librairies needed to run the code 

In [2]:
%matplotlib inline
import pickle
import os 
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import numpy as np
import os
import zipfile
import modules.ml_pipeline.readdata as mlpp

Load the data from disk and display it 

In [9]:
# unzip the zip dataset
with zipfile.ZipFile('data/data-sea-ice.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

# Load the raw data to disk
input_path = "data/DRIFT_DATA_TRAIN.csv"
df = pd.read_csv(input_path)

# Convert all column names to lower case and display the dataframe 
df = df.rename(str.lower, axis='columns')
df.set_index("id_buoy")

# Print the dataframe dimensions
print("Dataframe shape: ", df.shape)

Dataframe shape:  (339478, 15)


In [10]:
# Do some manipulations on the data and clean it (remove rows of all NaNs, remove duplicates, etc.)

# Dropping the NA when there is no value on the current columns
print("Before dropping the NaNs :", df.shape)
df = df.dropna(how="all")
print("After dropping the NaNs :", df.shape)

# Remove Duplicates 
df = df.drop_duplicates() 

# Print size of the df after dropping duplicates
print("After dropping the duplicates:", df.shape)

Before dropping the NaNs : (339478, 15)
After dropping the NaNs : (339478, 15)
After dropping the duplicates: (339203, 15)


Do some simple analyses on the data. Distributions, correlation maps, PCA, etc. 


In [4]:
df_summary = mlpp.summary_of_data(df, list(df.columns), ['u_buoy', 'v_buoy'])

print("Summarized data set:")
display(df_summary)

print("Dataset Stats:")
df.describe()

original = df.copy()


number of features: 15
number of targets: 2
labels: ['u_buoy', 'v_buoy']
number of rows: 339203
Summarized data set:


Unnamed: 0,year,month,day,doy,x_ease,y_ease,u_buoy,v_buoy,id_buoy,u_era5,v_era5,sic_cdr,h_cs2smos,h_piomas,d2c
0,1979,2,18,49,147.506958,138.582672,-0.797554,1.114740,1906,-6.704156,-0.321260,0.990195,,3.189743,522.523298
1,1979,2,18,49,146.834778,120.509880,0.643200,0.368754,1913,-6.818630,-0.674205,0.966372,,2.484009,412.767669
2,1979,2,18,49,130.993561,129.623672,-1.162420,0.243717,1914,-8.825469,1.123955,0.996022,,2.474106,362.547379
3,1979,2,18,49,147.524719,157.382492,0.919766,0.025784,1918,-1.079951,-1.035410,0.982681,,3.740522,381.025629
4,1979,2,19,50,147.470963,138.599823,0.380940,1.243485,1906,-2.169171,2.537787,0.990302,,3.188522,521.535334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339473,2019,12,30,364,193.232056,172.742004,-0.981225,-11.698400,44880,-2.526544,-6.012877,1.000000,1.414148,1.620020,702.312813
339474,2019,12,30,364,208.421234,142.049896,-7.247925,-5.289890,53005,-4.615093,-0.381765,1.000000,1.039972,1.288953,360.491321
339475,2019,12,30,364,145.264023,146.109741,-0.913761,2.182150,95020,1.940967,1.119087,1.000000,2.059716,1.960349,393.799208
339476,2019,12,30,364,193.921402,174.408707,-0.101372,-11.791700,7750,-2.468425,-5.363596,1.000000,1.411272,1.610893,680.057567


Dataset Stats:


Visualize the data to analyze existing trends.

In [11]:
# import necessary libraries for dynamic plotting 
import chart_studio.plotly as py
import plotly.graph_objs as go # import plotly graph objects
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

# Jupyter Setup 
init_notebook_mode(connected=True) # allows us to visualize the code in a jupyter notebook

In [12]:
# Normalize the data using min-max Normalization 
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object and scale the data 
scaler = MinMaxScaler() 
columns_normalized = ["doy", "x_ease", "y_ease", "u_buoy", "v_buoy","u_era5","v_era5","sic_cdr","h_cs2smos", "h_piomas","d2c"]
x = df[columns_normalized].values
x_scaled = scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=columns_normalized, index = df.index)
df[columns_normalized] = df_temp

# Display the normalized data
display(df)

Unnamed: 0,year,month,day,doy,x_ease,y_ease,u_buoy,v_buoy,id_buoy,u_era5,v_era5,sic_cdr,h_cs2smos,h_piomas,d2c
0,1979,2,18,0.132231,0.450990,0.419101,0.516395,0.501371,1906,0.399561,0.460963,0.990195,,0.507543,0.466216
1,1979,2,18,0.132231,0.448192,0.344633,0.526465,0.495862,1913,0.396769,0.452757,0.966371,,0.395249,0.368220
2,1979,2,18,0.132231,0.382248,0.382186,0.513845,0.494939,1914,0.347817,0.494566,0.996022,,0.393673,0.323381
3,1979,2,18,0.132231,0.451064,0.496565,0.528398,0.493330,1918,0.536749,0.444359,0.982681,,0.595182,0.339879
4,1979,2,19,0.134986,0.450840,0.419172,0.524632,0.502322,1906,0.510181,0.527439,0.990301,,0.507349,0.465334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339473,2019,12,30,1.000000,0.641335,0.559853,0.515112,0.406754,44880,0.501463,0.328627,1.000000,0.284390,0.257773,0.626741
339474,2019,12,30,1.000000,0.704564,0.433388,0.471313,0.454077,53005,0.450518,0.459557,1.000000,0.208248,0.205095,0.321545
339475,2019,12,30,1.000000,0.441653,0.450116,0.515583,0.509253,95020,0.610437,0.494453,1.000000,0.415758,0.311926,0.351284
339476,2019,12,30,1.000000,0.644204,0.566721,0.521261,0.406065,7750,0.502881,0.343724,1.000000,0.283805,0.256321,0.606871


In [13]:
# Box plots 

import plotly.express as px

# select a random subset of the main dataset
df_subset = df.sample(frac=0.01, replace=True, random_state=1)

# commented the following due to heavy performance cost
fig = px.box(df_subset[columns_normalized])
fig.show()

"""
fig = go.Figure()

for idx, value in enumerate(columns_normalized):
     fig.add_trace(go.Box(x=[]*len(value), y=value, name=color, marker=dict(color=color.lower())))
    
fig.update_layout(yaxis_title="Mean value", xaxis_title="Color channel",
                  title="Mean value vs. Color channel", template="plotly_white")
"""

'\nfig = go.Figure()\n\nfor idx, value in enumerate(columns_normalized):\n     fig.add_trace(go.Box(x=[]*len(value), y=value, name=color, marker=dict(color=color.lower())))\n    \nfig.update_layout(yaxis_title="Mean value", xaxis_title="Color channel",\n                  title="Mean value vs. Color channel", template="plotly_white")\n'

Create a correlation map to visualize any important correlations between the variables. 
"Correlation is defined as the strength of the linear relationship between two variables. A correlation coefficient closer to r=1 indicates that when one variable changes, the other variable changes in the same direction (positive or negative) by the same amount. A similar idea applies as the correlation coefficient approaches r=-1, but in this case, the variables have a nearly perfect inverse correlation, meaning that a change in one variable elicits an opposite change in the other variable."  https://medium.com/blockforce-capital/seeing-in-the-dark-creating-crypto-heat-map-correlations-with-plot-ly-7154cf362f92

In [14]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(z = df_subset[columns_normalized].corr(),
                               x = columns_normalized,
                               y = columns_normalized,
                                colorscale= "ylgn"))

fig.update_layout(title="Sea Ice Data Correlation Map")
                
fig.show()

In [15]:
# Heat Map with datetimes (Does not quite work yet)

"""
fig = go.Figure(data=go.Heatmap(
        z=df[columns_normalized],
        x=df[["year"]],
        y=columns_normalized,
        colorscale='Viridis'))

fig.update_layout(
    title='GitHub commits per day')

fig.show()
"""

'\nfig = go.Figure(data=go.Heatmap(\n        z=df[columns_normalized],\n        x=df[["year"]],\n        y=columns_normalized,\n        colorscale=\'Viridis\'))\n\nfig.update_layout(\n    title=\'GitHub commits per day\')\n\nfig.show()\n'

In [16]:
"""
We could potentially add a backdrop image to a correlation plot in plotly! Leaving this here in case we ever need it...

layout = go.Layout(
   images=[dict( #add custom image to top right corner of graph
        source="main-logo-black.png",
        xref="paper", yref="paper",
        x=1.2, y=1.1,
        sizex=0.32, sizey=0.32,
        xanchor="right", yanchor="bottom")],
  
    title=f'Return Correlation',
    

"""

'\nWe could potentially add a backdrop image to a correlation plot in plotly! Leaving this here in case we ever need it...\n\nlayout = go.Layout(\n   images=[dict( #add custom image to top right corner of graph\n        source="main-logo-black.png",\n        xref="paper", yref="paper",\n        x=1.2, y=1.1,\n        sizex=0.32, sizey=0.32,\n        xanchor="right", yanchor="bottom")],\n  \n    title=f\'Return Correlation\',\n    \n\n'

In [17]:
# Distribution Plots

fig = px.histogram(df_subset[columns_normalized], marginal="rug", opacity=.6)
fig.show()

"""
fig = ff.create_distplot(df[columns_normalized], group_labels="day of year")
fig.update_layout(showlegend=False, template="simple_white")
fig.update_layout(title_text="Distributions of Variables in the Sea Ice Dataset")

fig.show()
"""

'\nfig = ff.create_distplot(df[columns_normalized], group_labels="day of year")\nfig.update_layout(showlegend=False, template="simple_white")\nfig.update_layout(title_text="Distributions of Variables in the Sea Ice Dataset")\n\nfig.show()\n'

In [12]:
columns_normalized

['doy',
 'x_ease',
 'y_ease',
 'u_buoy',
 'v_buoy',
 'u_era5',
 'v_era5',
 'sic_cdr',
 'h_cs2smos',
 'h_piomas',
 'd2c']