# AI Launch Lab - Sea Ice Movement Challenge

Import the necessary librairies needed to run the code 

In [1]:
import pickle
import os 
import pandas as pd
pd.options.mode.chained_assignment = None
import datetime
import numpy as np
import os
import ml_pipeline.readdata as mlpp

Load the data from disk and display it 

In [2]:
# Load the raw data to disk
input_path = "Samples DRIFT_DATA_TRAIN.csv"
df = pd.read_csv(input_path)

# Convert all column names to lower case and display the dataframe 
df = df.rename(str.lower, axis='columns')
df.set_index("id_buoy")

# Print the dataframe dimensions
print("Dataframe shape: ", df.shape)

Dataframe shape:  (8, 15)


In [3]:
# Do some manipulations on the data and clean it (remove rows of all NaNs, remove duplicates, etc.)

# Dropping the NA when there is no value on the current columns
print("Before dropping the NaNs :", df.shape)
df = df.dropna(how="all")
print("After dropping the NaNs :", df.shape)

# Remove Duplicates 
df = df.drop_duplicates() 

# Print size of the df after dropping duplicates
print("After dropping the duplicates:", df.shape)

Before dropping the NaNs : (8, 15)
After dropping the NaNs : (8, 15)
After dropping the duplicates: (8, 15)


Do some simple analyses on the data. Distributions, correlation maps, PCA, etc. 


In [4]:
df_summary = mlpp.summary_of_data(df, list(df.columns), ['u_buoy', 'v_buoy'])

print("Summarized data set:")
display(df_summary)

print("Dataset Stats:")
df.describe()

original = df.copy()


number of features: 15
number of targets: 2
labels: ['u_buoy', 'v_buoy']
number of rows: 8
Summarized data set:


Unnamed: 0,year,month,day,doy,x_ease,y_ease,u_buoy,v_buoy,id_buoy,u_era5,v_era5,sic_cdr,h_cs2smos,h_piomas,d2c
0,1979,2,18,49,147.506958,138.582672,-0.797554,1.11474,1906,-6.704156,-0.32126,0.990195,,3.189743,522.523298
1,1979,2,18,49,146.834778,120.50988,0.6432,0.368754,1913,-6.81863,-0.674205,0.966372,,2.484009,412.767669
2,1979,2,18,49,130.993561,129.623672,-1.16242,0.243717,1914,-8.825469,1.123955,0.996022,,2.474106,362.547379
3,1979,2,18,49,147.524719,157.382492,0.919766,0.025784,1918,-1.079951,-1.03541,0.982681,,3.740522,381.025629
4,1979,2,19,50,147.470963,138.599823,0.38094,1.243485,1906,-2.169171,2.537787,0.990302,,3.188522,521.535334
5,1979,2,19,50,180.349854,118.013527,1.387772,-0.253256,1911,2.68091,-0.295979,1.0,,2.574216,475.418633
6,1979,2,19,50,146.83049,120.509583,3.025445,1.076415,1913,0.551862,3.960332,1.0,,2.490376,412.761318
7,1979,2,19,50,130.940811,129.619873,1.409495,-0.04115,1914,-1.85992,1.140724,0.979121,,2.480513,361.805709


Dataset Stats:


Visualize the data to analyze existing trends.

In [5]:
# import necessary libraries for dynamic plotting 
import chart_studio.plotly as py
import plotly.graph_objs as go # import plotly graph objects
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

# Jupyter Setup 
init_notebook_mode(connected=True) # allows us to visualize the code in a jupyter notebook

In [6]:
# Normalize the data using min-max Normalization 
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object and scale the data 
scaler = MinMaxScaler() 
columns_normalized = ["doy", "x_ease", "y_ease", "u_buoy", "v_buoy","u_era5","v_era5","sic_cdr","h_cs2smos", "h_piomas","d2c"]
x = df[columns_normalized].values
x_scaled = scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=columns_normalized, index = df.index)
df[columns_normalized] = df_temp

# Display the normalized data
display(df)


All-NaN slice encountered


All-NaN slice encountered



Unnamed: 0,year,month,day,doy,x_ease,y_ease,u_buoy,v_buoy,id_buoy,u_era5,v_era5,sic_cdr,h_cs2smos,h_piomas,d2c
0,1979,2,18,0.0,0.335286,0.522471,0.087125,0.913983,1906,0.18436,0.142952,0.708423,,0.565088,1.0
1,1979,2,18,0.0,0.321681,0.063409,0.431155,0.415576,1913,0.174411,0.072302,0.0,,0.007819,0.31709
2,1979,2,18,0.0,0.001068,0.294906,0.0,0.332037,1914,0.0,0.432241,0.881715,,0.0,0.004615
3,1979,2,18,0.0,0.335645,1.0,0.497195,0.186432,1918,0.67315,0.0,0.484997,,1.0,0.119588
4,1979,2,19,1.0,0.334557,0.522907,0.368531,1.0,1906,0.578488,0.715249,0.711596,,0.564124,0.993853
5,1979,2,19,1.0,1.0,0.0,0.608948,0.0,1911,1.0,0.148012,1.0,,0.07905,0.70691
6,1979,2,19,1.0,0.321595,0.063402,1.0,0.888377,1913,0.814968,1.0,0.999999,,0.012847,0.317051
7,1979,2,19,1.0,0.0,0.29481,0.614135,0.141712,1914,0.605364,0.435598,0.37911,,0.005059,0.0


In [13]:
# Box plots 

import plotly.express as px

fig = px.box(df[columns_normalized], points="all")
fig.show()

"""
fig = go.Figure()

for idx, value in enumerate(columns_normalized):
     fig.add_trace(go.Box(x=[]*len(value), y=value, name=color, marker=dict(color=color.lower())))
    
fig.update_layout(yaxis_title="Mean value", xaxis_title="Color channel",
                  title="Mean value vs. Color channel", template="plotly_white")
"""

'\nfig = go.Figure()\n\nfor idx, value in enumerate(columns_normalized):\n     fig.add_trace(go.Box(x=[]*len(value), y=value, name=color, marker=dict(color=color.lower())))\n    \nfig.update_layout(yaxis_title="Mean value", xaxis_title="Color channel",\n                  title="Mean value vs. Color channel", template="plotly_white")\n'

Create a correlation map to visualize any important correlations between the variables. 
"Correlation is defined as the strength of the linear relationship between two variables. A correlation coefficient closer to r=1 indicates that when one variable changes, the other variable changes in the same direction (positive or negative) by the same amount. A similar idea applies as the correlation coefficient approaches r=-1, but in this case, the variables have a nearly perfect inverse correlation, meaning that a change in one variable elicits an opposite change in the other variable."  https://medium.com/blockforce-capital/seeing-in-the-dark-creating-crypto-heat-map-correlations-with-plot-ly-7154cf362f92

In [8]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(z = df[columns_normalized].corr(),
                               x = columns_normalized,
                               y = columns_normalized,
                                colorscale= "ylgn"))

fig.update_layout(title="Sea Ice Data Correlation Map")
                
fig.show()

In [9]:
# Heat Map with datetimes (Does not quite work yet)

"""
fig = go.Figure(data=go.Heatmap(
        z=df[columns_normalized],
        x=df[["year"]],
        y=columns_normalized,
        colorscale='Viridis'))

fig.update_layout(
    title='GitHub commits per day')

fig.show()
"""

'\nfig = go.Figure(data=go.Heatmap(\n        z=df[columns_normalized],\n        x=df[["year"]],\n        y=columns_normalized,\n        colorscale=\'Viridis\'))\n\nfig.update_layout(\n    title=\'GitHub commits per day\')\n\nfig.show()\n'

In [10]:
"""
We could potentially add a backdrop image to a correlation plot in plotly! Leaving this here in case we ever need it...

layout = go.Layout(
   images=[dict( #add custom image to top right corner of graph
        source="main-logo-black.png",
        xref="paper", yref="paper",
        x=1.2, y=1.1,
        sizex=0.32, sizey=0.32,
        xanchor="right", yanchor="bottom")],
  
    title=f'Return Correlation',
    

"""

'\nWe could potentially add a backdrop image to a correlation plot in plotly! Leaving this here in case we ever need it...\n\nlayout = go.Layout(\n   images=[dict( #add custom image to top right corner of graph\n        source="main-logo-black.png",\n        xref="paper", yref="paper",\n        x=1.2, y=1.1,\n        sizex=0.32, sizey=0.32,\n        xanchor="right", yanchor="bottom")],\n  \n    title=f\'Return Correlation\',\n    \n\n'

In [16]:
# Distribution Plots

fig = px.histogram(df[columns_normalized], marginal="rug", opacity=.6)
fig.show()

"""
fig = ff.create_distplot(df[columns_normalized], group_labels="day of year")
fig.update_layout(showlegend=False, template="simple_white")
fig.update_layout(title_text="Distributions of Variables in the Sea Ice Dataset")

fig.show()
"""

'\nfig = ff.create_distplot(df[columns_normalized], group_labels="day of year")\nfig.update_layout(showlegend=False, template="simple_white")\nfig.update_layout(title_text="Distributions of Variables in the Sea Ice Dataset")\n\nfig.show()\n'

In [12]:
columns_normalized

['doy',
 'x_ease',
 'y_ease',
 'u_buoy',
 'v_buoy',
 'u_era5',
 'v_era5',
 'sic_cdr',
 'h_cs2smos',
 'h_piomas',
 'd2c']