# AI Launch Lab - Sea Ice Movement Challenge - PCA and t-SNE analysis

Do some more analysis on the data before choosing our final model.

In [1]:
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import os 
pd.options.mode.chained_assignment = None
import datetime
import numpy as np
import os
import zipfile
import modules.ml_pipeline.readdata as mlpp

Load the data from disk and display it

In [2]:
# Extract the zipped dataset into the data directory
with zipfile.ZipFile('data/data-sea-ice.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

# Load the raw training data from disk
input_path = "data/DRIFT_DATA_TRAIN.csv"
df = pd.read_csv(input_path)

# Convert all column names to lower case.
# "id_buoy" is deliberately kept as a regular column rather than the index:
# the previous bare df.set_index("id_buoy") discarded its result (no effect),
# and the feature-selection cell drops "id_buoy" by column name.
df = df.rename(str.lower, axis='columns')

# Print the dataframe dimensions
print("Dataframe shape: ", df.shape)

Dataframe shape:  (339478, 15)


In [3]:
# Clean the data: drop rows with missing targets, remove duplicate rows,
# and impute the remaining missing values.

# Drop every row where either target column (u_buoy, v_buoy) is missing
print("Before dropping the NaNs :", df.shape)
df = df.dropna(how="any", subset=["u_buoy", "v_buoy"])
print("After dropping the NaNs :", df.shape)

# Remove exact duplicate rows
df = df.drop_duplicates()

# Fill the remaining missing values with the per-column mean.
# numeric_only=True restricts the mean to numeric columns so this keeps
# working if a non-numeric column is ever present.
# NOTE(review): the means are computed before the train/test split, so rows
# that end up in the test set contribute to the imputed values.
df = df.fillna(df.mean(numeric_only=True))

# Print size of the df after dropping duplicates (imputation does not change the shape)
print("After dropping the duplicates:", df.shape)

Before dropping the NaNs : (339478, 15)
After dropping the NaNs : (339478, 15)
After dropping the duplicates: (339203, 15)


In [4]:
# Build the feature matrix by removing the date columns (year, day, doy),
# the buoy identifier, and the two target columns (u_buoy, v_buoy).
training_data = df.drop(
    columns=["year", "day", "doy", "id_buoy", "u_buoy", "v_buoy"]
)

# Show the full cleaned frame (df), not the reduced feature matrix
display(df)

Unnamed: 0,year,month,day,doy,x_ease,y_ease,u_buoy,v_buoy,id_buoy,u_era5,v_era5,sic_cdr,h_cs2smos,h_piomas,d2c
0,1979,2,18,49,147.506958,138.582672,-0.797554,1.114740,1906,-6.704156,-0.321260,0.990195,1.774046,3.189743,522.523298
1,1979,2,18,49,146.834778,120.509880,0.643200,0.368754,1913,-6.818630,-0.674205,0.966372,1.774046,2.484009,412.767669
2,1979,2,18,49,130.993561,129.623672,-1.162420,0.243717,1914,-8.825469,1.123955,0.996022,1.774046,2.474106,362.547379
3,1979,2,18,49,147.524719,157.382492,0.919766,0.025784,1918,-1.079951,-1.035410,0.982681,1.774046,3.740522,381.025629
4,1979,2,19,50,147.470963,138.599823,0.380940,1.243485,1906,-2.169171,2.537787,0.990302,1.774046,3.188522,521.535334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339473,2019,12,30,364,193.232056,172.742004,-0.981225,-11.698400,44880,-2.526544,-6.012877,1.000000,1.414148,1.620020,702.312813
339474,2019,12,30,364,208.421234,142.049896,-7.247925,-5.289890,53005,-4.615093,-0.381765,1.000000,1.039972,1.288953,360.491321
339475,2019,12,30,364,145.264023,146.109741,-0.913761,2.182150,95020,1.940967,1.119087,1.000000,2.059716,1.960349,393.799208
339476,2019,12,30,364,193.921402,174.408707,-0.101372,-11.791700,7750,-2.468425,-5.363596,1.000000,1.411272,1.610893,680.057567


In [5]:
print("Sea Ice Movement Datasets")

# The regression targets are the two buoy velocity columns
training_targets = df.loc[:, ["u_buoy", "v_buoy"]]

# Note on the features: "month" stays in training_data for now because it
# could be an indicator of weather/season
print("Target Variables")
display(training_targets)

Sea Ice Movement Datasets
Target Variables


Unnamed: 0,u_buoy,v_buoy
0,-0.797554,1.114740
1,0.643200,0.368754
2,-1.162420,0.243717
3,0.919766,0.025784
4,0.380940,1.243485
...,...,...
339473,-0.981225,-11.698400
339474,-7.247925,-5.289890
339475,-0.913761,2.182150
339476,-0.101372,-11.791700


Split the data into a Training and Test set 

In [6]:
# Hold out 25% of the rows as the test set (test_size is the test proportion).
# shuffle=True randomizes the row order before splitting; random_state pins
# that shuffle so the split is identical on every Restart & Run All.
train_data, test_data, train_labels, test_labels = train_test_split(
    training_data, training_targets, test_size=0.25, shuffle=True, random_state=42)

# show the sizes of the training and test sets
print("Training data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)

Training data shape:  (254402, 9)
Test data shape:  (84801, 9)


## PCA to Speed up Machine Learning Algorithms
(Taken from https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60)

Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data, especially if the features were measured on different scales.

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not look more or less like standard normally distributed data (zero mean and unit variance).

Standardize the data:

In [7]:
# StandardScaler (and PCA) are already imported in the first cell, so they
# are not re-imported here.
scaler = StandardScaler()

# Fit on the training set only, so the scaling statistics come from training rows alone.
scaler.fit(train_data)

# Apply the same fitted transform to both the training set and the test set.
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)

# display the standardized training data
display(train_data)

array([[ 1.23031761e+00, -1.08873463e+00,  6.99113093e-02, ...,
         2.04385353e+00, -2.94410800e-01, -1.12616491e+00],
       [ 9.40664076e-01,  1.28102400e+00,  2.42684875e-01, ...,
        -2.42845952e+00, -9.73318229e-01,  1.15599880e+00],
       [ 3.61357016e-01,  1.68343286e-01, -5.75596488e-01, ...,
        -1.51224505e-04,  8.78071404e-03,  2.04933547e+00],
       ...,
       [-7.97257103e-01, -3.57784357e-01,  2.70097861e+00, ...,
        -1.51224505e-04, -1.64220498e+00, -1.58311851e+00],
       [-1.66621769e+00,  7.39694147e-01, -1.57656278e+00, ...,
        -2.35577889e+00, -3.93770949e-01, -1.33694129e+00],
       [ 1.51997114e+00, -8.72035474e-01, -1.43431689e-01, ...,
        -1.51224505e-04,  1.07857995e+00, -5.15347139e-01]])

In [8]:
# Keep a fixed number of principal components (3).
# NOTE: this does not pick the number of components from a retained-variance
# target. To keep 95% of the variance instead, scikit-learn accepts a
# fraction, e.g. PCA(n_components=0.95) -- the code that names the component
# columns would then need to adapt to the resulting number of components.
pca = PCA(n_components=3) 

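Fit PCA on the training set only, then apply the same fitted projection to both the training set and the test set, so that no information from the test set influences the transform.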

In [9]:
# Learn the principal axes from the (standardized) training set
pca.fit(train_data)

# Report how many components were kept and the variance ratio of each one
print("Number of components: ", pca.n_components_)
print("PCA variance ratio: ", pca.explained_variance_ratio_)

# Project both splits onto the fitted principal components
train_data, test_data = pca.transform(train_data), pca.transform(test_data)

Number of components:  3
PCA variance ratio:  [0.19299088 0.15966511 0.14868213]


In [10]:
# Recreate the principal component dataframe from the PCA-projected training rows
principalDf = pd.DataFrame(data = train_data, columns = ['principal component 1', 
                                                         'principal component 2','principal component 3'])

# Concatenate the components with the targets of the *training* rows only.
# principalDf is built from a NumPy array, so it has a fresh 0..n-1 index,
# whereas train_labels still carries the shuffled original row labels.
# pd.concat(axis=1) aligns on index, so the labels' index is reset to pair
# each projected row with its own target by position. (Concatenating the full
# training_targets frame would mismatch rows and introduce NaNs.)
finalDf = pd.concat([principalDf, train_labels.reset_index(drop=True)], axis = 1)