# AI Launch Lab - Sea Ice Movement Challenge - PCA and Training

Do some more analysis on the data before choosing our final model.

In [1]:
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import os 
pd.options.mode.chained_assignment = None
import datetime
import numpy as np
import os
import zipfile
import modules.ml_pipeline.readdata as mlpp

Load the data from disk and display it

In [2]:
# unzip the zip dataset
with zipfile.ZipFile('data/converted.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

# Load the raw data to disk
input_path = "data/converted.csv"
df = pd.read_csv(input_path)

# Convert all column names to lower case and display the dataframe 
df = df.rename(str.lower, axis='columns')
df.set_index("id_buoy")

# Print the dataframe dimensions
print("Dataframe shape: ", df.shape)

Dataframe shape:  (339478, 15)


In [3]:
# Do some manipulations on the data and clean it (remove rows of all NaNs, remove duplicates, etc.)

# Dropping the NA when there is no value on the current columns
print("Before dropping the NaNs :", df.shape)
df = df.dropna(how="any", subset=["u_buoy", "v_buoy"])
print("After dropping the NaNs :", df.shape)

# Remove Duplicates 
df = df.drop_duplicates() 

# Fill the missing values with the median value 
df = df.fillna(df.mean())

# Print size of the df after dropping duplicates
print("After dropping the duplicates:", df.shape)

Before dropping the NaNs : (339478, 15)


KeyError: ['u_buoy', 'v_buoy']

In [None]:
# Drop the time related columns 
training_data = df.drop(["year","day", "doy", "id_buoy","u_buoy", "v_buoy"], axis = 1)
display(df)

In [None]:
print("Sea Ice Movement Datasets")

training_targets = df[["u_buoy", "v_buoy"]]

# We will leave the month for now because it could be an indicator of weather/season 
print("Target Variables")
display(training_targets)

Split the data into a Training and Test set 

In [None]:
# test_size: what proportion of original data is used for test set
train_data, test_data, train_labels, test_labels = train_test_split(
    training_data, training_targets, test_size= 0.25, shuffle=True)

# show the sizes of the training and test sets
print("Training data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)

## PCA to Speed up Machine Learning Algorithms
(Taken from https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60)

Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data, especially, if it was measured on different scales.

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual feature do not more or less look like standard normally distributed data

Normalize the data: 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()

# Fit on training set only.
scaler.fit(train_data)

# Apply transform to both the training set and the test set.
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)

# display the normalized data
display(train_data)

In [None]:
# Choose the minimum percentage of the variables such that 95% of the variance in the dataset is retained
pca = PCA(n_components=3) 

Fit PCA on training set. Note: you are fitting PCA on the training set only

In [None]:
pca.fit(train_data)

# print the number of components we are left with
print("Number of components: ", pca.n_components_)
print("PCA variance ratio: ", pca.explained_variance_ratio_)

# Transform the data according to the number of principal components
train_data = pca.transform(train_data)
test_data = pca.transform(test_data)

In [None]:
# Recreate the principal component dataframe: 
principalDf = pd.DataFrame(data = train_data, columns = ['principal component 1', 
                                                         'principal component 2','principal component 3'])

# Concatenate the new dataset with the targets
finalDf = pd.concat([principalDf, training_targets], axis = 1)