In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utility import *
import seaborn as sns
# Display options: render up to 50 columns and 50 rows of a DataFrame
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

1. The features in the dataset are continuous. They are as follows:
    - Power consumed by different components
    - Factors influencing power consumption 
    - Time series sampled at 5-minute intervals for 2 vessels, spanning one year. That makes $12 \times 24 \times 365 = 105120$ data points for each vessel.


In [None]:
# Load the raw dataset; row 0 of the CSV supplies the column names
df = pd.read_csv('data/data.csv', header=0)
df.head()  # preview the first rows

In [None]:
# Inspect the dtype of every column (the result is indexed by column name)
df.dtypes

In [None]:
# Restrict the analysis to a single vessel.
# NOTE(review): pick_vessel comes from `utility` (star import); presumably it
# returns the rows of `df` belonging to the named vessel — confirm.
dfv = pick_vessel(df, 'Vessel 1')

In [None]:
# Count the missing entries in each column and show them as a bar chart
missing_values = dfv.isna().sum()
missing_values.plot.bar(figsize=(12, 5), title='Missing Values')

In [None]:
# Per-column missing-value counts, used to decide how each column is imputed
missing_values

In [None]:
# Columns with < 1% missing values are likely to be continuous in time, so
# they are imputed via interpolation. 'Depth (m)' is imputed separately, and
# the 'Start Time', 'End Time' and 'Vessel Name' columns are left out.
non_interpolated = ['Depth (m)', 'Start Time', 'End Time', 'Vessel Name']
col_to_interpolate = dfv.columns.difference(non_interpolated)

In [None]:
# NOTE(review): impute_time_series is defined in `utility`; its return value is
# discarded here, so it presumably fills `dfv` in place — confirm.
impute_time_series(dfv, col_to_interpolate)
dfv.isna().sum()  # re-count the missing values after the imputation step

In [None]:
# Impute the depth column with its median value, as more than 20% of its
# values are missing.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the `dfv['Depth (m)']` selection: that chained
# in-place form raises a FutureWarning in pandas 2.x and leaves `dfv`
# unchanged when Copy-on-Write is enabled.
median_depth = dfv['Depth (m)'].median()
dfv['Depth (m)'] = dfv['Depth (m)'].fillna(median_depth)
dfv.isna().sum()  # verify that no missing values remain

In [None]:
# Preview the first rows of the vessel frame after the imputation steps above
dfv.head()

In [None]:
# Correlation matrix
dfv_sub = dfv.iloc[:,3:]
corr = dfv_sub.corr()

In [None]:
# For every column, keep only the entries whose correlation with that column
# has an absolute value above 0.8 (one Series per column).
high_corr = [
    corr.loc[corr[col_name].abs() > 0.8, col_name]
    for col_name in corr.columns
]

In [None]:
# Display the list of strongly correlated entries, one Series per column
high_corr

In [None]:
# Heatmap of the correlation matrix restricted to strong correlations:
# cells with |r| <= 0.8 are replaced by NaN before plotting.
strong_corr = corr.where(corr.abs() > 0.8)
fig, ax = plt.subplots(figsize=(24, 20))
sns.heatmap(strong_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()

**Comments:**
1. The Diesel Generators' Power is highly correlated with the corresponding Main Engine's Fuel flow rate.
2. Propulsion power is a linear combination of Port Side Propulsion Power and Starboard Side Propulsion Power.
3. Speed through water and Speed over ground are positively correlated with the Propulsion Power.
4. Sea water temperature decreasing as latitude increases makes sense: temperatures are lower as one moves towards the poles.