In [1]:
#data
import numpy as np
import pandas as pd

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.preprocessing import LabelEncoder

# Global plotting defaults for the notebook: 6x4-inch figures, a white grid
# background, and the "flare" palette as the default colour cycle.
# (The former bare `sns.color_palette("flare")` call only built a palette and
# discarded it, so it has been removed.)
sns.set(rc={'figure.figsize': (6, 4)})
sns.set_style('whitegrid')
sns.set_palette("flare")

# Visualization

In [None]:
# Read the train and test splits from the local data directory.
train_path, test_path = "data/train.csv", "data/test.csv"
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Preview the first rows of the training split.
train_data.head(n=5)

In [None]:
# Preview the first rows of the test split.
test_data.head(n=5)

In [None]:
# (rows, columns) of the train split, then of the test split.
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() emitted the report first and then "train=  None".
# Print the label, then let info() produce the report itself.
print("train= ")
train_data.info()

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() emitted the report first and then "test=  None".
# Print the label, then let info() produce the report itself.
print("test= ")
test_data.info()

In [None]:
# Descriptive statistics, transposed so that each described column is a row.
train_data.describe().transpose()

In [None]:
# Replace the Transported labels with integer class codes.
enc = LabelEncoder()
transported_codes = enc.fit_transform(train_data['Transported'])
train_data['Transported'] = transported_codes

# Class balance of the encoded target.
sns.countplot(x='Transported', data=train_data)

In [None]:
# Passenger counts per Destination, split by the Transported value.
sns.countplot(
    x='Destination',
    hue='Transported',
    data=train_data,
)

In [None]:
# Passenger counts per HomePlanet, split by the Transported value.
sns.countplot(
    x='HomePlanet',
    hue='Transported',
    data=train_data,
)

In [None]:
# Passenger counts per HomePlanet, split by Destination.
sns.countplot(
    x='HomePlanet',
    hue='Destination',
    data=train_data,
)

# Visualization of missing data

In [None]:
# Diagnose data sparsity in the train split: print the per-column count of
# missing values, then draw the missingno nullity matrix.
train_missing_counts = train_data.isna().sum()
print(train_missing_counts)

train_matrix_ax = msno.matrix(train_data)
train_matrix_ax.set_title("Train set", fontsize=20)

In [None]:
# Same sparsity diagnosis for the test split: per-column missing counts,
# followed by the missingno nullity matrix.
test_missing_counts = test_data.isna().sum()
print(test_missing_counts)

test_matrix_ax = msno.matrix(test_data)
test_matrix_ax.set_title("Test set", fontsize=20)

In [None]:
## Visualize Correlation
def _encode_for_corr(column):
    """Return a numeric version of ``column`` suitable for ``DataFrame.corr``.

    Numeric columns are returned unchanged so their real magnitudes drive the
    correlation. Non-numeric columns are replaced by integer category codes
    from ``pd.factorize``; factorize marks missing values with ``-1``, which
    is turned back into NaN so missing entries are ignored by ``corr`` instead
    of being counted as an extra category.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    codes, _ = pd.factorize(column)
    return pd.Series(codes, index=column.index, name=column.name).where(codes >= 0)


# Previously every column (numeric ones included) was factorized, which
# replaced numeric values by first-appearance codes and made their
# correlations meaningless.
fig, ax = plt.subplots(figsize=(25, 10))
corr = train_data.apply(_encode_for_corr).corr()
# Hide the upper triangle (and diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, xticklabels=corr.columns, yticklabels=corr.columns,
            annot=True, linewidths=.2, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
plt.show()

In [None]:
# Break the composite PassengerId and Cabin columns into their components so
# each part can be examined on its own; the composite columns are dropped.
#   PassengerId: characters [:4] -> group id, characters [5:] -> passenger number
#   Cabin:       "/"-separated -> deck / num / side
passenger_id = train_data["PassengerId"]
cabin_parts = train_data["Cabin"].str.split("/")

train_data = (
    train_data
    .assign(
        PassengerId_GroupId=passenger_id.str[:4].astype(np.int16),
        PassengerId_PassengerNumber=passenger_id.str[5:].astype(np.int16),
    )
    .drop(columns='PassengerId')
    .assign(
        Cabin_Deck=cabin_parts.str[0],
        Cabin_Num=cabin_parts.str[1],
        Cabin_Side=cabin_parts.str[2],
    )
    .drop(columns='Cabin')
)

In [None]:
# One subplot per plotted column of train_data, drawn as unconnected dots
# (zero line width) over the row index, arranged three panels per row.
dot_style = dict(lw=0, marker=".", markersize=5, color='#B84A62')
train_data.plot(subplots=True, layout=(-1, 3), figsize=(12, 6), **dot_style)
plt.tight_layout()

## Feature Engineering