# Spaceship Titanic

In [None]:
#data
import numpy as np
import pandas as pd

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px

from sklearn.preprocessing import LabelEncoder

# Global plotting defaults for every figure in this notebook:
# 6x4 figure size, white grid background, and the "flare" colour palette.
sns.set(rc={'figure.figsize': (6, 4)})
sns.set_style('whitegrid')
# set_palette accepts the palette name directly; the former stand-alone
# sns.color_palette("flare") call only built a palette and discarded it.
sns.set_palette("flare")

In [None]:
# Load the train and test splits from the local data/ directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

# Observations in Train Data
- There are a total of 14 columns and 8693 rows in the train data.
- The train data contains 119378 non-missing values and 2324 missing values.
- All 12 feature columns have missing values, with CryoSleep having the most (217).
- Transported is the target variable which is only available in the train dataset.

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Report the (rows, columns) shape of the training set.
print(f'Shape of train data: {train_data.shape}')

In [None]:
# Size summary of the training set: row/column counts, then the number of
# non-missing cells and the number of missing cells across the whole frame.
print(f'Number of rows in train data: {len(train_data)}')
print(f'Number of columns in train data: {len(train_data.columns)}')
print(f'Number of values in train data: {train_data.notna().sum().sum()}')
print(f'Number missing values in train data: {train_data.isna().sum().sum()}')

In [None]:
# Missing-value count for each training column, largest first.
print(train_data.isnull().sum().sort_values(ascending=False))

The following cell shows basic descriptive statistics for each numeric variable: count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile, and maximum.

In [None]:
# Descriptive statistics of the training set (DataFrame.describe with its default arguments).
train_data.describe()

# Observations in Test Data
- There are a total of 13 columns and 4277 rows in the test data.
- The test data contains 54484 non-missing values and 1117 missing values.
- All 12 feature columns have missing values, with FoodCourt having the most (106).

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Report the (rows, columns) shape of the test set.
print(f'Shape of test data: {test_data.shape}')

In [None]:
# Size summary of the test set: row/column counts, then the number of
# non-missing cells and the number of missing cells across the whole frame.
print(f'Number of rows in test data: {test_data.shape[0]}')
print(f'Number of columns in test data: {test_data.shape[1]}')
# The label previously said "train data" although the count is taken from test_data.
print(f'Number of values in test data: {test_data.count().sum()}')
# This is the total number of missing cells, not the number of rows containing
# a missing value, so the label now matches the equivalent train-data line.
print(f'Number missing values in test data: {test_data.isna().sum().sum()}')

In [None]:
# Missing-value count for each test column, largest first.
print(test_data.isnull().sum().sort_values(ascending=False))

In [None]:
# Descriptive statistics of the test set (DataFrame.describe with its default arguments).
test_data.describe()

# Visualization of Train data

In [None]:
# Replace the Transported target column with integer labels produced by a
# LabelEncoder, then plot how many rows fall into each encoded class.
enc = LabelEncoder()
train_data['Transported'] = enc.fit_transform(train_data['Transported'])
sns.countplot(data=train_data, x='Transported')

In [None]:
# Row counts per Destination, split by the Transported target.
sns.countplot(data=train_data, x='Destination', hue='Transported')

In [None]:
# Row counts per HomePlanet, split by the Transported target.
sns.countplot(data=train_data, x='HomePlanet', hue='Transported')

In [None]:
# Row counts per HomePlanet, split by Destination.
sns.countplot(data=train_data, x='HomePlanet', hue='Destination')

# Visualization of missing data

In [None]:
# Using Missingno to diagnose data sparsity: draw the missing-value matrix of
# the training set and title the resulting plot "Train set".

msno.matrix(train_data).set_title("Train set",fontsize=20)

In [None]:
# Missingno missing-value matrix of the test set, titled "Test set".
msno.matrix(test_data).set_title("Test set",fontsize=20)

# Correlation matrix

In [None]:
# Interactive heatmap of the pairwise correlation between the numeric (and
# boolean) columns of the training set, with the coefficient printed in each cell.
# The columns are selected explicitly because, since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and instead
# raises when the frame contains strings.
fig = px.imshow(
    train_data.select_dtypes(include=["number", "bool"]).corr(),
    text_auto=True,
    aspect="auto",
    color_continuous_scale="Reds",
)
fig.show()

In [None]:
# Visualize correlation between all columns, including the non-numeric ones.
# Figure number 5 is created explicitly with figsize (25, 10).
plt.figure(5, figsize=(25, 10))
# Replace every column's values with the integer codes returned by
# pd.factorize so that .corr() can be computed over string/object columns too.
# NOTE(review): the codes follow order of first appearance rather than any
# natural ordering, and missing values presumably receive the -1 sentinel —
# confirm this encoding is intended before interpreting the coefficients.
corr = train_data.apply(lambda x: pd.factorize(x)[0]).corr()
# Boolean mask that is True on the upper triangle (diagonal included); it is
# passed to the heatmap so that each pair of columns is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Annotated heatmap on a fixed [-1, 1] colour scale.
ax = sns.heatmap(corr, mask=mask, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, linewidths=.2, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()

##  Data Pre-Processing