# Spaceship titanic

# 1. Import Libraries/ Data Loading 

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px

from sklearn.preprocessing import LabelEncoder

# Global plot styling for the notebook: default figure size, white-grid
# background and the "flare" colour palette.
sns.set(rc={'figure.figsize': (6, 4)})
sns.set_style('whitegrid')
# set_palette is what actually applies the palette; merely building one with
# color_palette() and discarding the result has no effect.
sns.set_palette(sns.color_palette("flare"))

In [None]:
# Load the train and test CSV files from the local `data/` directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv")
)

# 2. EDA (Exploratory Data Analysis) and Data Preprocessing


The goal of EDA is to understand the main characteristics of the data and identify any patterns, outliers, or other features of the data that are important to know before building a model or making predictions.

### Observations in Train Data
Inspecting the raw table is the first step in EDA: it gives a quick view of the dataset's size, its column types, and how many values are missing.

- There are a total of 14 columns and 8693 rows in the train data.
- The train data contains 119378 non-missing values and 2324 missing values.
- All 12 feature columns have missing values, with CryoSleep having the most (217).
- Transported is the target variable which is only available in the train dataset.

In [None]:
# Preview the first rows of the training set (DataFrame.head shows 5 by default).
train_data.head()

In [None]:
# Report the (rows, columns) dimensions of the training set.
print(f'Shape of train data: {train_data.shape}')

In [None]:
# Size summary of the training set: rows, columns, non-null cells, null cells.
n_rows, n_cols = train_data.shape
n_present = train_data.count().sum()
n_missing = train_data.isna().sum().sum()

print(f'Number of rows in train data: {n_rows}')
print(f'Number of columns in train data: {n_cols}')
print(f'Number of values in train data: {n_present}')
print(f'Number missing values in train data: {n_missing}')

In [None]:
# Null count per training column, most incomplete column first.
print(train_data.isna().sum().sort_values(ascending = False))

The table below shows basic statistics for each numeric variable: count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.

In [None]:
# Descriptive statistics of the training set (describe() summarises the
# numeric columns by default when the frame has mixed dtypes).
train_data.describe()

The pandas-profiling library generates a profile report that lists the type of every column and gives access to quantile-level statistics, descriptions, histograms, and the most frequent and extreme values.

In [None]:
# Build an automated profiling report for the training set.
# NOTE(review): upstream has renamed `pandas_profiling` to `ydata_profiling`;
# confirm which package is installed before running this cell.
from pandas_profiling import ProfileReport

ProfileReport(train_data)

### Observations in Test Data
- There are a total of 13 columns and 4277 rows in the test data.
- The test data contains 54484 non-missing values and 1117 missing values.
- All 12 feature columns have missing values, with FoodCourt having the most (106).

In [None]:
# Preview the first rows of the test set (DataFrame.head shows 5 by default).
test_data.head()

In [None]:
# Report the (rows, columns) dimensions of the test set.
print(f'Shape of test data: {test_data.shape}')

In [None]:
# Size summary of the test set: rows, columns, non-null cells, null cells.
print(f'Number of rows in test data: {test_data.shape[0]}')
print(f'Number of columns in test data: {test_data.shape[1]}')
# count() tallies non-null cells per column; this is the test set, so the
# label must say "test data".
print(f'Number of values in test data: {test_data.count().sum()}')
# isna().sum().sum() counts missing cells, not rows that contain a missing
# value, so the label reports "missing values".
print(f'Number of missing values in test data: {test_data.isna().sum().sum()}')

In [None]:
# Null count per test column, most incomplete column first.
missing_per_column = test_data.isna().sum()
print(missing_per_column.sort_values(ascending=False))

In [None]:
# Descriptive statistics of the test set (describe() summarises the numeric
# columns by default when the frame has mixed dtypes).
test_data.describe()

### Visualization of data

In [None]:
# Encode the target column as integer class labels, then plot how many rows
# fall into each class.
enc = LabelEncoder()
encoded_target = enc.fit_transform(train_data['Transported'])
train_data['Transported'] = encoded_target
sns.countplot(data=train_data, x='Transported')

In [None]:
# Row counts per Destination, split by the Transported target.
sns.countplot(data=train_data, x='Destination', hue='Transported')

In [None]:
# Row counts per HomePlanet, split by the Transported target.
sns.countplot(data=train_data, x='HomePlanet', hue='Transported')

In [None]:
# Row counts per HomePlanet, split by Destination.
sns.countplot(data=train_data, x='HomePlanet', hue='Destination')

### Visualization of missing data

In [None]:
# Use missingno's matrix plot to visualise where values are missing in the
# training set; the returned axes object is given a title.

msno.matrix(train_data).set_title("Train set",fontsize=20)

In [None]:
# Same missing-value matrix for the test set.
msno.matrix(test_data).set_title("Test set",fontsize=20)

### Correlation matrix

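In the first heatmap (seaborn's default colormap), lighter cells indicate a stronger positive correlation, while darker cells indicate a weaker or negative correlation. In the second heatmap (`coolwarm` colormap, scaled from -1 to 1), red cells indicate positive correlation and blue cells indicate negative correlation.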

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True is required because the frame still holds string columns:
# pandas >= 2.0 raises on them instead of silently dropping them.
# NOTE(review): the `numeric_only` keyword needs pandas >= 1.5.
sns.heatmap(train_data.corr(numeric_only=True), annot=True, linewidths=0.5)

In [None]:
# Correlation heatmap over ALL columns: every column is first replaced by
# its integer factor codes so that non-numeric columns can take part.
plt.figure(5, figsize=(25, 10))
factorized = train_data.apply(lambda column: pd.factorize(column)[0])
corr = factorized.corr()
# Boolean upper-triangle mask (diagonal included) hides the mirrored half.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax = sns.heatmap(
    corr,
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    linewidths=.2,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
plt.show()

# 3. Feature Engineering
Feature engineering is the process of using domain knowledge to extract features from raw data that can be used to train machine learning models. It involves transforming raw data into a format that can be easily understood by the model, such as converting text to numerical values. The goal of feature engineering is to create a set of features that are most informative and relevant for the task at hand, which can improve the performance of the model.


In [None]:
# Remove the Name column from both splits.
for frame in (train_data, test_data):
    frame.drop(columns='Name', inplace=True)

In [None]:
# Store the target as 0/1 integers (False -> 0, True -> 1).
# The converted column is assigned back explicitly: calling an in-place
# method on the result of train_data['Transported'] is chained assignment,
# which pandas does not guarantee to write through to the frame and which
# recent versions warn about.
# NOTE(review): astype raises if the column contains NaN — this assumes the
# training target has no missing values.
train_data['Transported'] = train_data['Transported'].astype('int64')

In [None]:
# Split the slash-separated Cabin string into three columns on both splits,
# then discard the original Cabin column.
cabin_parts = ['deck', 'num', 'side']
for frame in (train_data, test_data):
    frame[cabin_parts] = frame['Cabin'].str.split('/', expand=True)
    frame.drop('Cabin', axis=1, inplace=True)

In [None]:
# Add a per-row total over the five spending columns. DataFrame.sum skips
# NaN by default, so a missing amount contributes 0 to the total.
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for frame in (train_data, test_data):
    frame['total_spent'] = frame[col_to_sum].sum(axis=1)

###  Imputing Missing Values


We are using Simple Imputer to fill the na values with the specified strategy.

For the categorical columns (for example 'CryoSleep', 'VIP', 'HomePlanet', 'Destination', and the 'deck', 'num' and 'side' columns split out of 'Cabin') we use the `most_frequent` strategy, as it is categorical data.

For ['Age','RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'] we use the strategy median as it is numeric data.


In [None]:
# Group the training columns by dtype: object/category columns are treated
# as categorical, float64 columns as numeric.
column_dtypes = train_data.dtypes
categorical_cols = [
    name
    for name, dtype in column_dtypes.items()
    if dtype == 'object' or dtype == 'category'
]
numeric_cols = [
    name for name, dtype in column_dtypes.items() if dtype == 'float64'
]

print(f'Categorical cols -- {categorical_cols}')
print(f'Numeric cols -- {numeric_cols}')

In [None]:
# Convert the categorical columns of both splits to pandas' category dtype.
for frame in (train_data, test_data):
    frame[categorical_cols] = frame[categorical_cols].astype('category')

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode the categorical columns with ONE encoder fitted on train and
# test stacked together, so a given category gets the same code in both.

# Remember where train ends instead of hard-coding the row count, so the
# split stays correct if the input files change.
n_train = len(train_data)

oc = OrdinalEncoder()
data_for_encode = pd.concat([train_data, test_data])
data_for_encode[categorical_cols] = data_for_encode[categorical_cols].astype('category')
data_for_encode[categorical_cols] = oc.fit_transform(data_for_encode[categorical_cols])

# Split back by position. .copy() makes each half an independent frame, so
# the in-place drop and column assignments that follow do not act on a slice
# of the combined frame (SettingWithCopy).
train_data = data_for_encode.iloc[:n_train].copy()
test_data = data_for_encode.iloc[n_train:].copy()

del data_for_encode

# Transported is dropped from the test half: it is the target, and test rows
# carry no real value for it.
test_data.drop('Transported', inplace=True, axis=1)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill missing values: most frequent value for the categorical columns,
# median for the numeric columns. Each imputer is fitted on the training set
# only and then applied unchanged to the test set, so both splits are filled
# with the same statistics and nothing is learned from the test data.
ctc = ColumnTransformer([("imp", SimpleImputer(strategy='most_frequent'), categorical_cols)])

train_data[categorical_cols] = ctc.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = ctc.transform(test_data[categorical_cols])

ctn = ColumnTransformer([("imp", SimpleImputer(strategy='median'), numeric_cols)])

train_data[numeric_cols] = ctn.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = ctn.transform(test_data[numeric_cols])

# Forward-fill any gap left in the target. The result is assigned back
# because fillna(method=..., inplace=True) on a column selection is
# deprecated and is not guaranteed to modify the frame.
train_data["Transported"] = train_data["Transported"].ffill()

In [None]:
# Preview the training set after encoding and imputation.
train_data.head()