# Spaceship Titanic

# LogBook

In [None]:
'''
 7 Oct 2022   >>>   Project Start
 7 Oct 2022   >>>   Individual Features Analysis
 8 Oct 2022   >>>   Data Wrangling
11 Oct 2022   >>>   EDA
14 Oct 2022   >>>   Data Modeling
15 Oct 2022   >>>   Asthetics and Proyect End
'''
# Author: Andres Montes de Oca
# GitHub: https://github.com/AndresMontesDeOca

# Load Data and Info

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set(style='white', context='notebook', palette='pastel')

# Load the training and test sets from the local Data folder
train = pd.read_csv('Data/train.csv')
# Work on a copy: the feature engineering below adds columns to `data` in place,
# and a plain `data = train` would silently modify the raw training frame as well
data = train.copy()
test = pd.read_csv('Data/test.csv')

# info() prints its report itself and returns None, so it is not wrapped in print()
data.info()
display(data.head())

# Individual Features Check

## Transported (Target)

In [None]:
# Class counts of the target
target_counts = data['Transported'].value_counts()
print(target_counts, '\n')

# Pie chart with the share of each target class
fig, ax = plt.subplots(1, 1, figsize=(5, 2))
pie_ax = target_counts.plot(kind='pie', autopct='%.1f%%',
                            textprops={'fontsize': 8}, startangle=90)
pie_ax.set_title('Transported Rate')
ax.set_ylabel('')
plt.show()

# Boolean mask of the transported passengers, kept for later use
mask_transported = data['Transported'] == True

# Transported is well balanced

## PassengerId (Feature)

In [None]:
passenger_ids = data['PassengerId']
print('Nulls:', passenger_ids.isnull().sum(), '\n')
print(passenger_ids.head())

# The GroupId has to be split from this feature; there are no nulls

## GroupSize (Created Feature)

In [None]:
# GroupId is the part of PassengerId before the underscore
data['GroupId'] = data['PassengerId'].str.split('_', expand=True)[0]

# Number of passengers sharing each GroupId, broadcast back to every row.
# transform('size') keeps the row order and index of `data`, so no intermediate
# Series and no merge are needed to attach the group size.
data['GroupSize'] = data.groupby('GroupId')['GroupId'].transform('size')

print(data['GroupSize'].value_counts())

# Plot: number of passengers per group size
fig, ax = plt.subplots(1, 1, figsize=(5, 3))
sns.countplot(y=data['GroupSize'], palette='Set2')
ax.set_ylabel('')
ax.set_xlabel('Passenger Count')

plt.title('GroupSize')
plt.show()

# Few passengers travel in big groups

## IsAlone (Created Feature)

In [None]:
# IsAlone (solo travellers): True when the passenger is the only member of the group.
# GroupSize could also be kept as it is; that may improve the results.
data.loc[:, 'IsAlone'] = data['GroupSize'] == 1
# data.rename(columns={'Group_size':'IsAlone'}, inplace=True)

print(data['IsAlone'].value_counts(), '\n')

# Left: share of solo travellers. Right: counts split by the target.
fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

alone_share = data['IsAlone'].value_counts(dropna=False).sort_index(ascending=False)
alone_share.plot(kind='pie', autopct='%.1f%%', ax=pie_ax,
                 cmap='Set2', startangle=90, textprops={'fontsize': 8})
pie_ax.set_ylabel('')
pie_ax.set_title('On-Board Rate')

sns.countplot(y=data['IsAlone'], hue=data['Transported'], ax=count_ax,
              order=[True, False], hue_order=[True, False])
count_ax.set_ylabel('')
count_ax.set_title('Count and Comparison w/Target')

fig.suptitle('Solo Travellers\n')
plt.show()

# There are more solo travellers, but they have a lower Transported rate

## HomePlanet (Feature)

In [None]:
homeplanet_counts = data['HomePlanet'].value_counts(dropna=False)
print(homeplanet_counts, '\n')

fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

# PieChart (nulls are shown as their own slice)
homeplanet_counts.plot(kind='pie', autopct='%.1f%%', ax=pie_ax,
                       startangle=90, cmap='Set2', textprops={'fontsize': 8})
pie_ax.set_ylabel('')
pie_ax.set_title('Distribution')

# CountPlot, most frequent planet first, split by the target
planet_order = data['HomePlanet'].value_counts().index
sns.countplot(y=data['HomePlanet'], order=planet_order, ax=count_ax,
              hue=data['Transported'], hue_order=[True, False])
count_ax.set_ylabel('')
count_ax.set_xlabel('')
count_ax.set_title('Count and Comparison w/Target')

fig.suptitle('HomePlanet')
plt.show()

# There are 201 nulls
# Most passengers come from Earth
# Passengers from Europa have more chances to be Transported

## CryoSleep (Feature)

In [None]:
# Counts per CryoSleep value (nulls included), ordered True / False
cryo_counts = data['CryoSleep'].value_counts(dropna=False).sort_index(ascending=False)
print(cryo_counts, '\n')

fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

# PieChart
cryo_counts.plot(kind='pie', cmap='Set2', autopct='%.1f%%', ax=pie_ax,
                 startangle=90, textprops={'fontsize': 8})
pie_ax.set_ylabel('')
pie_ax.set_title('Distribution')

# CountPlot split by the target
sns.countplot(y=data['CryoSleep'], hue=data['Transported'], ax=count_ax, hue_order=[True, False])
count_ax.set_ylabel('')
count_ax.set_xlabel('')
count_ax.set_title('Count and Comparison w/Target')

fig.suptitle('CryoSleep')
plt.show()

# Just 1/3 of the passengers are in CryoSleep
# There are 217 nulls
# Strong correlation with the target variable

#### Correlation with VIP

In [None]:
# Restrict to the VIP passengers only
vip_passengers = data[data.VIP == True]
print('Total VIP Passengers:', vip_passengers.shape[0])

# Pie chart of the CryoSleep values among VIPs
fig, ax = plt.subplots(1, 1, figsize=(5, 2))
vip_cryo_counts = vip_passengers.CryoSleep.value_counts().sort_index(ascending=False)
vip_cryo_counts.plot(kind='pie', autopct='%.1f%%',
                     cmap='Set2', startangle=90, textprops={'fontsize': 8})
ax.set_ylabel('')
plt.suptitle('CryoSleep')
plt.show()

# There are few VIP passengers
# Strong correlation: VIP passengers don't want to be put to sleep

#### Correlation with Age

In [None]:
# One line per CryoSleep value: number of passengers at each age
fig, ax = plt.subplots(1, 1, figsize=(10, 3))

cryo_ages = data[data.CryoSleep == True].Age
awake_ages = data[data.CryoSleep == False].Age
cryo_ages.value_counts().sort_index().plot(title='Count Passengers by Age',
                                           color='blue', label='Cryo Passengers')
awake_ages.value_counts().sort_index().plot(color='red', label='Non-Cryo Passengers')

plt.legend()
plt.show()

# Kids (<18) and seniors (>65) have a similar rate
# The gap between both gradually shrinks for adults (18 to 65 years)

#### Correlation with GroupSize

In [None]:
# One line per CryoSleep value: number of passengers for each group size.
# Both calls draw on the same axes, so they carry the same (correct) title;
# the first one used to read 'blue Passengers by GroupSize'.
plot_title = 'Count Passengers by GroupSize'
fig, ax = plt.subplots(1, 1, figsize=(10, 3))
data[data.CryoSleep==True].GroupSize.value_counts().sort_index().plot(title=plot_title,
                                                                      color='blue', label='Cryo Passengers')
data[data.CryoSleep==False].GroupSize.value_counts().sort_index().plot(title=plot_title,
                                                                      color='red', label='Non-Cryo Passengers')

plt.yticks(np.arange(0, 5000, 1000))
plt.legend()
plt.show()

## Cabin (Feature)

In [None]:
# Cabin: the cabin number where the passenger is staying, in the form deck/num/side,
# where side is either P (Port) or S (Starboard), i.e. left / right.
cabin_missing = data.Cabin.isnull()
print('Nulls Count Cabin:', data[cabin_missing].shape[0], '\n')
print(data['Cabin'].head())

#### FillNa (GroupId Approach)

In [None]:
# Lookup table GroupId -> Cabin used to complete missing values.
# Rows without a Cabin are dropped first, so each group is represented by its
# first *known* cabin. Keeping the first row of the group regardless would leave
# the whole group unresolved whenever that row is the one with the missing value.
GroupId_DIM = data.dropna(subset=['Cabin']).drop_duplicates(subset='GroupId').set_index('GroupId')['Cabin']
display(GroupId_DIM.head())

# Fill each missing Cabin with the cabin known for the same GroupId.
# map() returns a Series aligned with the rows of `data`, so no merge is needed;
# groups without any known cabin map to NaN and stay null.
# (The former `data.Cabin.fillna('GroupId_DIM')` call was removed: its result was
# discarded and it used the literal string instead of the lookup table.)
data['Cabin'] = data['Cabin'].fillna(data['GroupId'].map(GroupId_DIM))

print('Nulls Count Cabin:', data[data.Cabin.isnull()].shape[0], '\n')

# There will be another fillna() for each individual feature after the split (Deck, Num, Side)

#### Split (Cabin)

In [None]:
# Split Cabin ("deck/num/side") into three columns
col_names = ['Deck', 'Num', 'Side']
data_Cabin = data['Cabin'].str.split('/', expand=True).set_axis(col_names, axis=1)

# Num comes out of the split as text. Convert the whole column in one step to the
# nullable integer dtype: missing cabins stay <NA> and the column gets a real
# integer dtype instead of an object column holding Python ints.
data_Cabin['Num'] = pd.to_numeric(data_Cabin['Num']).astype('Int64')

display(data_Cabin)

# Append the three new columns to the main frame (rows are aligned on the index)
data = pd.concat([data, data_Cabin], axis=1)

## Deck (Created Feature)

In [None]:
# Working but ugly
deck = data['Deck']
print(deck.value_counts(dropna=False))

# Horizontal bars, most populated deck first
fig, ax = plt.subplots(1, 1, figsize=(8, 3))
deck_order = deck.value_counts().index
sns.countplot(y=deck, palette='Set2', order=deck_order)
ax.set_ylabel('')
ax.set_xlabel('')

plt.title('Count by Deck')
plt.show()

# 199 nulls

## Side (Created Feature)

In [None]:
print(data.Side.value_counts(dropna=False), '\n')

# Plot (the figure used to be bound to the misspelled name `fix`, which left
# `fig` pointing at the figure of the previous cell)
fig, ax = plt.subplots(1, 2, figsize=(10, 2))

# PieChart
data['Side'].value_counts(dropna=False).sort_index(ascending=False). \
                    plot(kind='pie', autopct='%.1f%%', ax=ax[0], startangle=270, cmap='Set2', textprops={'fontsize':8})
ax[0].set_ylabel('')
ax[0].set_title('Distribution')

# CountPlot
sns.countplot(y=data['Side'], hue=data['Transported'], ax=ax[1], hue_order=[True, False])
ax[1].set_ylabel('')
ax[1].set_xlabel('')
ax[1].set_title('Count and Comparison w/Target')

# Figure title, as in the other feature cells
fig.suptitle('Side')
plt.show()

# Same number of nulls as Deck, maybe they are related?
# Rows where Deck, Side and Num are missing at the same time
print('Nulls for Deck, Side and Num are the same:', \
      data[data.Deck.isnull() & data.Side.isnull() & data.Num.isnull()].shape[0])

## Num (Created Feature)

In [None]:
print('Nulls:', data['Num'].isnull().sum(), '\n')

# Checking boat symmetry: passengers per cabin on deck F, port side vs starboard side
on_deck_F = data.Deck == 'F'
cabin_keys = ['Side', 'Deck', 'Num']
Cabin_P_F = data[(data.Side == 'P') & on_deck_F].groupby(cabin_keys).size().to_frame('Count')
Cabin_S_F = data[(data.Side == 'S') & on_deck_F].groupby(cabin_keys).size().to_frame('Count')

for cabin_table in (Cabin_P_F, Cabin_S_F):
    display(cabin_table)

## Destination (Feature)

In [None]:
print(data['Destination'].value_counts(dropna=False), '\n')

# Plots
fig, ax = plt.subplots(1, 2, figsize=(12, 2))

# PieChart (nulls are shown as their own slice)
data['Destination'].value_counts(dropna=False).plot(kind='pie', autopct='%.1f%%', ax=ax[0], startangle=90, \
                                                    cmap='Set2', textprops={'fontsize':8})
ax[0].set_ylabel('')
ax[0].set_title('Distribution')

# CountPlot, most frequent destination first.
# hue_order is fixed to [True, False] like in the other comparison plots, so that
# Transported=True always gets the same colour across the notebook.
sns.countplot(y=data['Destination'], order=data['Destination'].value_counts().index, ax=ax[1],
              hue=data['Transported'], hue_order=[True, False])
ax[1].set_ylabel('')
ax[1].set_title('Count and Comparison w/Target')

plt.suptitle('Destination')
plt.show()

# Similar to HomePlanet. Most passengers are heading to TRAPPIST-1e, 182 nulls
# 55 Cancri e looks like it has a better Transported rate

## Age (Feature)

In [None]:
age = data['Age']
skewness = age.skew()
print('Nulls:', age.isnull().sum())
print("Skewness: %f" % skewness, '\n') # Right
print(age.describe())

# Histogram with 5-year wide bins
fig, ax = plt.subplots(1, 1, figsize=(18, 4))
sns.histplot(data=age, binwidth=5)

plt.title('Age')
plt.show()

# Should this one be discretized?

## VIP (Feature)

In [None]:
vip_counts = data['VIP'].value_counts(dropna=False)
print(vip_counts.sort_index(ascending=False), '\n')

fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

# Left: distribution (nulls included)
vip_counts.plot(kind='pie', autopct='%.1f%%', startangle=90, cmap='Set2',
                ax=pie_ax, textprops={'fontsize': 8})
pie_ax.set_ylabel('')
pie_ax.set_title('Distribution')

# Right: counts split by the target
sns.countplot(y=data['VIP'], hue=data['Transported'],
              order=[True, False], hue_order=[True, False], ax=count_ax)
count_ax.set_ylabel('')
count_ax.set_title('Count and Comparison w/Target')
fig.suptitle('VIP')
plt.show()

# Few VIPs and few nulls, nothing important here

## Name (Feature)

In [None]:
names = data['Name']
print('Nulls:', names.isnull().sum(), '\n')
print(names.value_counts())

# 200 nulls, a couple of names are repeated

## RoomService (Feature)

In [None]:
room_service = data['RoomService']
print('Nulls:', room_service.isnull().sum(), '\n')
print(room_service.describe())

# Two stacked boxplots: the top one keeps the outliers, the bottom one hides them
fig, ax = plt.subplots(2, 1, figsize=(10, 2))
for axis, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=room_service, showfliers=with_outliers, ax=axis)
    axis.set_ylabel('')

fig.suptitle('RoomService with and without Outliers')
plt.show()

# TODO: xticks should be added to ax[0]

## FoodCourt (Feature)

In [None]:
food_court = data['FoodCourt']
print('Nulls:', food_court.isnull().sum(), '\n')
print(food_court.describe())

# Two stacked boxplots: the top one keeps the outliers, the bottom one hides them
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for axis, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=food_court, showfliers=with_outliers, ax=axis)
    axis.set_ylabel('')

fig.suptitle('FoodCourt with and without Outliers')
plt.show()

# TODO: xticks should be added to ax[0]

## ShoppingMall (Feature)

In [None]:
shopping_mall = data['ShoppingMall']
print('Nulls:', shopping_mall.isnull().sum(), '\n')
print(shopping_mall.describe())

# Two stacked boxplots: the top one keeps the outliers, the bottom one hides them
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for axis, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=shopping_mall, showfliers=with_outliers, ax=axis)
    axis.set_ylabel('')

fig.suptitle('ShoppingMall with and without Outliers')
plt.show()

# TODO: xticks should be added to ax[0]

## Spa (Feature)

In [None]:
spa = data['Spa']
print('Nulls:', spa.isnull().sum(), '\n')
print(spa.describe())

# Two stacked boxplots: the top one keeps the outliers, the bottom one hides them
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for axis, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=spa, showfliers=with_outliers, ax=axis)
    axis.set_ylabel('')

fig.suptitle('Spa with and without Outliers')
plt.show()

# TODO: xticks should be added to ax[0]

## VRDeck (Feature)

In [None]:
vr_deck = data['VRDeck']
print('Nulls:', vr_deck.isnull().sum(), '\n')
print(vr_deck.describe())

# Two stacked boxplots: the top one keeps the outliers, the bottom one hides them
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for axis, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=vr_deck, showfliers=with_outliers, ax=axis)
    axis.set_ylabel('')

fig.suptitle('VRDeck with and without Outliers')
plt.show()

# TODO: xticks should be added to ax[0]

## Spa (Feature)

In [None]:
spa_spend = data['Spa']
print('Nulls:', spa_spend.isnull().sum(), '\n')
print(spa_spend.describe())

# Two stacked boxplots: the top one keeps the outliers, the bottom one hides them
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for axis, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=spa_spend, showfliers=with_outliers, ax=axis)
    axis.set_ylabel('')

fig.suptitle('Spa with and without Outliers')
plt.show()

# TODO: xticks should be added to ax[0]
# Weird values, probably we will ignore this feature

## Continuous Features Comparison (PairPlot)

In [None]:
# float64 columns with their missing values replaced by 0, plus the target used for colouring
float_features = data.select_dtypes(['float64']).fillna(0)
data_plot = pd.concat([float_features, data['Transported']], axis=1)

# Pairwise scatter plots of the continuous features
sns.pairplot(data=data_plot, hue='Transported', hue_order=[True, False])
plt.show()

# Passengers with no expenses on Spa or VRDeck are mostly in Cryo (of course, they are sleeping)
# Creating a single new feature for the expenses could probably improve the model performance

# Correcting and Completing Nulls

## HomePlanet (Feature)

In [None]:
# Number of passengers whose HomePlanet is missing
homeplanet_missing = data.HomePlanet.isnull()
print('Nulls Count:', data[homeplanet_missing].shape[0], '\n')

#### Groups Approach

In [None]:
# Passengers with an unknown HomePlanet who are not travelling alone
missing_in_group = data.HomePlanet.isnull() & ~data.IsAlone
preview_cols = ['PassengerId', 'HomePlanet', 'IsAlone']

display('Passengers in groups', data[missing_in_group][preview_cols].head())
print('In groups with other passengers', data[missing_in_group].shape[0])

# HomePlanet can be taken from the GroupId for almost 50% of the NA. Let's check if this is a valid approach

In [None]:
# Confirm that all members of each group departed from the same HomePlanet:
# one row per (GroupId, HomePlanet) pair that occurs in the data
data_grp_groupid_homeplanet = data.groupby(['GroupId', 'HomePlanet']).size().to_frame('Count').reset_index()
display(data_grp_groupid_homeplanet.head())

# If no GroupId appears twice in that table, no group mixes HomePlanets
print('Is GroupId unique?:', data_grp_groupid_homeplanet.GroupId.is_unique)

# Fill passengers in groups with the HomePlanet known for the same GroupId.
# The lookup is done per group: a forward fill over the frame sorted by GroupId
# would also carry the *previous group's* planet into solo travellers and into
# groups where nobody has a known HomePlanet.
group_homeplanet = data.groupby('GroupId')['HomePlanet'].transform('first')  # first non-null value of the group
data['HomePlanet'] = data['HomePlanet'].fillna(group_homeplanet)
print('Nulls Count HomePlanet after group fill:', data.HomePlanet.isnull().sum())

# Passengers whose group gives no information fall back to the most frequent HomePlanet
data['HomePlanet'] = data['HomePlanet'].fillna(data['HomePlanet'].mode()[0])
print('Nulls Count HomePlanet:', data.HomePlanet.isnull().sum())

#### Mode Approach

In [None]:
# Filling Solo Travelers with the mode # Not Necesary, nulls already 0
# data.loc[data.HomePlanet.isnull() & data.IsAlone, 'HomePlanet'] = data.HomePlanet.mode()[0]

# print('Nulls Count HomePlanet:', data.HomePlanet.isnull().sum())

## CryoSleep (Feature)

In [None]:
# Number of passengers whose CryoSleep is missing
cryo_missing = data.CryoSleep.isnull()
print('Nulls Count CryoSleep:', data[cryo_missing].shape[0], '\n')

#### Amenities Approach

In [None]:
# Passengers who spent money on board cannot be in Cryo.
# A missing amount compares as False, so only known positive expenses count.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'VRDeck']
mask_amenities_expenses = (data[spend_cols] > 0).any(axis=1)

cryo_unknown = data.CryoSleep.isnull()
data.loc[mask_amenities_expenses & cryo_unknown, 'CryoSleep'] = False

print('Nulls Count CryoSleep:', data[data.CryoSleep.isnull()].shape[0], '\n')

#### Transported Approach

In [None]:
# Set CryoSleep to True for the passengers with unknown CryoSleep who were Transported.
# NOTE(review): this is target leakage. `Transported` is the label the models are
# trained to predict, and here it is used to build one of their input features.
# The rule can only be applied to rows where the label is known, so it cannot be
# reproduced on data to predict on - consider dropping this step and letting the
# mode fill below handle these rows.

# Rows affected by the rule
display(data.loc[data.CryoSleep.isnull() & data.Transported][['CryoSleep', 'Transported']])
data.loc[data.CryoSleep.isnull() & data.Transported, 'CryoSleep'] = True

print('\nNulls Count CryoSleep:', data[data.CryoSleep.isnull()].shape[0], '\n')

#### VIP Approach

In [None]:
# Set CryoSleep to False for VIPs (there are only a couple of them among the remaining nulls)
print(data[data.CryoSleep.isnull()].VIP.value_counts())

# VIP still contains nulls at this point (they are filled further below), so it is
# compared explicitly against True instead of being used as a boolean mask directly:
# a missing VIP value then simply counts as "not VIP".
is_vip = data.VIP == True
data.loc[data.CryoSleep.isnull() & is_vip, 'CryoSleep'] = False

print('\nNulls Count CryoSleep:', data[data.CryoSleep.isnull()].shape[0], '\n')

#### Mode Approach

In [None]:
# For the rest: most frequent value.
# The filled column is assigned back to `data`. Calling fillna(inplace=True) on
# data['CryoSleep'] acts on an intermediate object and is not guaranteed to
# update the frame itself.
data['CryoSleep'] = data['CryoSleep'].fillna(data['CryoSleep'].mode()[0])

print('\nNulls Count CryoSleep:', data[data.CryoSleep.isnull()].shape[0], '\n')

## Destination (Feature)

In [None]:
# Number of passengers whose Destination is missing
print('Nulls Count Destination:', data['Destination'].isnull().sum(), '\n')

#### Mode Approach

In [None]:
# Most frequent destination for the missing values.
# The filled column is assigned back to `data`. Calling fillna(inplace=True) on
# data['Destination'] acts on an intermediate object and is not guaranteed to
# update the frame itself.
data['Destination'] = data['Destination'].fillna(data['Destination'].mode()[0])
print('Nulls Count Destination:', data[data.Destination.isnull()].shape[0], '\n')

## Age (Feature)

In [None]:
# Number of passengers whose Age is missing
print('Nulls Count Age:', data['Age'].isnull().sum(), '\n')

#### Median Approach

In [None]:
# Unnecessary, we will deal with Continuous Features later on

# data['Age'].fillna(data['Age'].median(), inplace=True)

# print('Nulls Count Age:', data[data.Age.isnull()].shape[0], '\n')

## VIP (Feature)

In [None]:
# Number of passengers whose VIP flag is missing
print('Nulls Count VIP:', data['VIP'].isnull().sum(), '\n')

#### Mode Approach

In [None]:
# Most frequent VIP value for the missing values.
# The filled column is assigned back to `data`. Calling fillna(inplace=True) on
# data['VIP'] acts on an intermediate object and is not guaranteed to update
# the frame itself.
data['VIP'] = data['VIP'].fillna(data['VIP'].mode()[0])
print('Nulls Count VIP:', data[data.VIP.isnull()].shape[0], '\n')

## Age & Amenities (Continuous Features)

In [None]:
# Continuous features handled together: the amenities used by the model plus Age (Age?)
continuous_cols = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'VRDeck',
    'Age',
]

# Missing values per continuous column
nulls_per_column = data[continuous_cols].isnull().sum()
print('Nulls Count Amenities:', nulls_per_column)

#### Median Approach

In [None]:
# Medians are 0, I'm not sure if this is a good approach

# Fill every continuous column with its own median in one step: fillna() with a
# Series of medians matches them to the columns by name. The result is assigned
# back to `data`, because fillna(inplace=True) on a single selected column acts
# on an intermediate object and is not guaranteed to update the frame itself.
medians = data[continuous_cols].median()
data[continuous_cols] = data[continuous_cols].fillna(medians)


print('Nulls Count Amenities:', data[continuous_cols].isnull().sum())

## Deck, Num, Side (Created Features)

In [None]:
# Number of passengers whose Deck is missing
print('Nulls Count Deck:', data['Deck'].isnull().sum(), '\n')

#### DropNA

In [None]:
# Remove the rows whose Deck is still unknown.
# `subset` is given as a list: a bare string is not accepted by older pandas releases.
data.dropna(subset=['Deck'], inplace=True)

print('Nulls Count Deck:', data[data.Deck.isnull()].shape[0], '\n')

# Pending: try better approaches

# Feature Selection and Conversion

In [None]:
# Candidate features: original columns first, then the engineered ones
X_cols = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'VRDeck',
    'GroupId', 'GroupSize', 'IsAlone',
    'Deck', 'Num', 'Side',
]

y = data['Transported']
X = data[X_cols]

print(X.info())
print('Target size:', y.shape)

## Nominal Features 

In [None]:
# Unordered categorical features; the index is reset so the encoded blocks line up later
nominal_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side', 'Deck']
X_nominals = data.loc[:, nominal_cols].reset_index(drop=True).astype('category')
print(X_nominals.info())

# In doubt: IsAlone (correlated with GroupSize), Deck (nominal?), Num (ordinal?)

#### Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Dense one-hot encoding that drops the first level of every feature.
# The keyword asking for a dense result was renamed from `sparse` to `sparse_output`
# in scikit-learn; the new name is tried first and the old one is the fallback, so
# the cell runs on both sides of the rename.
encoder_options = dict(handle_unknown='infrequent_if_exist', drop='first')
try:
    oh_encoder = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    # scikit-learn release that only knows the previous keyword name
    oh_encoder = OneHotEncoder(sparse=False, **encoder_options)

X_nominals_encoded = oh_encoder.fit_transform(X_nominals)
X_nominals_encoded = pd.DataFrame\
    (X_nominals_encoded, columns=oh_encoder.get_feature_names_out()) # DF and column names

display(X_nominals_encoded.head())

# Is one-hot encoding OK for Deck, with a cardinality of 9?

## Ordinal Features

In [None]:
# Ordered categorical features; the index is reset so the encoded blocks line up later
ordinal_cols = ['GroupSize']

X_ordinals = data.loc[:, ordinal_cols].reset_index(drop=True).astype('category')
print(X_ordinals.info())

#### Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Categories are mapped to ordinal codes; unknown categories are encoded as NaN
ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

ordinal_codes = ord_encoder.fit_transform(X_ordinals)
# Back to a DataFrame with the encoder's column names
X_ordinals_encoded = pd.DataFrame(ordinal_codes, columns=ord_encoder.get_feature_names_out())

display(X_ordinals_encoded.head())

## Quantitative Features

In [None]:
# Continuous features; the index is reset so the blocks line up when concatenated
X_continuous = data.loc[:, continuous_cols].reset_index(drop=True)
print(X_continuous.info())

#### Encoding

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Discretize every continuous feature into (up to) 10 quantile bins, encoded as ordinal codes
kbins_encoder = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')

binned_values = kbins_encoder.fit_transform(X_continuous)
# Back to a DataFrame with the encoder's column names
X_continuous_encoded = pd.DataFrame(binned_values, columns=kbins_encoder.get_feature_names_out())
display(X_continuous_encoded.head())

# Not sure if this is right

## Target Feature

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the target as integer labels; the new Series gets a fresh default index
lb_encoder = LabelEncoder()

target_labels = lb_encoder.fit_transform(data['Transported'])
y_encoded = pd.Series(target_labels, name='Transported')

# Correlation

In [None]:
# Feature matrix: encoded nominal and ordinal blocks plus the raw continuous columns
feature_blocks = [X_nominals_encoded, X_ordinals_encoded, X_continuous]
X_encoded = pd.concat(feature_blocks, axis=1)

# Features and target side by side, only for the correlation matrix
data_tmp = pd.concat([X_encoded, y_encoded], axis=1)
correlations = data_tmp.corr()

sns.heatmap(correlations, vmin=-1, vmax=1, cmap='RdYlGn')
plt.show()

# Probably some decks can be grouped together?

# Modeling

#### Pre-Processing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

# Folds: cross-validation scheme for the hyper-parameter searches that pass `cv=cv`
# (5 stratified folds, repeated 3 times)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3)

#### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Split first, so that the scaler used for the evaluation never sees the hold-out rows.
# Fitting the scaler on all rows before splitting would let statistics of the test
# part leak into the training features.
Xtrain, Xtest, ytrain, ytest = train_test_split(X_encoded, y_encoded)
Xtrain, Xtest = Xtrain.copy(), Xtest.copy()  # independent frames, safe to modify

# Mean / std come from the training part only and are applied to both parts
scaler = StandardScaler().fit(Xtrain[continuous_cols])
Xtrain[continuous_cols] = scaler.transform(Xtrain[continuous_cols])
Xtest[continuous_cols] = scaler.transform(Xtest[continuous_cols])

# The complete feature matrix is standardized on its own statistics; it is only
# used to refit the final model on all rows
X_encoded[continuous_cols] = StandardScaler().fit_transform(X_encoded[continuous_cols]) # Comment for skipping Normalization

## Lazy Predict

In [None]:
# pip install lazypredict

import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score

# Fit the library's default set of classifiers and collect their scores in one table
lazy_clf = LazyClassifier()
models, predictions = lazy_clf.fit(Xtrain, Xtest, ytrain, ytest)
display(models)


# Quick look at the performance of various models
# Pipelines pending

## LGBMClassifier

In [None]:
import lightgbm as lgb

# Hyper
# 'rf' is left out of the boosting types: LightGBM's random-forest mode needs
# bagging (or feature sub-sampling) to be configured, which the default estimator
# searched here does not do, so every 'rf' candidate would fail to fit.
boosting_type = ['gbdt', 'dart', 'goss']
num_leaves = np.arange(50, 150, 1)
learning_rate = np.arange(.01, .1, .01)
param_grid = dict(boosting_type=boosting_type, num_leaves=num_leaves, learning_rate=learning_rate)

# Model: randomized search over the grid, scored with the shared CV scheme
model_lgbm = RandomizedSearchCV(estimator=lgb.LGBMClassifier(), param_distributions=param_grid, cv=cv)
model_lgbm.fit(Xtrain, ytrain)
ypred = model_lgbm.predict(Xtest)

# Print Scores
print(classification_report(ytest, ypred), '\n')
print('Best Score:', model_lgbm.best_score_)
print('Best HyperParameters:', model_lgbm.best_params_, '\n')

# Confusion Matrix on the hold-out split
fig, ax = plt.subplots(1, 1, figsize=(6, 4))
sns.heatmap(confusion_matrix(ytest, ypred), annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter space: 1 to 30 neighbours, both weighting schemes
n_neighbors_options = np.arange(1, 31, 1)
weights = ['uniform', 'distance']
param_grid = {'n_neighbors': n_neighbors_options, 'weights': weights}

# Model: randomized search scored with the shared CV scheme
knn_search = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                                param_distributions=param_grid, cv=cv)
model_knn = knn_search
model_knn.fit(Xtrain, ytrain)
ypred = model_knn.predict(Xtest)

# Scores on the hold-out split, then the search results
print(classification_report(ytest, ypred), '\n')
print('Best Score:', model_knn.best_score_)
print('Best HyperParameters:', model_knn.best_params_, '\n')

# Confusion Matrix
# fig, ax = plt.subplots(1, 1, figsize=(4, 4))
# sns.heatmap(confusion_matrix(ytest, ypred), annot=True, fmt='d')
# plt.title('Confusion Matrix')
# plt.show()

## Naive-Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings (no hyper-parameter search)
model_NB = GaussianNB()
model_NB = model_NB.fit(Xtrain, ytrain)
ypred = model_NB.predict(Xtest)

# Scores on the hold-out split
report = classification_report(ytest, ypred)
print(report, '\n')

# Confusion Matrix
# fig, ax = plt.subplots(1, 1, figsize=(4, 4))
# sns.heatmap(confusion_matrix(ytest, ypred), annot=True, fmt='d')
# plt.title('Confusion Matrix')
# plt.show()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Hyper
penalty = ['none', 'l2', 'l1', 'elasticnet']
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0.25, 0.5, 0.75]  # mixing parameter, only used by the 'elasticnet' penalty
param_grid = dict(penalty=penalty, C=C, l1_ratio=l1_ratio)

# Model
# The 'saga' solver is used because it supports every penalty of the grid. The
# default solver handles neither 'l1' nor 'elasticnet', so those candidates could
# not be fitted and only produced NaN scores. 'elasticnet' also requires l1_ratio.
# max_iter is raised because 'saga' usually needs more iterations to converge.
model_LR = RandomizedSearchCV(estimator=LogisticRegression(solver='saga', max_iter=1000),
                              param_distributions=param_grid)
model_LR.fit(Xtrain, ytrain)
ypred = model_LR.predict(Xtest)

# Print Scores
print(classification_report(ytest, ypred), '\n')
print('Best Score:', model_LR.best_score_)
print('Best HyperParameters:', model_LR.best_params_, '\n')

# Confusion Matrix
# fig, ax = plt.subplots(1, 1, figsize=(4, 4))
# sns.heatmap(confusion_matrix(ytest, ypred), annot=True, fmt='d')
# plt.title('Confusion Matrix')
# plt.show()

## Random Forest

In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler().fit(X_encoded[continuous_cols])
# # X_encoded.loc[continuous_cols]

# # X_encoded[continuous_cols] = scaler.transform(X_encoded[continuous_cols])

# # Split
# Xtrain, Xtest, ytrain, ytest = train_test_split(X_encoded, y_encoded)
from sklearn.ensemble import RandomForestClassifier

# Hyper
n_estimators = np.arange(50, 151, 1)
criterion = ['gini', 'entropy', 'log_loss']
min_samples_leaf = [1, 2, 3]
# 'auto' is not listed: for classifiers it is the same setting as 'sqrt'
max_features = ['sqrt', 'log2']
# min_samples_leaf is now part of the search space (it was defined but never passed on)
param_grid = dict(n_estimators=n_estimators, criterion=criterion,
                  min_samples_leaf=min_samples_leaf, max_features=max_features)

# Model: randomized search with 3-fold cross-validation
model_RF = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_grid, cv=3)
model_RF.fit(Xtrain, ytrain)
ypred = model_RF.predict(Xtest)

# Print Scores
print(classification_report(ytest, ypred), '\n')
print('Best Score:', model_RF.best_score_)
print('Best HyperParameters:', model_RF.best_params_, '\n')

# Confusion Matrix
# fig, ax = plt.subplots(1, 1, figsize=(4, 4))
# sns.heatmap(confusion_matrix(ytest, ypred), annot=True, fmt='d')
# plt.title('Confusion Matrix')
# plt.show()

# Submission

In [None]:
# Submission template
sample = pd.read_csv('Data/sample_submission.csv')
# Pre-processed feature matrix the final model predicts on.
# NOTE(review): the file is named 'train_stand.csv' but it is printed as the
# "test set" below - confirm that it really holds the encoded/standardized test
# rows, with the same columns and column order as X_encoded.
results = pd.read_csv('train_stand.csv')
# Drop the columns that have no counterpart in X_encoded (presumably the NaN
# dummies and the saved index column - verify against the file)
results.drop(columns=['Side_nan', 'Deck_nan', 'Unnamed: 0'], inplace=True)
# Shapes side by side, to check that the column counts match
print('test set:', results.shape)
print('X set:', X_encoded.shape)
print('y set:', y_encoded.shape)

In [None]:
# Final model: LightGBM with fixed hyper-parameters, refitted on all the encoded rows
final_params = {
    'boosting_type': 'dart',
    'learning_rate': .09,
    'num_leaves': 68,
}
model = lgb.LGBMClassifier(**final_params)
model.fit(X_encoded, y_encoded)

# Predictions for the submission rows
ypred = model.predict(results)

In [None]:
# Write the predictions into the submission template as booleans.
# NOTE(review): this assumes ypred has one entry per row of `sample`, in the same order - confirm.
sample.loc[:, 'Transported'] = ypred.astype(bool)
# PassengerId becomes the index, so it would be written as the first CSV column
sample.set_index('PassengerId', inplace=True)
# Writing the file is disabled; uncomment to export the submission
# sample.to_csv('submision.csv')
