# Spaceship Titanic

# 1. Import Libraries/ Data Loading 

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px

from sklearn.preprocessing import LabelEncoder

# Global plotting defaults: compact figures on a white grid, coloured with
# seaborn's "flare" palette.
sns.set(rc={'figure.figsize': (6, 4)})
sns.set_style('whitegrid')
# A bare `sns.color_palette("flare")` call used to sit here; it only built a
# palette and discarded it, so it has been removed.
sns.set_palette(sns.color_palette("flare"))

In [None]:
# Load the train and test splits from the local data/ directory.
train_data, test_data = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

# 2. EDA (Exploratory Data Analysis) and Data Preprocessing


The goal of EDA is to understand the main characteristics of the data and identify any patterns, outliers, or other features of the data that are important to know before building a model or making predictions.

### Observations in Train Data
Inspecting the raw data is the first step in EDA. It is useful for understanding the main characteristics of the data and identifying any patterns, outliers, or other notable features.

- There are a total of 14 columns and 8693 rows in the train data.
- The train data contains 119378 non-missing values and 2324 missing values.
- All 12 feature columns have missing values, with CryoSleep having the most (217).
- Transported is the target variable which is only available in the train dataset.

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# (rows, columns) of the training set.
print(f'Shape of train data: {train_data.shape}')

In [None]:
# Size and completeness summary of the training set.
train_rows, train_cols = train_data.shape
train_present = train_data.count().sum()      # non-null cells across all columns
train_missing = train_data.isna().sum().sum()  # missing cells across all columns

print(f'Number of rows in train data: {train_rows}')
print(f'Number of columns in train data: {train_cols}')
print(f'Number of values in train data: {train_present}')
print(f'Number missing values in train data: {train_missing}')

In [None]:
# Missing-value count per column, largest first.
train_missing_per_column = train_data.isna().sum()
print(train_missing_per_column.sort_values(ascending=False))

The basic statistics for each numeric variable: count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.

In [None]:
# Descriptive statistics of the training set.
train_data.describe()

The pandas-profiling library generates a profile report that shows the type of every column together with quantile-level statistics, descriptions, histograms, and the most frequent and extreme values.

In [None]:
from pandas_profiling import ProfileReport

# Build a profiling report for the training set; as the last expression of the
# cell it is what the notebook displays.
# NOTE(review): pandas-profiling has been renamed ydata-profiling upstream —
# confirm which package the target environment provides.
ProfileReport(train_data)

### Observations in Test Data
- There are a total of 13 columns and 4277 rows in the test data.
- The test data contains 54484 non-missing values and 1117 missing values.
- All 12 feature columns have missing values, with FoodCourt having the most (106).

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# (rows, columns) of the test set.
print(f'Shape of test data: {test_data.shape}')

In [None]:
# Size and completeness summary of the test set.
print(f'Number of rows in test data: {test_data.shape[0]}')
print(f'Number of columns in test data: {test_data.shape[1]}')
# count() tallies non-null cells per column; the label used to say "train data".
print(f'Number of values in test data: {test_data.count().sum()}')
# isna().sum() counts missing cells, not rows, so the label now says
# "missing values" rather than "rows with missing values".
print(f'Number of missing values in test data: {sum(test_data.isna().sum())}')

In [None]:
# Missing-value count per column of the test set, largest first.
test_missing_per_column = test_data.isna().sum()
print(test_missing_per_column.sort_values(ascending=False))

In [None]:
# Descriptive statistics of the test set.
test_data.describe()

### Visualization of data

In [None]:
# Encode the target labels as integers, then plot the class balance.
enc = LabelEncoder()
encoded_target = enc.fit_transform(train_data['Transported'])
train_data['Transported'] = encoded_target
sns.countplot(x=train_data['Transported'], data=train_data)

In [None]:
# Passenger counts per destination, split by the target.
sns.countplot(x='Destination', hue='Transported', data=train_data)

In [None]:
# Passenger counts per home planet, split by the target.
sns.countplot(x='HomePlanet', hue='Transported', data=train_data)

In [None]:
# Passenger counts per home planet, split by destination.
sns.countplot(x='HomePlanet', hue='Destination', data=train_data)

### Visualization of missing data

In [None]:
# Missingno matrix: visualises where the missing values sit in the train set.
train_matrix_ax = msno.matrix(train_data)
train_matrix_ax.set_title("Train set", fontsize=20)

In [None]:
# Same missing-value matrix for the test set.
test_matrix_ax = msno.matrix(test_data)
test_matrix_ax.set_title("Test set", fontsize=20)

### Correlation matrix

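In the first heatmap (seaborn's default colormap), lighter cells indicate higher correlation values and darker cells indicate lower (weaker or negative) values. In the second heatmap (`coolwarm`, scaled from -1 to 1), red indicates a positive correlation and blue a negative one.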

In [None]:
# Pairwise correlation between the numeric columns of the training set.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.corr() no
# longer drops string columns silently and raises on them instead.
corr = train_data.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

In [None]:
# Correlation across all columns after mapping every column's values to
# integer codes with pd.factorize, drawn as a lower-triangle heatmap.
plt.figure(5, figsize=(25, 10))
factorized = train_data.apply(lambda column: pd.factorize(column)[0])
corr = factorized.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))  # hides the upper triangle and the diagonal
ax = sns.heatmap(
    corr,
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    linewidths=.2,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)
plt.show()

# 3. Feature Engineering
Feature engineering is the process of using domain knowledge to extract features from raw data that can be used to train machine learning models. It involves transforming raw data into a format that can be easily understood by the model, such as converting text to numerical values. The goal of feature engineering is to create a set of features that are most informative and relevant for the task at hand, which can improve the performance of the model.


In [None]:
# Remove the Name column from both splits, in place.
for frame in (train_data, test_data):
    frame.drop(columns='Name', inplace=True)

In [None]:
# Store the target as integers (False -> 0, True -> 1).
# One astype(int) replaces the two chained `replace(..., inplace=True)` calls:
# those act on a column selection and are not guaranteed to write back to
# train_data under pandas copy-on-write. Values that are already 0/1 keep
# their value.
train_data['Transported'] = train_data['Transported'].astype(int)

In [None]:
# Split Cabin on '/' into deck / num / side, then keep only deck and side.
for frame in (train_data, test_data):
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    frame[['deck', 'num', 'side']] = cabin_parts
    frame.drop(columns=['Cabin', 'num'], inplace=True)

In [None]:
# Total spend per passenger across the five amenity columns.
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for frame in (train_data, test_data):
    frame['total_spent'] = frame[col_to_sum].sum(axis=1)

In [None]:
# Number of training passengers per deck.
plt.figure(figsize=(6, 5))
deck_counts = train_data['deck'].value_counts()
deck_counts.plot(kind='bar', rot=0)

In [None]:
# Number of training passengers per cabin side.
plt.figure(figsize=(6, 5))
side_counts = train_data['side'].value_counts()
side_counts.plot(kind='bar', rot=0)

In [None]:
# Histogram of passenger ages in the training set.
train_data.hist(column="Age")

In [None]:
# Summary statistics of Age, used to choose the bin edges in the next step.
train_data['Age'].describe()

In [None]:
# Bucket Age into four ordered groups (category order: child, teenager,
# adult, older).
labels = ['child', 'teenager', 'adult', 'older']
bins = [0, 12, 21, 45, 80]
# include_lowest=True closes the first interval on the left ([0, 12]). With the
# default, an age of exactly 0 falls outside (0, 12] and becomes NaN, which
# would later be imputed as a different group.
train_data['Age_Group'] = pd.cut(train_data['Age'], bins=bins, labels=labels, include_lowest=True)
test_data['Age_Group'] = pd.cut(test_data['Age'], bins=bins, labels=labels, include_lowest=True)
train_data.head()

In [None]:
# Number of training passengers per age group.
plt.figure(figsize=(6, 5))
age_group_counts = train_data['Age_Group'].value_counts()
age_group_counts.plot(kind='bar', rot=0)

In [None]:
# Age is now represented by Age_Group, so drop the raw column from both splits.
train_data = train_data.drop(columns="Age")
test_data = test_data.drop(columns="Age")

train_data.head()

In [None]:
# Split PassengerId once on '_' and store the part after the underscore as an
# integer feature.
# `n` is passed by keyword: pandas >= 2.0 accepts only `pat` positionally in
# Series.str.split, so `split('_', 1, expand=True)` raises a TypeError there.
# NOTE(review): going by the column name, the part *before* the underscore
# (index 0) may have been intended — confirm which piece is wanted.
train_data['Passenger_Group'] = train_data['PassengerId'].str.split('_', n=1, expand=True)[1].astype(int)
test_data['Passenger_Group'] = test_data['PassengerId'].str.split('_', n=1, expand=True)[1].astype(int)

# Keep PassengerId as the row index instead of as a feature column.
train_data.set_index('PassengerId', inplace=True)
test_data.set_index('PassengerId', inplace=True)

In [None]:
# Check the engineered columns on the first rows.
train_data.head()

### Outliers detection

In [None]:
# One box plot per numeric column, laid out on a 5x5 grid.
train_data.plot.box(subplots=True, layout=(5, 5), figsize=(20, 15))
plt.show()

In [None]:
# Box plots of the five amenity-spend columns on a shared axis.
spend_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
sns.boxplot(data=train_data[spend_columns])

###  Imputing Missing Values


We are using Simple Imputer to fill the na values with the specified strategy.

For ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'deck', 'side', 'Age_Group'] we use the strategy most_frequent as it is categorical data.

For ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'total_spent'] we use the strategy median as it is numeric data.


In [None]:
# Partition the columns by dtype: object/category -> categorical, float64 -> numeric.
column_dtypes = train_data.dtypes
categorical_cols = [
    name for name, kind in column_dtypes.items()
    if kind == 'object' or kind == 'category'
]
numeric_cols = [name for name, kind in column_dtypes.items() if kind == 'float64']

print(f'Categorical cols -- {categorical_cols}')
print(f'Numeric cols -- {numeric_cols}')

In [None]:
# Cast the categorical columns of both splits to pandas' category dtype.
for frame in (train_data, test_data):
    frame[categorical_cols] = frame[categorical_cols].astype('category')

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the categorical columns on train+test stacked together so both splits
# share a single category -> number mapping.
oc = OrdinalEncoder()
# Remember where the training rows end instead of hard-coding 8693.
n_train_rows = len(train_data)
data_for_encode = pd.concat([train_data, test_data])
data_for_encode[categorical_cols] = data_for_encode[categorical_cols].astype('category')
data_for_encode[categorical_cols] = oc.fit_transform(data_for_encode[categorical_cols])

del train_data, test_data

# .copy() gives each split its own data: the slices are modified in place
# below and in later cells, which on a bare iloc slice triggers
# SettingWithCopyWarning.
train_data = data_for_encode.iloc[:n_train_rows, :].copy()
test_data = data_for_encode.iloc[n_train_rows:, :].copy()

del data_for_encode

# Transported exists only for the training rows; the concat left it as NaN
# for the test rows.
test_data.drop('Transported', inplace=True, axis=1)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Categorical columns: fill gaps with the most frequent value.
ctc = ColumnTransformer([("imp", SimpleImputer(strategy='most_frequent'), categorical_cols)])

# Fill values are learned from the training set only and reused on the test
# set (transform, not fit_transform), so both splits are imputed identically.
train_data[categorical_cols] = ctc.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = ctc.transform(test_data[categorical_cols])

# Numeric columns: fill gaps with the training-set median.
ctn = ColumnTransformer([("imp", SimpleImputer(strategy='median'), numeric_cols)])

train_data[numeric_cols] = ctn.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = ctn.transform(test_data[numeric_cols])

# Forward-fill any gap in the target. The result is assigned back rather than
# using `fillna(method='ffill', inplace=True)` on the column selection: the
# `method=` argument is deprecated and the in-place call is not guaranteed to
# reach train_data under copy-on-write.
train_data["Transported"] = train_data["Transported"].ffill()

In [None]:
# Inspect the training set after encoding and imputation.
train_data.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns.
scaler = StandardScaler()

# The scaling statistics are fitted on the training set and then applied
# unchanged to the test set; re-fitting on the test set would put the two
# splits on different scales.
train_data[numeric_cols] = scaler.fit_transform(train_data[numeric_cols])
test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])


In [None]:
# Inspect the training set after scaling.
train_data.head()

In [None]:
# Inspect the test set after scaling.
test_data.head()

# Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_predict, cross_val_score


In [None]:
# Separate the target from the feature matrix.
y = train_data['Transported']
X = train_data.drop(columns='Transported')

X.columns

In [None]:
# Alias for the preprocessed test set (same object, not a copy).
X_test_data = test_data

In [None]:
# Hold out 20% of the labelled rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=12
)

In [None]:
# Size of the training split.
X_train.shape

In [None]:
# Size of the hold-out split.
X_test.shape

### LogisticRegression

In [None]:
# Fit logistic regression (fit returns the estimator) and report its accuracy
# on the training split.
model_Log = LogisticRegression(max_iter=10000).fit(X_train, y_train)
model_Log.score(X_train, y_train)

In [None]:
# Logistic-regression predictions for the hold-out split.
y_pred_log= model_Log.predict(X_test)

In [None]:
# Logistic-regression accuracy on the hold-out split.
model_Log.score(X_test, y_test)

In [None]:
# Fitted coefficients of the logistic-regression model.
model_Log.coef_

### Random Forest

In [None]:
# Fit a random forest with default settings and report its accuracy on the
# training split.
model_forest = RandomForestClassifier().fit(X_train, y_train)
model_forest.score(X_train, y_train)

In [None]:
# Random-forest predictions for the hold-out split.
y_pred_forest = model_forest.predict(X_test)

In [None]:
# Random-forest accuracy on the hold-out split. This cell used to score
# model_Log, repeating the logistic-regression result instead of evaluating
# the forest.
model_forest.score(X_test, y_test)

In [None]:
# Fine-tune the random forest with a 5-fold grid search (the grid holds a
# single candidate).
forest_params = [{'n_estimators': [500], 'min_samples_leaf': [4]}]

forest_grid = GridSearchCV(estimator=model_forest, param_grid=forest_params, cv=5)
forest_grid = forest_grid.fit(X_train, y_train)  # fit returns the search object itself
forest_grid.best_estimator_

In [None]:
# Use the estimator selected by the grid search to predict the hold-out split.
model_forest_final = forest_grid.best_estimator_
y_pred_forest2 = model_forest_final.predict(X_test)

In [None]:
# Tuned random-forest accuracy on the hold-out split.
model_forest_final.score(X_test, y_test)

In [None]:
# Feature importances of the tuned random forest.
model_forest_final.feature_importances_

### SVM

In [None]:
# Fit a support-vector classifier and report its accuracy on the training split.
model_svc = SVC(random_state=42).fit(X_train, y_train)
model_svc.score(X_train, y_train)

In [None]:
# SVC predictions for the hold-out split.
y_pred_svc = model_svc.predict(X_test)

In [None]:
# SVC accuracy on the hold-out split.
model_svc.score(X_test, y_test)

### K-Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import warnings

# Suppress FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Choose n_neighbors (2..14) by 5-fold cross-validated grid search.
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(2, 15)}
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
# Search on the training split only: fitting on the full X/y lets the hold-out
# rows influence the chosen hyper-parameter, which inflates the score later
# reported on X_test.
knn_gscv.fit(X_train, y_train)
knn_gscv.best_params_

In [None]:
# Build the KNN model from the grid-search result instead of a hard-coded
# n_neighbors, so the model cannot drift out of sync with the search.
model_knn = KNeighborsClassifier(**knn_gscv.best_params_)
model_knn.fit(X_train, y_train)
model_knn.score(X_train, y_train)

In [None]:
# KNN predictions for the hold-out split.
y_pred_knn = model_knn.predict(X_test)

In [None]:
# KNN accuracy on the hold-out split.
model_knn.score(X_test, y_test)

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier with default settings (fixed seed) and
# report its accuracy on the training split.
model_gbr = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
model_gbr.score(X_train, y_train)

In [None]:
# Gradient-boosting predictions for the hold-out split.
y_pred_gbr = model_gbr.predict(X_test)

In [None]:
# Gradient-boosting accuracy on the hold-out split.
model_gbr.score(X_test, y_test)

### Tuning the Gradient Boosting classifier

In [None]:
# Hyper-parameter search for gradient boosting. The grid has 3 x 3 x 3 = 27
# combinations and n_iter=27, so every combination is evaluated.
gbc = GradientBoostingClassifier()
parameters = {
    "n_estimators": [5, 50, 100],
    "max_depth": [1, 3, 5],
    "learning_rate": [0.01, 0.1, 1],
}
cv = RandomizedSearchCV(gbc, parameters, n_iter=27, scoring='accuracy', n_jobs=-1, cv=5, random_state=1)
# Search on the training split only: fitting on the full X/y lets the hold-out
# rows influence the chosen hyper-parameters, which inflates the score later
# reported on X_test.
cv.fit(X_train, y_train)
cv.best_params_

In [None]:
# Build the tuned model from the search result instead of hard-coded values,
# so it always matches what the search actually selected.
model2_gbc = GradientBoostingClassifier(**cv.best_params_)

model2_gbc.fit(X_train, y_train)
model2_gbc.score(X_train, y_train)

In [None]:
# Tuned gradient-boosting predictions for the hold-out split.
y_pred_gbc = model2_gbc.predict(X_test)

In [None]:
# Tuned gradient-boosting accuracy on the hold-out split.
model2_gbc.score(X_test, y_test)

### Extreme Gradient Boosting

In [None]:
from xgboost import XGBClassifier

# Fit XGBoost with default settings and report its accuracy on the training split.
model_xgb = XGBClassifier()
model_xgb = model_xgb.fit(X_train, y_train)  # fit returns the estimator itself
model_xgb.score(X_train, y_train)

In [None]:
# XGBoost predictions for the hold-out split.
y_pred_xgb = model_xgb.predict(X_test)

In [None]:
# XGBoost accuracy on the hold-out split.
model_xgb.score(X_test, y_test)

### CatBoost

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier: 3000 iterations, accuracy as the evaluation metric,
# no training log output.
catboost_settings = {
    'iterations': 3000,
    'eval_metric': 'Accuracy',
    'verbose': 0,
}
model_cat = CatBoostClassifier(**catboost_settings)


In [None]:
# Train CatBoost on the training split.
model_cat.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score

# CatBoost accuracy on the training split.
pred = model_cat.predict(X_train)
train_accuracy = accuracy_score(y_train.values, pred)
print(train_accuracy)

In [None]:
# CatBoost predictions for the hold-out split.
y_pred_cat = model_cat.predict(X_test)

In [None]:
# CatBoost score on the hold-out split.
model_cat.score(X_test, y_test)

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with 50 estimators and a fixed seed.
ada_boost_model = AdaBoostClassifier(n_estimators=50, random_state=42)

In [None]:
# Train AdaBoost on the training split.
ada_boost_model.fit(X_train, y_train)

In [None]:
# AdaBoost accuracy on the training split.
ada_boost_model.score(X_train,y_train)

In [None]:
# AdaBoost predictions for the hold-out split.
y_pred_adaboost = ada_boost_model.predict(X_test)

In [None]:
# AdaBoost accuracy on the hold-out split.
ada_boost_model.score(X_test, y_test)

### Fine tune AdaBoost Classifier

### Lazy Classifier

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier
from IPython.display import clear_output

In [None]:
# Recreate the 80/20 split with the same seed, for the LazyClassifier run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=12
)

In [None]:
# Fit LazyClassifier's full set of classifiers and collect their scores.
lazy_settings = dict(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=False,
    random_state=12,
    classifiers='all',
)
clf = LazyClassifier(**lazy_settings)

models, predictions = clf.fit(X_train, X_test, y_train, y_test)
clear_output()  # clear the output produced during fitting

In [None]:
# First 15 entries of the LazyClassifier results.
models[:15]