>## __MLSA CAPSTONE PROJECT - THE SPACESHIP TITANIC DATASET__
>### 1. *Importing and exploring the data*

In [649]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler

In [1146]:
import warnings
warnings.filterwarnings('ignore')

In [175]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

In [3]:
train = pd.read_csv('train.csv')

In [5]:
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [41]:
np.sum(train.isna(), axis = 0) #null values in each column

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [564]:
train['Destination'].value_counts()

Destination
TRAPPIST-1e      5498
55 Cancri e      1679
PSO J318.5-22     743
Name: count, dtype: int64

In [370]:
#for columns HomePlanet, CryoSleep and Cabin I dropped the null values
for col in ['HomePlanet', 'CryoSleep', 'Cabin']:
    ind = train[train[col].isna()].index
    train.drop(ind, axis = 0, inplace = True)
train.shape

(8091, 14)

In [689]:
train['Cabin']

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8091, dtype: object

>### 2. *Creating pipelines and cleaning the data*

In [634]:
#I created a custom transformer to convert the passenger id to group number
class id_processor(BaseEstimator, TransformerMixin):
    def __init__(self):
        ''
    def fit(self, X, y = None):
        return self
    def transform(self,  X):
        return pd.DataFrame([a[:4] for a in train['PassengerId']])
    def get_feature_names_out(self, X):
       return ['group_no']

In [727]:
'''This custom transformer takes the cabin column and splits it into three columns-deck, num and side
it then one hot encodes the deck and side columns then returns the whole thing as a 10 column array'''
class cabin_processor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.OHE = OneHotEncoder(sparse_output = False)
        self.OHE1 = OneHotEncoder(sparse_output = False)
        self.MMS = MinMaxScaler(feature_range = (0,1))
    def fit(self, X, y = None):
        return self
    def transform(self,  X):
        X = np.array(X).reshape(-1,1)
        df = pd.DataFrame([cabin[0].split('/') for cabin in X])
        #df.columns = ['deck', 'num', 'side']
        deck = pd.DataFrame(self.OHE.fit_transform(df.iloc[:, 0:1]))
        side = pd.DataFrame(self.OHE1.fit_transform(df.iloc[:, 2:3]))
        cab_no = pd.DataFrame(self.MMS.fit_transform(df.iloc[:, 1:2]))
        return pd.concat([deck, cab_no, side], axis = 1)
    def get_feature_names_out(self, X):
        lis_ = self.OHE.get_feature_names_out().tolist()
        return ['Cabin_'+ a for a in lis_] + ['Num'] + ['Portside', 'Starboard']

In [527]:
pipeline1 = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OneHotEncoder(sparse_output = False)) # for Destination, VIP, HomePlanet and CryoSleep

In [971]:
pipeline2 = make_pipeline(SimpleImputer(strategy = 'mean'), MinMaxScaler(feature_range=(0, 100))) #RoomService, FoodCourt, ShoppingMall, Spa and VRDeck

In [973]:
 transformer = ColumnTransformer([ ('ct1', cabin_processor(), ['Cabin']),
                                     ('ct2', pipeline1, ['Destination', 'VIP', 'HomePlanet', 'CryoSleep']),
                                     ('ct3', pipeline2, ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']),
                                     ('ct4', id_processor(), ['PassengerId'])], remainder = 'drop')

In [975]:
#generating the train and test set
labels = train['Transported']
preprocessed = transformer.fit_transform(train)

In [977]:
preprocessed = pd.DataFrame(preprocessed, columns = transformer.get_feature_names_out())

>### 3. *Trying different models*

In [987]:
#splitting the dataset into train and test
from sklearn.model_selection import StratifiedShuffleSplit
splitter = StratifiedShuffleSplit(n_splits = 1, test_size = 0.15, random_state = 42)
for train_index, test_index in splitter.split(preprocessed, train['Transported']):
    X_train, y_train = preprocessed.iloc[train_index], labels.iloc[train_index]
    X_test, y_test = preprocessed.iloc[test_index], labels.iloc[test_index]

In [988]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict

In [989]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state = 42, max_iter = 300)
LR.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [990]:
cross_val_score(LR, X_train, y_train,cv = 4, scoring = 'accuracy')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.7872093 , 0.79930192, 0.78999418, 0.78359511])

In [991]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(max_depth = 10, random_state = 42)
DTC.fit(X_train, y_train)

In [992]:
cross_val_score(DTC, X_train, y_train, cv = 4, scoring = 'accuracy')

array([0.75988372, 0.74985457, 0.77603258, 0.76730657])

In [995]:
from sklearn.ensemble  import RandomForestClassifier, VotingClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [996]:
cross_val_score(rfc, X_train, y_train, cv = 4, scoring = 'accuracy')

array([0.79476744, 0.79290285, 0.79755672, 0.80279232])

In [999]:
voting = VotingClassifier(estimators = [('dtc1', DecisionTreeClassifier(max_depth = 15, random_state = 42)),
                                        ('lr2', LogisticRegression(random_state = 42, max_iter = 300)),
                                        ('rfc1', RandomForestClassifier())])
voting.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1021]:
cross_val_score(voting, X_train, y_train, cv = 4, scoring = 'accuracy')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.79534884, 0.79057592, 0.79988365, 0.80686446])

In [1000]:
from sklearn.ensemble import BaggingClassifier
BG = BaggingClassifier(RandomForestClassifier(), n_estimators = 300, max_samples=120, n_jobs = -2, random_state = 42,
                      bootstrap = True, oob_score = True)
BG.fit(X_train, y_train)

In [1001]:
cross_val_score(BG, X_train, y_train, cv = 4, scoring = 'accuracy')

array([0.77267442, 0.78592205, 0.77021524, 0.76730657])

>### 3. *Comparison of models using accuracy, precision and recall*

In [1023]:
model_stats = []
for clf in [LR, DTC, rfc, voting]:
    stat = []
    for metric in ['accuracy', 'precision', 'recall']:
        stat.append(cross_val_score(clf, X_train, y_train, cv=4, scoring=metric).mean())
    model_stats.append(stat)
metric_data = pd.DataFrame(model_stats)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [1033]:
metric_data.columns = ['Accuracy', 'Precision', 'Recall']
metric_data.index = ['Logistic Regression','Decision Trees', 'Random Forests', 'Voting Classifier']

In [1035]:
metric_data

Unnamed: 0,Accuracy,Precision,Recall
Logistic Regression,0.790025,0.783776,0.80612
Decision Trees,0.763269,0.786994,0.727074
Random Forests,0.80093,0.827486,0.755632
Voting Classifier,0.796568,0.815545,0.769482


> *So the best performing model is Random Forests Classifier, next I want to try it on the test set:*

In [1120]:
prediction = rfc.predict(X_test)

In [1127]:
accuracy_score(prediction, y_test) #80% accuracy on the test set

0.8023064250411862