# Spaceship titanic classification
<p>A simple example of a classification from Kaggle data set. </p> </br>
There was a disaster on board the spaceship.  
Some people have been transferred to another dimension.  
The model classifies passengers who have been transferred.  


link do data set: https://www.kaggle.com/c/spaceship-titanic


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#importing test and train set (previously splited)
df_train = pd.read_csv(r"data\spaceship titanic\train.csv")
df_test = pd.read_csv(r"data\spaceship titanic\test.csv")

In [96]:
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# analyzing data

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
df_train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
#checking the correctness of categorical data (no typos which could add new category; no outliers; no strange categories)
def check_categorical(df):
    for col in df.columns[1:]:
        print(df[col].value_counts(), "\n")
           

In [7]:
check_categorical(df_train[list(df_train.select_dtypes(include=['bool', "object"]).columns)])

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64 

False    5439
True     3037
Name: CryoSleep, dtype: int64 

G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64 

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64 

False    8291
True      199
Name: VIP, dtype: int64 

Gollux Reedall        2
Elaney Webstephrey    2
Grake Porki           2
Sus Coolez            2
Apix Wala             2
                     ..
Jamela Griffy         1
Hardy Griffy          1
Salley Mckinn         1
Mall Frasp            1
Propsh Hontichre      1
Name: Name, Length: 8473, dtype: int64 

True     4378
False    4315
Name: Transported, dtype: int64 



# data preprocessing

### spliting data set 

In [130]:
X_train, y_train = df_train[df_train.columns[:-1]], df_train["Transported"]


### dropping useless(?) features 

In [99]:
#The name attribute  doesn't seem to affect targets - it should be dropped
X_train.drop("Name", axis=1, inplace=True)
df_test.drop("Name", axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### extracting data 

In [106]:
#first column looks useless, because this is just unique ID of passengers...but!
#there is an information about number of family/group members on the ship - this could be important feature
#let's "pull out" this from data
from sklearn.base import BaseEstimator, TransformerMixin

class passenger_id_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("")
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_passenger = X["PassengerId"].apply(lambda x: x[:4])
        members = dict(X_passenger.value_counts())
        X.loc[:, "PassengerId"] = X_passenger.apply(lambda x: members[x])
        return X
    

#like before with PassengerId: column Cabin contains information about deck/side of passengers cabin - this could be important
#building encoder transformation to extract this data

class cabin_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("")
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        deck = X["Cabin"].apply(lambda x: str(x)[0])
        side = X["Cabin"].apply(lambda x: str(x)[-1])
        X.drop("Cabin", axis=1, inplace=True)
        X["Deck"] = deck
        X["Side"] = side
        return X
    

In [105]:
#quick look if it works
passenger_transformer = passenger_id_transformer()
passenger_transformer.fit_transform(X_train.copy())




Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,2,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0
8689,1,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0
8690,1,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0
8691,2,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0


In [102]:
#quick look if it works
cabin_transformer = cabin_transformer()
cabin_transformer.fit_transform(X_train.copy())




Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,A,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,A,P
8689,9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,G,S
8690,9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,G,S
8691,9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,E,S


### handling with missing values

In [131]:
#in this case, dropping all nan values is bad idea - too much data would be lost
#to handle with missing values I will build transformer, it will really simplify the process with different types of attributes/missing values 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler

cat_features = ["PassengerId", "HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]
num_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa"]



pipe = Pipeline([
    ("PassengerId_transform", passenger_id_transformer()),
    ("Cabin_transfrom", cabin_transformer()),
    
    ("cleaning", ColumnTransformer(transformers=[
        
        ("cat", make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder()), cat_features),
        
        ("num", make_pipeline(
            SimpleImputer(strategy="median"),
            MinMaxScaler()), num_features)
    ]))
    
    
    
])




cat_pipeline.fit(X_train.copy())



# cat_pipeline = Pipeline(steps=[
#     ("impute", SimpleImputer(strategy='most_frequent')),
#     ("encoder", ColumnTransformer(transformers =[
#         ("one_hot_encoder", OneHotEncoder(), ["PassengerId", "HomePlanet", "Destination", "Deck", "Side"]),

#     ]))])


# t = [('cat', SimpleImputer(strategy='most_frequent'), [1,2,3,4,5,7]),
#      ("num_age", SimpleImputer(strategy='median'), [6]),
#      ("num_service", SimpleImputer(strategy="constant", fill_value=0.0), [8,9,10,11])]
# missing_values_transformer = ColumnTransformer(transformers=t)






ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
# not necessary but I will convert the transformer output back to df just to visualize it
#at the end I'll use a pipeline, so the data will be transformed anyway
df_train_nonull = df_train_extracted.copy()
df_train_nonull[[df_train_nonull.columns[x] for x in [1,2,3,4,5,7,6,8,9,10,11,12]]] = missing_values_transformer.fit_transform(df_train_nonull)

In [None]:
df_train_nonull.isnull().sum() #it worked! :)

In [32]:
X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0


In [120]:
pipe = make_pipeline(
    passenger_id_transformer(),
    cabin_transformer()
)





In [129]:
pipe.fit_transform(X_train.copy())

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Deck,Side
0,1,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,B,P
1,1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,F,S
2,2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,A,S
3,2,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,A,S
4,1,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,A,P
8689,1,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,G,S
8690,1,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,G,S
8691,2,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,E,S


In [127]:
print(lol)

Pipeline(steps=[('passenger_id_transformer', passenger_id_transformer()),
                ('cabin_transformer', cabin_transformer())])


In [122]:
X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,2,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0
8689,1,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0
8690,1,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0
8691,2,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0
