## Importing modules and packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import scale 
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import confusion_matrix 
# from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA


## Import data and clean up data

In [2]:
train_data = pd.read_csv("./train.csv")


test_data = pd.read_csv("./test.csv")
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Splitting the passenger id into two parts, the group id and the number assigned in the group. In addition, split the cabin column into three, the deck, deck number, and which side of the deck.

In [3]:
Seperate_Passenger_ID = test_data["PassengerId"].str.split("_", n=1, expand=True)
test_data["PassengerGroup"]=Seperate_Passenger_ID[0]
test_data["PassengerNumber"]=Seperate_Passenger_ID[1]
test_data.drop(columns=["PassengerId"], inplace=True)

Seperate_Cabin=test_data["Cabin"].str.split("/", n=2, expand=True)
test_data["Deck"]=Seperate_Cabin[0]
test_data["DeckNumber"]=Seperate_Cabin[1]
test_data["DeckSide"]=Seperate_Cabin[2]
test_data.drop(columns=["Cabin"], inplace=True)

Seperate_Passenger_ID = train_data["PassengerId"].str.split("_", n=1, expand=True)
train_data["PassengerGroup"]=Seperate_Passenger_ID[0]
train_data["PassengerNumber"]=Seperate_Passenger_ID[1]
train_data.drop(columns=["PassengerId"], inplace=True)

Seperate_Cabin=train_data["Cabin"].str.split("/", n=2, expand=True)
train_data["Deck"]=Seperate_Cabin[0]
train_data["DeckNumber"]=Seperate_Cabin[1]
train_data["DeckSide"]=Seperate_Cabin[2]
train_data.drop(columns=["Cabin"], inplace=True)

train_data.drop('Name',axis=1,inplace=True)
test_data.drop('Name',axis=1,inplace=True)

Making all strings into numerical or categorial data

In [4]:
train_data['DeckNumber'] = train_data['DeckNumber'].astype('Int64')
train_data['PassengerGroup'] = train_data['PassengerGroup'].astype('Int64')
train_data['PassengerNumber'] = train_data['PassengerNumber'].astype('Int64')
# train_data['CryoSleep'] = train_data['CryoSleep'].map({'True': True, 'False': False})
train_data['HomePlanet'] = train_data['HomePlanet'].map({'Earth': 0, 'Europa': 1, 'Mars': 2})
train_data['Destination'] = train_data['Destination'].map({'55 Cancri e': 0, 'PSO J318.5-22': 1,'TRAPPIST-1e': 2})
train_data['Deck'] = train_data['Deck'].map({'A': 0,'B': 1,'C': 2,'D': 3,'E': 4,'F': 5,'G': 6,'H': 7,'I': 8,'J': 9,'K': 10,'L': 11,'M': 12,'N': 13,'O': 14,'P': 15,'Q': 16,'R': 17,'S': 18,'T': 19,'U': 20,'V': 21,'W': 22,'X': 23,'Y': 24,'Z': 25})
train_data['DeckSide']=train_data['DeckSide'].map({'S': 0, 'P': 1})

test_data['DeckNumber'] = test_data['DeckNumber'].astype('Int64')
test_data['PassengerGroup'] = test_data['PassengerGroup'].astype('Int64')
test_data['PassengerNumber'] = test_data['PassengerNumber'].astype('Int64')
# test_data['CryoSleep'] = test_data['CryoSleep'].map({'True': True, 'False': False})
test_data['HomePlanet'] = test_data['HomePlanet'].map({'Earth': 0, 'Europa': 1, 'Mars': 2})
test_data['Destination'] = test_data['Destination'].map({'55 Cancri e': 0, 'PSO J318.5-22': 1,'TRAPPIST-1e': 2})
test_data['Deck'] = test_data['Deck'].map({'A': 0,'B': 1,'C': 2,'D': 3,'E': 4,'F': 5,'G': 6,'H': 7,'I': 8,'J': 9,'K': 10,'L': 11,'M': 12,'N': 13,'O': 14,'P': 15,'Q': 16,'R': 17,'S': 18,'T': 19,'U': 20,'V': 21,'W': 22,'X': 23,'Y': 24,'Z': 25})
test_data['DeckSide']=test_data['DeckSide'].map({'S': 0, 'P': 1})


In [5]:
print(train_data.dtypes)

HomePlanet         float64
CryoSleep           object
Destination        float64
Age                float64
VIP                 object
RoomService        float64
FoodCourt          float64
ShoppingMall       float64
Spa                float64
VRDeck             float64
Transported           bool
PassengerGroup       Int64
PassengerNumber      Int64
Deck               float64
DeckNumber           Int64
DeckSide           float64
dtype: object


We see that the only columns with non-defined data types are CryoSleep and VIP. We will see if they have any null values.

In [7]:
print("Number of rows with null \'VIP\' data: "+ str(len(train_data[train_data['VIP'].isnull()])))
print("Number of total rows: "+ str(len(train_data)))
print("Percentage of null \'VIP\' data: "+str(len(train_data[train_data['VIP'].isnull()])/len(train_data))+"\n")

print("Number of rows with null \'CryoSleep\' data: "+ str(len(train_data[train_data['CryoSleep'].isnull()])))
print("Number of total rows: "+ str(len(train_data)))
print("Percentage of null \'CryoSleep\' data: "+str(len(train_data[train_data['CryoSleep'].isnull()])/len(train_data))+"\n")

Number of rows with null 'VIP' data: 203
Number of total rows: 8693
Percentage of null 'VIP' data: 0.023352122397331185

Number of rows with null 'CryoSleep' data: 217
Number of total rows: 8693
Percentage of null 'CryoSleep' data: 0.02496261359714713



Thus, the other rows has around 2.5% of null rows. We will try to impute the missing data.

In [9]:
train_data.corr()["VIP"]

HomePlanet         0.127844
CryoSleep         -0.081402
Destination       -0.044722
Age                0.092819
VIP                1.000000
RoomService        0.058785
FoodCourt          0.129799
ShoppingMall       0.018295
Spa                0.061059
VRDeck             0.125974
Transported       -0.037650
PassengerGroup     0.014018
PassengerNumber    0.000622
Deck              -0.180471
DeckNumber        -0.099416
DeckSide           0.010117
Name: VIP, dtype: float64

Since the maximum correlation is with Food Court, linear regression will be used to predict VIP

In [8]:
train_data.corr()["CryoSleep"]

HomePlanet         0.088072
CryoSleep          1.000000
Destination       -0.096722
Age               -0.074273
VIP               -0.081402
RoomService       -0.252396
FoodCourt         -0.211510
ShoppingMall      -0.212514
Spa               -0.203991
VRDeck            -0.198857
Transported        0.468645
PassengerGroup    -0.005879
PassengerNumber    0.064676
Deck               0.015158
DeckNumber        -0.037560
DeckSide          -0.019677
Name: CryoSleep, dtype: float64

Apart from Transported, the most correlated one is RoomService.

In [10]:
train_data['HomePlanet'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
8688    False
8689    False
8690    False
8691    False
8692    False
Name: HomePlanet, Length: 8693, dtype: bool

In [11]:
VIP=train_data['VIP'].values
FoodCourt=train_data['FoodCourt'].values



## Downsampling

Since SVM works well with only small data sets, we will split our training data into smaller sets of 1000 data points each

In [12]:
train_transported=train_data[train_data['Transported']==True]
train_not_transported=train_data[train_data['Transported']==False]

train_not_transported_downsampled=resample(train_not_transported,replace=False,n_samples=1000)
train_transported_downsampled=resample(train_transported,replace=False,n_samples=1000)

train_downsampled=pd.concat([train_transported_downsampled,train_not_transported_downsampled])

## Setting independant and dependant variables

In [13]:
X=train_data.drop('Transported',axis=1).copy()
y=train_data['Transported'].copy()

X_dummies=pd.get_dummies(X,columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'PassengerGroup', 'PassengerNumber', 'Deck', 'DeckNumber', 'DeckSide'])


The Radial Basis Function only works when the data is centered and scaled, so that the mean value is 0 and the standard deviation is 1.

In [16]:
X_train,X_test, y_train, y_test=train_test_split(X_dummies,y)
X_train_scaled=scale(X_train)
X_test_scaled=scale(X_test)

In [18]:
test_svm=SVC()
test_svm.fit(X_train_scaled,y_train)


ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [17]:
confusion_matrix(test_svm,X_test_scaled,y_test,values_format="d",display_labels=["Not transported","Transported"])

NameError: name 'test_svm' is not defined