**Importing Libraries for Data Preprocessing and Classification**

In [69]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA


Importing the Dataset

In [70]:
 df=pd.read_csv('/content/train.csv')
 df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


Checking the missing values in dataset

In [71]:
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Filling the missing values using mode method

In [72]:
lst=['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Name']
for i in lst:
  df[i]=df[i].fillna(df[i].mode()[0])
  

Crossing Checking the  missing values in datasets

In [73]:
df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

Checking the data type 

In [74]:
df.dtypes

PassengerId      object
HomePlanet       object
CryoSleep          bool
Cabin            object
Destination      object
Age             float64
VIP                bool
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

Creating new columns for substituting Cabin values

In [75]:
df[["Deck", "Cabin_num", "Side"]] = df["Cabin"].str.split("/", expand=True)
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Cabin_num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,608,S


Droping unwanted datas

In [76]:
df.drop(['PassengerId','Name','Cabin'],axis=1,inplace=True)

Converting the 'object,bool' datatypes into int 

In [77]:
le=LabelEncoder()
lst1=['HomePlanet','CryoSleep','Destination','VIP','Transported','Side','Cabin_num','Deck']
for i in lst1:
  df[i]=le.fit_transform(df[i])



Cross checking the data types

In [78]:
df.dtypes

HomePlanet        int64
CryoSleep         int64
Destination       int64
Age             float64
VIP               int64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported       int64
Deck              int64
Cabin_num         int64
Side              int64
dtype: object

Creating target dataset

In [79]:
y=df['Transported']
y

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64

Droping the Target data and creating a dataset

In [80]:
df.drop('Transported',axis=1,inplace=True)
X=df.iloc[:,:]

In [81]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1,test_size=0.3)

Standardizing the feature values of a dataset using StandardScaler

In [82]:
sc=StandardScaler()
sc.fit(X_train)
X_train=sc.transform(X_train)
X_test=sc.transform(X_test)

Comparing the Accuracy of Different Classification Algorithms on a Dataset

In [83]:
classifier=[KNeighborsClassifier(n_neighbors=5),SVC(kernel='rbf'),RandomForestClassifier(n_estimators=42),DecisionTreeClassifier(criterion='entropy'),AdaBoostClassifier(n_estimators=42),GaussianNB()]
for i in classifier:
  i.fit(X_train,y_train)
  y_pred=i.predict(X_test)
  print(i,round(accuracy_score(y_test, y_pred)*100,2),'%')

KNeighborsClassifier() 77.57 %
SVC() 80.64 %
RandomForestClassifier(n_estimators=42) 80.25 %
DecisionTreeClassifier(criterion='entropy') 74.0 %
AdaBoostClassifier(n_estimators=42) 79.64 %
GaussianNB() 71.24 %


Dimensionality reduction using PCA on training and testing data 

In [84]:
pca=PCA(n_components=3)
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)


Comparing the Accuracy of Different Classification Algorithms on a Dataset with PCA

In [85]:
classifier=[KNeighborsClassifier(n_neighbors=10),SVC(kernel='rbf'),RandomForestClassifier(n_estimators=42),DecisionTreeClassifier(criterion='entropy'),AdaBoostClassifier(n_estimators=42),GaussianNB()]
for i in classifier:
  i.fit(X_train,y_train)
  y_pred=i.predict(X_test)
  print(i,round(accuracy_score(y_test, y_pred)*100,2),'%')
  
  

KNeighborsClassifier(n_neighbors=10) 73.54 %
SVC() 73.81 %
RandomForestClassifier(n_estimators=42) 72.35 %
DecisionTreeClassifier(criterion='entropy') 67.68 %
AdaBoostClassifier(n_estimators=42) 72.55 %
GaussianNB() 73.12 %


It can be seen that using PCA there is no improvement in accuracy.

The Support Vector Machine has the highest accuracy among all other classifer which is 80.64 %