In [288]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [289]:
df = pd.read_csv('train.csv')

In [290]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [291]:
features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age']
label = 'Transported'

In [292]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

class FeatureLabel(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        label_encoder = LabelEncoder()
        X['HomePlanet'] = label_encoder.fit_transform(X['HomePlanet'])
        X['CryoSleep'] = label_encoder.fit_transform(X['CryoSleep'])
        X['Destination'] = label_encoder.fit_transform(X['Destination'])
        X['VIP'] = label_encoder.fit_transform(X['VIP'])
        X['VRDeck'] = label_encoder.fit_transform(X['VRDeck'])
        X['Transported'] = label_encoder.fit_transform(X['Transported'])

        return X

In [293]:
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [294]:
df.dropna(inplace=True)

In [295]:
from sklearn.impute import SimpleImputer

class FeatureImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        imputer = SimpleImputer(strategy='mean')
        X['Age'] = imputer.fit_transform(X[['Age']])
        return X

In [296]:
class FeatureFillValue(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['HomePlanet'] = X['HomePlanet'].bfill()
        X['CryoSleep'] = X['CryoSleep'].bfill()
        X['Destination'] = X['Destination'].bfill()
        X['VIP'] = X['VIP'].bfill()
        X['VRDeck'] = X['VRDeck'].bfill()
        X['Transported'] = X['Transported'].bfill()

        return X


In [297]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop(['Cabin', 'PassengerId' 'Name'], axis=1, errors='ignore')

In [298]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('feature_label', FeatureLabel()),
    ('feature_dropper', FeatureDropper())
])

In [299]:
train_data = pipeline.fit_transform(df)

In [300]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6606 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   6606 non-null   object 
 1   HomePlanet    6606 non-null   int32  
 2   CryoSleep     6606 non-null   int32  
 3   Destination   6606 non-null   int32  
 4   Age           6606 non-null   float64
 5   VIP           6606 non-null   int32  
 6   RoomService   6606 non-null   float64
 7   FoodCourt     6606 non-null   float64
 8   ShoppingMall  6606 non-null   float64
 9   Spa           6606 non-null   float64
 10  VRDeck        6606 non-null   int64  
 11  Name          6606 non-null   object 
 12  Transported   6606 non-null   int64  
dtypes: float64(5), int32(4), int64(2), object(2)
memory usage: 619.3+ KB


In [301]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0,Maham Ofracculy,0
1,0002_01,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44,Juanna Vines,1
2,0003_01,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49,Altark Susent,0
3,0003_02,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,180,Solam Susent,0
4,0004_01,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2,Willy Santantines,1


In [302]:
from sklearn.preprocessing import StandardScaler

X = train_data[features]
y = train_data[label]

scaler = StandardScaler()

X_scaler = scaler.fit_transform(X)
y_scaler = y.to_numpy()

In [303]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaler, y_scaler, test_size=0.2, random_state=42)

In [304]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

In [305]:
from sklearn.linear_model import LogisticRegression

model_1 = LogisticRegression()

model_1.fit(X_train, y_train)

In [306]:
pred_1 = model_1.predict(X_test)

In [307]:
from sklearn.metrics import accuracy_score

print('LogisticRegression Model:', accuracy_score(y_test, pred_1))

LogisticRegression Model: 0.7208774583963692


In [308]:
from sklearn.ensemble import RandomForestClassifier

model_2 = RandomForestClassifier(n_estimators=30, max_features=4)

model_2.fit(X_train, y_train)

In [309]:
pred_2 = model_2.predict(X_test)

In [310]:
print('RandomForestClassifier:', accuracy_score(y_test, pred_2))

RandomForestClassifier: 0.726928895612708


In [311]:
from sklearn.naive_bayes import BernoulliNB

model_3 = BernoulliNB()

model_3.fit(X_train, y_train)

In [312]:
pred_3 = model_3.predict(X_test)

In [313]:
print("BernoulliNB:", accuracy_score(y_test, pred_3))

BernoulliNB: 0.7170953101361573


In [314]:
from sklearn.svm import SVC

model_4 = SVC(C=1)

model_4.fit(X_train, y_train)

In [315]:
pred_4 = model_4.predict(X_test)

In [316]:
print('SVC:', accuracy_score(y_test, pred_4))

SVC: 0.7390317700453858


In [317]:
df_2 = pd.read_csv('test.csv')

In [318]:
df_2.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [319]:
class FeatureFillValue_1(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['HomePlanet'] = X['HomePlanet'].bfill()
        X['CryoSleep'] = X['CryoSleep'].bfill()
        X['Destination'] = X['Destination'].bfill()
        X['VIP'] = X['VIP'].bfill()
        X['VRDeck'] = X['VRDeck'].bfill()
        X['Age'] = X['Age'].bfill()
        X['RoomService'] = X['RoomService'].bfill()
        X['FoodCourt'] = X['FoodCourt'].bfill()
        X['ShoppingMall'] = X['ShoppingMall'].bfill()
        X['Spa'] = X['Spa'].bfill()

        return X

In [320]:
class FeatureLabel_1(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        label_encoder = LabelEncoder()
        X['HomePlanet'] = label_encoder.fit_transform(X['HomePlanet'])
        X['CryoSleep'] = label_encoder.fit_transform(X['CryoSleep'])
        X['Destination'] = label_encoder.fit_transform(X['Destination'])
        X['VIP'] = label_encoder.fit_transform(X['VIP'])
        X['VRDeck'] = label_encoder.fit_transform(X['VRDeck'])

        return X

In [321]:
pipeline_2 = Pipeline([
    ('feature_fillvalue_1', FeatureFillValue_1()),
    ('feature_dropper', FeatureDropper()),
    ('feature_label_1', FeatureLabel_1())
])

In [322]:
test_data = pipeline_2.fit_transform(df_2)

In [323]:
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,0,1,2,27.0,0,0.0,0.0,0.0,0.0,0,Nelly Carsoning
1,0018_01,0,0,2,19.0,0,0.0,9.0,0.0,2823.0,0,Lerome Peckers
2,0019_01,1,1,0,31.0,0,0.0,0.0,0.0,0.0,0,Sabih Unhearfus
3,0021_01,1,0,2,38.0,0,0.0,6652.0,0.0,181.0,359,Meratz Caltilter
4,0023_01,0,0,2,20.0,0,10.0,0.0,635.0,0.0,0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,0,1,2,34.0,0,0.0,0.0,0.0,0.0,0,Jeron Peter
4273,9269_01,0,0,2,42.0,0,0.0,847.0,17.0,10.0,131,Matty Scheron
4274,9271_01,2,1,0,43.0,0,0.0,0.0,0.0,0.0,0,Jayrin Pore
4275,9273_01,1,0,1,43.0,0,0.0,2680.0,0.0,0.0,334,Kitakan Conale


In [324]:
from sklearn.preprocessing import StandardScaler

X_test_data = test_data[features]

In [325]:
X_data = scaler.fit_transform(X_test_data)

In [326]:
test_pred_1 = model_1.predict(X_data)
test_pred_2 = model_2.predict(X_data)
test_pred_3 = model_3.predict(X_data)
test_pred_4 = model_4.predict(X_data)

In [327]:
sub = pd.read_csv('sample_submission.csv')

In [328]:
bool_value = {False: 0, True: 1}
y_test_org = sub['Transported'].map(bool_value)

In [329]:
print('Accuracy Score Model 1:', accuracy_score(y_test_org, test_pred_1))
print('Accuracy Score Model 2:', accuracy_score(y_test_org, test_pred_2))
print('Accuracy Score Model 3:', accuracy_score(y_test_org, test_pred_3))
print('Accuracy Score Model 4:', accuracy_score(y_test_org, test_pred_4))

Accuracy Score Model 1: 0.6179565115735328
Accuracy Score Model 2: 0.5595043254617723
Accuracy Score Model 3: 0.6046294131400515
Accuracy Score Model 4: 0.6144493804068272


In [330]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression()

log.fit(Xtrain, ytrain)

In [331]:
pred_log = log.predict(Xtest)

In [332]:
print(accuracy_score(ytest, pred_log))

0.7193645990922845


In [333]:
pred_log_test = log.predict(X_data)



In [334]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=5, n_jobs=-1)

rfc.fit(Xtrain, ytrain)

In [335]:
pred_rfc = rfc.predict(Xtest)

In [336]:
print(accuracy_score(ytest, pred_rfc))

0.7382753403933434


In [337]:
p = rfc.predict(X_data)



In [338]:
rfc_test = rfc.predict(X_test_data)

In [339]:
print(accuracy_score(y_test_org, rfc_test))

0.5983165770399813


In [340]:
svc_model_4 = model_4.predict(X_data)

In [341]:
print(accuracy_score(y_test_org, svc_model_4))

0.6144493804068272


In [342]:
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  4277 non-null   object
 1   Transported  4277 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 37.7+ KB


In [380]:
prediction = pd.DataFrame({'PassengerId': sub['PassengerId'], 'Transported': test_pred_4})

In [381]:
prediction

Unnamed: 0,PassengerId,Transported
0,0013_01,1
1,0018_01,0
2,0019_01,1
3,0021_01,0
4,0023_01,0
...,...,...
4272,9266_02,1
4273,9269_01,0
4274,9271_01,1
4275,9273_01,0


In [382]:
bool_value = {0: False, 1: True}
prediction['Transported'] = prediction['Transported'].map(bool_value)

In [383]:
prediction.to_csv('ar_submission_v6.csv', index=False)