In [1]:
# For loading Packages
import pandas as pd

# Set pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# For mathematical calculations
import numpy as np

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

# To build and evaluate model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.tree import plot_tree, DecisionTreeClassifier

import tensorflow as tf
# To ignore any warnings
import warnings
warnings.filterwarnings("ignore")

# Create a new model

In [6]:
# Load train.csv and test.csv into df_train / df_test
df_train, df_test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
# Load the pre-processed passenger table. As the output below shows, it already holds
# engineered columns (CabinDeck, CabinSide, Group_size, Family_size, *_exp, ...) next to
# the original fields; it does not contain the Transported label.
datam = pd.read_csv('updated (1).csv')
datam

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,CabinDeck,CabinNo.,CabinSide,Group,Group_size,Last_name,Family_size,IsAlone,Luxury_exp,Regular_exp,Total_exp
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,B,0,P,1,1,Ofracculy,3.0,True,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,F,0,S,2,1,Vines,4.0,True,702.0,34.0,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,A,0,S,3,2,Susent,7.0,False,6807.0,3576.0,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,A,0,S,3,2,Susent,7.0,False,3522.0,1654.0,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,F,1,S,4,1,Santantines,9.0,True,870.0,221.0,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,G,1496,S,9266,2,Peter,10.0,False,0.0,0.0,0.0
12966,9269_01,Earth,False,G/160/P,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,F,82,S,9269,1,Scheron,6.0,True,154.0,864.0,1018.0
12967,9271_01,Mars,True,D/296/P,55 Cancri e,28.771969,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,D,296,P,9271,1,Pore,4.0,True,0.0,0.0,0.0
12968,9273_01,Europa,False,D/297/P,TRAPPIST-1e,28.771969,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,D,297,P,9273,1,Conale,7.0,True,523.0,2680.0,3203.0


In [4]:
# Count missing values per column (largest first) to confirm that none remain
datam.isna().sum().sort_values(ascending=False)

PassengerId     0
HomePlanet      0
Regular_exp     0
Luxury_exp      0
IsAlone         0
Family_size     0
Last_name       0
Group_size      0
Group           0
CabinSide       0
CabinNo.        0
CabinDeck       0
Name            0
VRDeck          0
Spa             0
ShoppingMall    0
FoodCourt       0
RoomService     0
VIP             0
Age             0
Destination     0
Cabin           0
CryoSleep       0
Total_exp       0
dtype: int64

In [7]:
# Attach the "Transported" label from train.csv. The left join keeps every row of datam,
# so passengers that are not in df_train end up with NaN in that column.
labels = df_train[['PassengerId','Transported']]
full_tran = datam.merge(labels, how='left', on='PassengerId')
full_tran


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,CabinDeck,CabinNo.,CabinSide,Group,Group_size,Last_name,Family_size,IsAlone,Luxury_exp,Regular_exp,Total_exp,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,B,0,P,1,1,Ofracculy,3.0,True,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,F,0,S,2,1,Vines,4.0,True,702.0,34.0,736.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,A,0,S,3,2,Susent,7.0,False,6807.0,3576.0,10383.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,A,0,S,3,2,Susent,7.0,False,3522.0,1654.0,5176.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,F,1,S,4,1,Santantines,9.0,True,870.0,221.0,1091.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,G,1496,S,9266,2,Peter,10.0,False,0.0,0.0,0.0,
12966,9269_01,Earth,False,G/160/P,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,F,82,S,9269,1,Scheron,6.0,True,154.0,864.0,1018.0,
12967,9271_01,Mars,True,D/296/P,55 Cancri e,28.771969,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,D,296,P,9271,1,Pore,4.0,True,0.0,0.0,0.0,
12968,9273_01,Europa,False,D/297/P,TRAPPIST-1e,28.771969,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,D,297,P,9273,1,Conale,7.0,True,523.0,2680.0,3203.0,


In [8]:
# Remove the columns that are not used as model features
unused_columns = ['PassengerId','Cabin','Name','CabinNo.','Group','Last_name']
full_tran = full_tran.drop(columns=unused_columns)

In [9]:
# Split by label availability: rows with a Transported value form the training set,
# rows without one form the test set
has_label = full_tran['Transported'].notnull()
train_full = full_tran.loc[has_label]
test_full = full_tran.loc[~has_label]

In [10]:
# Sanity check: print (rows, columns) of the training split, then of the test split
print(train_full.shape, test_full.shape, sep='\n')

(8693, 19)
(4277, 19)


In [11]:
# Preview the first rows of the labelled training split
train_full.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,CabinSide,Group_size,Family_size,IsAlone,Luxury_exp,Regular_exp,Total_exp,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,P,1,3.0,True,0.0,0.0,0.0,False
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,S,1,4.0,True,702.0,34.0,736.0,True
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,S,2,7.0,False,6807.0,3576.0,10383.0,False
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,A,S,2,7.0,False,3522.0,1654.0,5176.0,False
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,F,S,1,9.0,True,870.0,221.0,1091.0,True


In [12]:
# Feature frame for the model: every training column except the target
X = train_full.drop(columns=['Transported'])

In [13]:
# Group the feature columns by dtype: numeric ones versus object/boolean ones

numeric_part = X.select_dtypes(include=['int','float64'])
categorical_part = X.select_dtypes(include=['object','boolean'])
numerical_columns = numeric_part.columns
categorical_columns = categorical_part.columns
print(numerical_columns, categorical_columns, sep='\n')

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Group_size', 'Family_size', 'Luxury_exp', 'Regular_exp', 'Total_exp'],
      dtype='object')
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck',
       'CabinSide', 'IsAlone'],
      dtype='object')


In [14]:
# One-hot encode the columns listed in categorical_columns; each becomes one indicator
# column per observed value (e.g. HomePlanet_Earth), all other columns are left unchanged
X_encoded = pd.get_dummies(X, columns=categorical_columns)


In [15]:
# Inspect the encoded training features (numeric columns first, then the indicator columns)
X_encoded

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group_size,Family_size,Luxury_exp,Regular_exp,Total_exp,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,IsAlone_False,IsAlone_True
0,39.0,0.0,0.0,0.0,0.0,0.0,1,3.0,0.0,0.0,0.0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,1
1,24.0,109.0,9.0,25.0,549.0,44.0,1,4.0,702.0,34.0,736.0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,2,7.0,6807.0,3576.0,10383.0,0,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,1,0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,2,7.0,3522.0,1654.0,5176.0,0,1,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,1,0
4,16.0,303.0,70.0,151.0,565.0,2.0,1,9.0,870.0,221.0,1091.0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,1,3.0,1717.0,6819.0,8536.0,0,1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1
8689,18.0,0.0,0.0,0.0,0.0,0.0,1,5.0,0.0,0.0,0.0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1
8690,26.0,0.0,0.0,1872.0,1.0,0.0,1,8.0,1.0,1872.0,1873.0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,1
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,2,7.0,3588.0,1049.0,4637.0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0


In [16]:
# Target vector: the Transported label of the training rows, cast to 0/1 integers
y = train_full['Transported'].astype(int)

In [None]:
# Baseline: a RandomForestClassifier with library-default hyperparameters (only the seed
# is fixed for reproducibility), fitted on the full encoded training set
baseline_model = RandomForestClassifier(random_state = 1)
baseline_model.fit(X_encoded, y)

In [None]:
# 20-fold cross-validated accuracy of the baseline model (one score per fold)
result = cross_val_score(estimator=baseline_model, X=X_encoded, y=y, scoring="accuracy", cv=20)

# Mean and standard deviation of the fold accuracies
print(result.mean())
print(result.std())

0.7882228931617141
0.017045522775471476


In [None]:
# Hyperparameter search over tree depth and forest size

paramgrid = dict(
    max_depth=list(range(1, 20, 2)),
    n_estimators=list(range(1, 200, 20)),
)
base_forest = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(estimator=base_forest, param_grid=paramgrid)

# Run the search on the encoded training data
grid_search.fit(X_encoded, y)

# Display the best estimator found by the search
grid_search.best_estimator_

In [None]:
# Random forest with max_depth=9 and 181 trees, fitted on the full encoded training set.
# NOTE(review): presumably these are the values reported by the grid search above — the
# output of that cell is not preserved, so confirm before relying on it.
improved_model = RandomForestClassifier(max_depth=9, n_estimators=181, random_state = 1)
improved_model.fit(X_encoded, y)

In [None]:
# 20-fold cross-validated accuracy of improved_model (one score per fold)
result = cross_val_score(estimator=improved_model, X=X_encoded, y=y, scoring="accuracy", cv=20)

# Mean and standard deviation of the fold accuracies
print(result.mean(), result.std(), sep="\n")

0.8024932464643253
0.019284692405503633


In [None]:
# Search grid: tree depth, forest size and minimum samples required to split a node.
# Both ranges start above zero. The original range(0, ...) starts put max_depth=0 and
# n_estimators=0 into the grid; RandomForestClassifier rejects both values, so those
# candidates could only produce failed fits (silenced by the notebook's global
# warnings filter) and wasted search time.
paramgrid = {'max_depth': list(range(2, 20, 2)),
             'n_estimators': list(range(15, 200, 15)),
             'min_samples_split': [2, 5, 10]
             }
# Exhaustive search over paramgrid, starting from a seeded random forest
base_forest = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(estimator=base_forest, param_grid=paramgrid)

# Run the search on the encoded training data
grid_search.fit(X_encoded, y)

# Display the best estimator found by the search
grid_search.best_estimator_

In [None]:
# Random forest with max_depth=8, min_samples_split=5 and 135 trees, fitted on the full
# encoded training set.
# NOTE(review): presumably the best estimator from the preceding grid search — that
# cell's output is not preserved, so confirm.
improved_model1 = RandomForestClassifier(max_depth=8, min_samples_split=5, n_estimators=135,
                       random_state=1)
improved_model1.fit(X_encoded, y)

In [None]:
# 20-fold cross-validated accuracy of improved_model1 (one score per fold)
result = cross_val_score(estimator=improved_model1, X=X_encoded, y=y, scoring="accuracy", cv=20)

# Mean and standard deviation of the fold accuracies
print(result.mean())
print(result.std())

0.8014566449494147
0.01838413040275716


In [None]:
# Hyperparameter search over the minimum split size only (everything else at defaults)
paramgrid = {'min_samples_split': [2, 5, 10]}
base_forest = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(estimator=base_forest, param_grid=paramgrid)

# Run the search on the encoded training data
grid_search.fit(X_encoded, y)

# Display the best estimator found by the search
grid_search.best_estimator_

In [None]:
# Random forest with max_depth=9, min_samples_split=10 and 181 trees
improved_model2 = RandomForestClassifier(
    n_estimators=181,
    max_depth=9,
    min_samples_split=10,
    random_state=1,
)
improved_model2.fit(X_encoded, y)

# 20-fold cross-validated accuracy of improved_model2 (one score per fold)
result = cross_val_score(estimator=improved_model2, X=X_encoded, y=y, scoring="accuracy", cv=20)

# Mean and standard deviation of the fold accuracies
print(result.mean())
print(result.std())

0.8036421420626093
0.01850305152273474


In [None]:
# Search grid: depth, forest size, split/leaf minimums and per-split feature budget.
# max_features no longer lists 'auto': for classifiers it was only an alias of 'sqrt'
# (so the original grid fitted every candidate twice), and it was deprecated in
# scikit-learn 1.1 and removed in 1.3. 'log2' is searched instead so that this
# dimension compares two genuinely different settings.
paramgrid = {'max_depth': list(range(1, 20, 2)),
             'n_estimators': list(range(1, 200, 20)),
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4],
             'max_features': ['sqrt', 'log2']
             }
# Exhaustive search over paramgrid, starting from a seeded random forest
base_forest = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(estimator=base_forest, param_grid=paramgrid)

# Run the search on the encoded training data
grid_search.fit(X_encoded, y)

# Display the best estimator found by the search
grid_search.best_estimator_

In [None]:
# Random forest with max_depth=9, min_samples_leaf=4 and 41 trees.
# max_features='sqrt' replaces the original 'auto': for classifiers 'auto' meant exactly
# 'sqrt', but the alias was deprecated in scikit-learn 1.1 and removed in 1.3, where
# passing it makes the fit fail. 'sqrt' gives the same model on every version.
improved_model3 = RandomForestClassifier(max_depth=9, max_features='sqrt', min_samples_leaf=4,
                       n_estimators=41, random_state=1)
improved_model3.fit(X_encoded, y)

# 20-fold cross-validated accuracy of improved_model3 (one score per fold)
result = cross_val_score(improved_model3, X_encoded, y, cv = 20, scoring = "accuracy")

# print mean and standard deviation of the fold accuracies
print(np.mean(result))
print(np.std(result))

0.8049070395677737
0.020697919329389323


In [None]:
# Larger search: 10 depths x 10 forest sizes x 9 split minimums x 9 leaf minimums
# = 8,100 candidate settings, so expect this cell to take a very long time to run
paramgrid = dict(
    max_depth=list(range(5, 15)),
    n_estimators=list(range(100, 600, 50)),
    min_samples_split=list(range(10, 100, 10)),
    min_samples_leaf=list(range(10, 100, 10)),
)
base_forest = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(estimator=base_forest, param_grid=paramgrid)

# Run the search on the encoded training data
grid_search.fit(X_encoded, y)

# Display the best estimator found by the search
grid_search.best_estimator_

In [18]:
# Final model used for the submission: max_depth=8, min_samples_split=10, 450 trees
improved_model4 = RandomForestClassifier(
    n_estimators=450,
    max_depth=8,
    min_samples_split=10,
    random_state=1,
)
improved_model4.fit(X_encoded, y)

# 50-fold cross-validated accuracy of improved_model4 (one score per fold)
result = cross_val_score(estimator=improved_model4, X=X_encoded, y=y, scoring="accuracy", cv=50)

# Mean and standard deviation of the fold accuracies
print(result.mean())
print(result.std())

0.8022556640754768
0.034282878937041884


## Test data predictions

In [19]:
# Feature frame for the unlabelled rows: drop the (all-missing) target column
X_test = test_full.drop(columns=['Transported'])
X_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,CabinSide,Group_size,Family_size,IsAlone,Luxury_exp,Regular_exp,Total_exp
8693,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S,1,8.0,True,0.0,0.0,0.0
8694,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S,1,4.0,True,2823.0,9.0,2832.0
8695,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S,1,6.0,True,0.0,0.0,0.0
8696,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S,1,2.0,True,766.0,6652.0,7418.0
8697,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S,1,5.0,True,10.0,635.0,645.0


In [20]:
# One-hot encode the test features, then align them to the training design matrix.
# get_dummies only creates indicator columns for the values present in the frame it is
# given, so encoding the test split on its own can produce missing, extra or reordered
# columns relative to X_encoded. Reindexing on the training columns guarantees the model
# receives exactly the columns it was fitted on, in the same order: categories absent
# from the test split become all-zero columns, and categories unseen in training are dropped.
X_test_encoded = (
    pd.get_dummies(X_test, columns=categorical_columns)
    .reindex(columns=X_encoded.columns, fill_value=0)
)
X_test_encoded

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group_size,Family_size,Luxury_exp,Regular_exp,Total_exp,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S,IsAlone_False,IsAlone_True
8693,27.000000,0.0,0.0,0.0,0.0,0.0,1,8.0,0.0,0.0,0.0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,1
8694,19.000000,0.0,9.0,0.0,2823.0,0.0,1,4.0,2823.0,9.0,2832.0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1
8695,31.000000,0.0,0.0,0.0,0.0,0.0,1,6.0,0.0,0.0,0.0,0,1,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1
8696,38.000000,0.0,6652.0,0.0,181.0,585.0,1,2.0,766.0,6652.0,7418.0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,1
8697,20.000000,10.0,0.0,635.0,0.0,0.0,1,5.0,10.0,635.0,645.0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,34.000000,0.0,0.0,0.0,0.0,0.0,2,10.0,0.0,0.0,0.0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,1,0
12966,42.000000,0.0,847.0,17.0,10.0,144.0,1,6.0,154.0,864.0,1018.0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1
12967,28.771969,0.0,0.0,0.0,0.0,0.0,1,4.0,0.0,0.0,0.0,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1
12968,28.771969,0.0,2680.0,0.0,0.0,523.0,1,7.0,523.0,2680.0,3203.0,0,1,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,1


In [21]:
# Predict with the final model and wrap the labels in a one-column "Transported" frame
prediction = improved_model4.predict(X_test_encoded)
prediction_df = pd.DataFrame(np.array(prediction).reshape(-1, 1), columns=["Transported"])
prediction_df

Unnamed: 0,Transported
0,1
1,0
2,1
3,1
4,1
...,...
4272,1
4273,1
4274,1
4275,1


In [22]:
# Passenger ids from test.csv as a one-column frame
test_id = df_test['PassengerId'].to_frame()
test_id

Unnamed: 0,PassengerId
0,0013_01
1,0018_01
2,0019_01
3,0021_01
4,0023_01
...,...
4272,9266_02
4273,9269_01
4274,9271_01
4275,9273_01


In [23]:
# Attach the predicted labels to the passenger ids. DataFrame.join aligns on the index,
# so this relies on both frames sharing the same 0..n-1 index (as their outputs show).
# NOTE(review): it also assumes the test rows of `datam` are in the same order as
# test.csv — that ordering is not checked anywhere in this notebook; confirm.
predictions = test_id.join(prediction_df)
predictions

Unnamed: 0,PassengerId,Transported
0,0013_01,1
1,0018_01,0
2,0019_01,1
3,0021_01,1
4,0023_01,1
...,...,...
4272,9266_02,1
4273,9269_01,1
4274,9271_01,1
4275,9273_01,1


In [24]:
# Convert the 0/1 predictions back to False/True labels
label_to_bool = {0: False, 1: True}
predictions['Transported'] = predictions['Transported'].map(label_to_bool)
predictions

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [25]:
# Write the PassengerId / Transported table to disk without the index column
predictions.to_csv('predictions8.csv', index=False)