In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
import scipy.stats
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Combine train and test data and split features/target

In [4]:
data0 = pd.concat([train_data.iloc[:, :-1], test_data]).reset_index(drop=True)
target = train_data['Transported']

In [5]:
data0.shape

(12970, 13)

In [6]:
data0.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [7]:
data0.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [8]:
data0.isnull().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
dtype: int64

In [9]:
data1 = data0.copy()

# Categorical data cleaning

In [10]:
# CrosstabResult = pd.crosstab(index=data0['HomePlanet'],columns=data0['Destination'])
# print(CrosstabResult)

# ChiSqResult = chi2_contingency(CrosstabResult)
# print('\nThe P-Value of the ChiSq Test is:', ChiSqResult[1])

### Let's drop the name column for now

In [11]:
data1

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
12966,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
12967,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
12968,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [12]:
data1 = data1.drop('Name', axis=1)

In [13]:
categorical_columns = [col for col in data1.columns if data1[col].dtypes=='object' and col!='PassengerId']
categorical_imputer = SimpleImputer(strategy='most_frequent')
data1[categorical_columns] = pd.DataFrame(categorical_imputer.fit_transform(data1[categorical_columns]))

In [14]:
data1.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age             270
VIP               0
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
dtype: int64

In [15]:
data2 = data1.copy()

# Numerical data cleaning

In [16]:
numerical_columns = [col for col in data2.columns if data2[col].dtypes=='float64']
numerical_imputer = SimpleImputer(strategy='median')
data2[numerical_columns] = pd.DataFrame(numerical_imputer.fit_transform(data2[numerical_columns]))

In [17]:
data2.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [18]:
data3 = data2.copy()

# Feature Engineering

### Let's discard the number part for now

In [19]:
data3['Deck'] = data3.Cabin.apply(lambda x: x.split('/')[0])
#data3['Num'] = data3.Cabin.apply(lambda x: x.split('/')[1])
data3['Side'] = data3.Cabin.apply(lambda x: x.split('/')[2])

In [20]:
data3 = data3.drop('Cabin', axis=1)

In [21]:
data4 = data3.copy()

# Feature Transformation

### Update categorical_columns

In [22]:
categorical_columns = [col for col in data4.columns if data4[col].dtypes=='object' and col!='Name' and col!='PassengerId']

for col in data4[categorical_columns]:
    dummies = pd.get_dummies(data4[col], prefix=col, drop_first=True)
    data4 = pd.concat([data4, dummies], axis=1)
    data4 = data4.drop([col], axis=1)

In [23]:
skew_cols = []
for col in data4[numerical_columns]:
    if scipy.stats.skew(data4[col])>=5:
        print(col, 'skewness:', scipy.stats.skew(data4[col]))
        skew_cols.append(col)

RoomService skewness: 6.200653248130639
FoodCourt skewness: 7.130413949815388
ShoppingMall skewness: 11.126048320920303
Spa skewness: 7.736207146828869
VRDeck skewness: 8.14228461589503


In [24]:
data5 = data4.copy()

# Feature Scaling

In [25]:
scaler = StandardScaler()

scaler.fit(data5)
data5 = pd.DataFrame(scaler.transform(data5), index=data5.index, columns=data5.columns)

In [26]:
data6 = data5.copy()

# Target Transformation

In [27]:
num_target = target.astype(int)

# Split back to train and test datasets

In [28]:
train_final = data6.iloc[:8692+1, :]
test_final = data6.iloc[8692+1:, :].reset_index(drop=True)

# Based Model

In [29]:
X_train, X_test, y_train, y_test = train_test_split(train_final, num_target, random_state=0)

In [30]:
rf = RandomForestClassifier(n_estimators=100, random_state=0)

rf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [31]:
predictions = rf.predict(X_test)

In [32]:
print(f'Accuracy: {round(accuracy_score(y_test, predictions), 2)}')
print(f'F1: {round(f1_score(y_test, predictions), 2)}')
print(f'Precision {round(precision_score(y_test, predictions), 2)}')

Accuracy: 0.81
F1: 0.81
Precision 0.82


In [33]:
final_predictions = rf.predict(test_final).astype(bool)

In [34]:
submission = pd.concat([pd.Series(test_data.PassengerId), pd.Series(final_predictions, name='Transported')], axis=1)
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [36]:
submission.to_csv('submission_base_model.csv', index=False)#, header=True)