In [153]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [154]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False
df = pd.concat([df_train, df_test], sort = False)
df.drop(['Name', 'PassengerId'], axis=1, inplace=True)




In [155]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [156]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
df = df.drop(columns=['Cabin'])
df['Deck'] = df['Deck'].fillna('Unknown')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('Unknown')


In [157]:
df['Deck'] = df['Deck'].map({'G' : 0, 'F' : 1, 'E' : 2, 'D' : 3, 'C' : 4, 'B' : 5, 'A' : 6, 'U' : 7, 'T' : 8})
df['Side'] = df['Side'].map({'U' : -1, 'P' : 1, 'S' : 2})

In [158]:
impute_lis = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
rest = list(set(df.columns) - set(impute_lis))
df_rest = df[rest]
imp = KNNImputer(n_neighbors = 5)
df_imputed = imp.fit_transform(df[impute_lis])
df_imputed = pd.DataFrame(df_imputed, columns= impute_lis)
df = pd.concat([df_rest.reset_index(drop = True), df_imputed.reset_index(drop = True)], axis=1)

In [159]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')
category_colls = ['HomePlanet', 'Destination']

for col in category_colls:
    df = pd.concat([df, pd.get_dummies(df[col], prefix = col)], axis = 1)
    
df = df.drop(columns= category_colls)

In [160]:
bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['total_spent'] = df[bill_cols].sum(axis = 1)
df['std_total_spent'] = df[bill_cols].std(axis = 1)
df['mean_total_spent'] = df[bill_cols].mean(axis = 1)

df['the_firts_three'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['three_low'] = df['mean_total_spent'] + df['HomePlanet_Earth'] + df['total_spent']


In [161]:
df.corr()['Transported'].sort_values(ascending= False)

Transported                  1.000000
CryoSleep                    0.324373
the_firts_three              0.284177
HomePlanet_Europa            0.131977
Deck                         0.086721
Destination_55 Cancri e      0.083625
Side                         0.073111
FoodCourt                    0.034739
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004175
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050550
Destination_TRAPPIST-1e     -0.072731
HomePlanet_Earth            -0.119644
std_total_spent             -0.121144
total_spent                 -0.140481
mean_total_spent            -0.140481
three_low                   -0.140505
VRDeck                      -0.142783
Spa                         -0.154759
RoomService                 -0.175031
Name: Transported, dtype: float64

In [162]:
df.head()

Unnamed: 0,Transported,Age,VIP,Num,CryoSleep,Side,Deck,RoomService,FoodCourt,ShoppingMall,...,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U,total_spent,std_total_spent,mean_total_spent,the_firts_three,three_low
0,False,39.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,...,False,False,False,True,False,0.0,0.0,0.0,1.0,0.0
1,True,24.0,0.0,0.0,0.0,2.0,1.0,109.0,9.0,25.0,...,False,False,False,True,False,736.0,227.807375,147.2,0.0,884.2
2,False,58.0,1.0,0.0,0.0,2.0,6.0,43.0,3576.0,0.0,...,False,False,False,True,False,10383.0,3013.383198,2076.6,1.0,12459.6
3,False,33.0,0.0,0.0,0.0,2.0,6.0,0.0,1283.0,371.0,...,False,False,False,True,False,5176.0,1373.410427,1035.2,1.0,6211.2
4,True,16.0,0.0,1.0,0.0,2.0,1.0,303.0,70.0,151.0,...,False,False,False,True,False,1091.0,223.988169,218.2,0.0,1310.2


In [163]:
df_train, df_test = df[:df_train.shape[0]], df[df_train.shape[0]:]
df_test = df_test.drop(columns= 'Transported')
df_train.shape, df_test.shape

((8693, 25), (4277, 24))

In [164]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [165]:
X = df_train.drop(columns = 'Transported')
y = df_train['Transported']

X_train, X_test, y_train, y_test =train_test_split(X, y, test_size = 0.2, random_state =42)

model = LGBMClassifier()

In [166]:
model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2718
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


0.8021851638872916

In [169]:
df_dummy = pd.read_csv('test.csv')
pred = model.predict(df_test)

final = pd.DataFrame()
final['PassengerId'] = df_dummy['PassengerId']
final['Transported'] = pred

final.to_csv('submission.csv', index = False)