In [1004]:
import pandas as pd
import numpy as np
# Load both CSV splits. The test frame gets the sentinel value 'Test' in
# 'Transported' so its rows can be told apart once both frames are stacked.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv').assign(Transported='Test')
# Keep the test ids aside; they are needed again when the submission is written.
test_id = df_test['PassengerId']

In [1005]:
# Stack train and test rows so every preprocessing step is applied to both.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each frame keeps its own row labels, and those labels overlap.
df_all = pd.concat([df_train, df_test], ignore_index=True)
# The identifier column is removed so it is not used as a feature.
df_all = df_all.drop(columns='PassengerId')

In [1006]:
# Results are assigned back to the column instead of calling
# fillna(..., inplace=True) on df_all[...]: that form mutates an intermediate
# Series and does not update df_all when pandas Copy-on-Write is active.

# A missing VIP flag is treated as False; the flag is then stored as 0.0 / 1.0.
df_all['VIP'] = df_all['VIP'].fillna(False).astype(float)

# Missing values in these columns are replaced by 0.
zero_fill_cols = ['VRDeck', 'Spa', 'ShoppingMall', 'FoodCourt', 'RoomService']
df_all[zero_fill_cols] = df_all[zero_fill_cols].fillna(0)

In [1007]:
from sklearn.preprocessing import OneHotEncoder
# Results are assigned back to the column instead of calling
# fillna(..., inplace=True) on df_all[...]: that form mutates an intermediate
# Series and does not update df_all when pandas Copy-on-Write is active.
df_all['Destination'] = df_all['Destination'].fillna('Unknown')
# A missing CryoSleep flag is treated as False; the flag is stored as 0.0 / 1.0.
df_all['CryoSleep'] = df_all['CryoSleep'].fillna(False).astype(float)
df_all['HomePlanet'] = df_all['HomePlanet'].fillna('Unknown')

# 'Cabin' is split on '/': the first field becomes 'Deck' and the third 'Side'.
# A missing cabin (or one with fewer than three fields) yields 'Unknown'.
# 'Cabin' itself is left untouched here; no integer placeholder is written
# into the string column.
cabin_fields = df_all['Cabin'].str.split('/')
df_all['Deck'] = cabin_fields.str.get(0).fillna('Unknown')
df_all['Side'] = cabin_fields.str.get(2).fillna('Unknown')

In [1008]:
# One-hot encode the categorical columns; dummy_na=True additionally emits a
# '<column>_nan' indicator for each encoded column.
categorical_cols = ['Destination', 'HomePlanet', 'Deck', 'Side']
df_dum = pd.get_dummies(df_all[categorical_cols], dummy_na=True)
print(len(df_dum))

# Give both frames the same 0..n-1 index so the new columns line up row by row.
df_all = df_all.reset_index(drop=True)
df_dum = df_dum.reset_index(drop=True)
df_all = pd.concat([df_all, df_dum], axis=1)

# The raw categorical columns and the source 'Cabin' column are no longer needed.
df_all = df_all.drop(columns=categorical_cols + ['Cabin'])

12970


In [1009]:
# Reduce 'Name' to a 0/1 flag: 1 when a name is present, 0 when it is missing.
# The result is assigned back rather than using fillna(..., inplace=True) on
# df_all['Name'], which does not update df_all under pandas Copy-on-Write.
# Filling with 0 before comparing keeps the cell safe to re-run on the flag.
df_all['Name'] = df_all['Name'].fillna(0).ne(0).astype(int)

# Replace 'Age' with the integer index of its quantile bin (6 bins);
# labels=False returns bin indices instead of interval labels.
df_all['Age'] = pd.qcut(df_all['Age'], 6, labels=False)

In [1010]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Rows carrying the 'Test' sentinel are the unlabelled rows to predict;
# every other row is labelled training data.
is_test_row = df_all['Transported'] == 'Test'
df_val = df_all[is_test_row]
df_train = df_all[~is_test_row]

# Features / target for the labelled rows, and features for the rows to predict.
X = df_train.drop(columns='Transported')
y = df_train['Transported'].astype(bool)
X_val = df_val.drop(columns='Transported')

# Hold out 20% of the labelled rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [1011]:
import optuna
import sklearn

def objective(trial):
    """Optuna objective: hold-out error rate of a LightGBM classifier.

    Draws four hyperparameters from ``trial``, fits an ``LGBMClassifier`` on
    the module-level ``X_train`` / ``y_train`` and evaluates it on the
    module-level ``X_test`` / ``y_test``.

    Args:
        trial: Object providing ``suggest_int`` and ``suggest_float``.

    Returns:
        ``1 - accuracy`` on the hold-out split, so lower values are better.
    """
    # The learning rate is registered with the trial under the name 'lr'
    # but passed to the classifier as ``learning_rate``.
    sampled_params = {
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 50),
        'n_estimators': trial.suggest_int('n_estimators', 20, 500),
        'learning_rate': trial.suggest_float('lr', 0.01, 0.2),
    }

    classifier = lgb.LGBMClassifier(**sampled_params)
    classifier.fit(X_train, y_train)
    holdout_pred = classifier.predict(X_test)

    return 1 - accuracy_score(y_test, holdout_pred)

# New in-memory study. The objective returns an error rate, so it is minimised
# ('minimize' is also the default direction of create_study).
study = optuna.create_study(direction='minimize')
# The search itself is left disabled; uncomment to run 300 trials.
# study.optimize(objective, n_trials=300)

[I 2022-07-04 17:57:35,456] A new study created in memory with name: no-name-bf81a8ac-b126-4462-a889-f63d5ceaa6e2


In [1012]:
# Final classifier with fixed hyperparameters
# (presumably taken from an earlier Optuna search; TODO confirm).
model = lgb.LGBMClassifier(
    max_depth=20,
    num_leaves=12,
    n_estimators=33,
    learning_rate=0.189,
).fit(X_train, y_train)

# Accuracy on the hold-out split.
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8010350776308223


In [1013]:
# 0.8010350776308223  <- hold-out accuracy printed for the model used here
prediction = model.predict(X_val)

# Pair ids and predictions by position instead of joining on the index, and
# without rebinding `test_id` / `prediction` to DataFrames, so the cell gives
# the same result when it is run again. np.ravel accepts the ids whether they
# are held as a Series or as a single-column DataFrame.
passenger_ids = np.ravel(test_id)
assert len(passenger_ids) == len(prediction), 'expected one prediction per test id'

result = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': prediction})
result.to_csv('submission.csv', index=False)