In [91]:
import pandas as pd
import numpy as np
# Load the train and test CSVs.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
# Tag test rows with a sentinel label so they can be told apart from training
# rows once both splits live in the same frame.
df_test['Transported'] = 'Test'
# Keep the test passenger ids (in file order) for the submission file.
test_id = df_test['PassengerId']
# ignore_index=True gives the stacked frame unique row labels; without it the
# labels of the two inputs are kept as-is and every label shared by train and
# test appears twice.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [92]:
# Missing VIP flags are treated as "not VIP"; the flag is stored as 0.0 / 1.0.
# Plain assignment is used instead of `df_all[col].fillna(..., inplace=True)`:
# the chained in-place form operates on a temporary Series and is not
# guaranteed to write back into df_all (it never does under pandas
# Copy-on-Write).
df_all['VIP'] = df_all['VIP'].astype(float).fillna(0.0)

# Missing amenity spend is treated as zero spend.
spend_cols = ['VRDeck', 'Spa', 'ShoppingMall', 'FoodCourt', 'RoomService']
df_all[spend_cols] = df_all[spend_cols].fillna(0)

In [93]:
# Fill missing values by assignment rather than chained
# `df_all[col].fillna(..., inplace=True)`, which acts on a temporary Series
# and may leave df_all unchanged (always, under pandas Copy-on-Write).
df_all['Destination'] = df_all['Destination'].fillna('Unknown')
# Missing CryoSleep is treated as "not in cryosleep"; stored as 0.0 / 1.0.
df_all['CryoSleep'] = df_all['CryoSleep'].astype(float).fillna(0.0)
df_all['HomePlanet'] = df_all['HomePlanet'].fillna('Unknown')

# Missing cabins are marked with the integer 0; present cabins are strings
# split on '/', with the first piece used as Deck and the third as Side.
df_all['Cabin'] = df_all['Cabin'].fillna(0)
df_all['Deck'] = df_all['Cabin'].apply(lambda x: 'Unknown' if x == 0 else x.split('/')[0])
df_all['Side'] = df_all['Cabin'].apply(lambda x: 'Unknown' if x == 0 else x.split('/')[2])

# PassengerId is split on '_': the first piece is the group number. The second
# piece is stored under the name 'GroupSize' even though at this point it is
# only the per-passenger number from the id, not a group-level size.
id_parts = df_all['PassengerId'].str.split('_', expand=True)
df_all['GroupNum'] = id_parts[0].astype(int)
df_all['GroupSize'] = id_parts[1].astype(int)

# Spend aggregates over the five amenity columns.
df_all['TotalBill'] = df_all['VRDeck'] + df_all['Spa'] + df_all['ShoppingMall'] + df_all['FoodCourt'] + df_all['RoomService']
df_all['AvgBill'] = df_all['TotalBill']/5
# 1 when the passenger has any non-zero total spend, else 0.
df_all['SpentAny'] = (df_all['TotalBill'] != 0).astype(int)


In [94]:
# At this point 'GroupSize' holds the numeric suffix of each PassengerId, so
# its maximum within a group is taken as that group's size.
# Only this column is aggregated: calling max() on the whole frame also tries
# the mixed-type object columns (e.g. Cabin holds both 0 and strings), which
# older pandas silently dropped with a warning and pandas >= 2.0 rejects with
# a TypeError.
group_sizes = df_all.groupby('GroupNum')[['GroupSize']].max()

# Replace the per-passenger value with the per-group maximum, joined on
# GroupNum; the left frame's row order is preserved by the join.
df_all = df_all.drop(['GroupSize', 'PassengerId'], axis=1)
df_all = df_all.set_index('GroupNum').join(group_sizes)
# GroupNum itself is discarded here (drop=True), leaving a fresh 0..n-1 index.
df_all = df_all.reset_index(drop=True)

# 1 for passengers whose group has exactly one member, else 0.
df_all['Alone'] = (df_all['GroupSize'] == 1).astype(int)

  """Entry point for launching an IPython kernel.


In [95]:
# One-hot encode the categorical columns and append the indicator columns to
# df_all. Both frames get a fresh positional index first so the join lines the
# rows up one-to-one.
categorical_cols = ['Destination', 'HomePlanet', 'Deck', 'Side']
df_all = df_all.reset_index(drop=True)
df_dum = pd.get_dummies(df_all[categorical_cols]).reset_index(drop=True)
df_all = df_all.join(df_dum)

In [96]:
# Name is reduced to a presence flag: 1 when a name was recorded, 0 otherwise.
# (Assignment is used instead of chained `fillna(..., inplace=True)`, which
# may not write back into df_all; if it does not, every Name would map to 1
# and the Age cast below would fail on NaN.)
df_all['Name'] = df_all['Name'].notna().astype(int)

# Missing ages are encoded as -1 so the column can be cast to int.
df_all['Age'] = df_all['Age'].fillna(-1).astype(int)

# Age-band indicator columns (bounds inclusive); -1 marks an unknown age.
df_all['Teen'] = df_all['Age'].between(0, 18).astype(int)
df_all['Age_unknown'] = (df_all['Age'] == -1).astype(int)
df_all['Adol'] = df_all['Age'].between(19, 25).astype(int)
df_all['Elder'] = (df_all['Age'] > 25).astype(int)

# Drop the raw columns that have been replaced by derived features.
df_all = df_all.drop(['Destination', 'HomePlanet', 'Cabin', 'Deck', 'Side', 'Age'], axis = 1)

In [97]:
# Apply log(1 + x) to every spend column, including the derived totals.
# Note: this overwrites the columns, so re-running the cell transforms them again.
log_cols = ['VRDeck', 'Spa', 'ShoppingMall', 'FoodCourt', 'RoomService',
            'TotalBill', 'AvgBill']
df_all[log_cols] = np.log1p(df_all[log_cols])

In [98]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Rows labelled with the 'Test' sentinel are the unlabelled rows to predict;
# everything else is labelled training data.
is_unlabelled = df_all['Transported'] == 'Test'
df_val = df_all[is_unlabelled]
df_train = df_all[~is_unlabelled]

# Features / target for the labelled rows.
X = df_train.drop('Transported', axis = 1)
y = df_train['Transported'].astype(bool)

# Feature matrix for the unlabelled rows.
X_val = df_val.drop('Transported', axis = 1)

# Hold out 20% of the labelled rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [105]:
# Determinant of the correlation matrix of the training frame; a value near
# zero means some columns are (almost) linear combinations of others.
corr = df_train.corr()
# corr.style.background_gradient(cmap='coolwarm')
corr_det = np.linalg.det(corr)
print(corr_det)

-8.686034423878842e-102


In [106]:
import optuna
import sklearn

def objective(trial):
    """Optuna objective: error rate of a LightGBM classifier on the hold-out split.

    Samples `n_estimators` (int, 30..2500) and `lr` (float, 0.05..0.15) from
    the trial, fits an LGBMClassifier on (X_train, y_train) and scores it on
    (X_test, y_test).

    Returns:
        1 - accuracy on the hold-out split (lower is better).
    """
    # Only the number of trees and the learning rate are searched; tree
    # depth / leaf count stay at the LightGBM defaults.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 30, 2500),
        'learning_rate': trial.suggest_float('lr', 0.05, 0.15),
    }

    classifier = lgb.LGBMClassifier(**params)
    classifier.fit(X_train, y_train)

    holdout_accuracy = accuracy_score(y_test, classifier.predict(X_test))
    return 1 - holdout_accuracy

# In-memory study that minimises the objective's return value (the error rate).
study = optuna.create_study(direction='minimize')
# Run 1000 trials of the search.
study.optimize(objective, n_trials=1000)

[32m[I 2022-07-05 15:02:33,176][0m A new study created in memory with name: no-name-139c40b4-be12-42ca-bc37-ccdc90ee4a37[0m
[32m[I 2022-07-05 15:02:35,384][0m Trial 0 finished with value: 0.21334100057504313 and parameters: {'n_estimators': 1100, 'lr': 0.10343628741287818}. Best is trial 0 with value: 0.21334100057504313.[0m
[32m[I 2022-07-05 15:02:35,522][0m Trial 1 finished with value: 0.20184013801035072 and parameters: {'n_estimators': 54, 'lr': 0.14229412617240622}. Best is trial 1 with value: 0.20184013801035072.[0m
[32m[I 2022-07-05 15:02:36,939][0m Trial 2 finished with value: 0.20989074180563538 and parameters: {'n_estimators': 783, 'lr': 0.054422227990039845}. Best is trial 1 with value: 0.20184013801035072.[0m
[32m[I 2022-07-05 15:02:40,273][0m Trial 3 finished with value: 0.2202415181138585 and parameters: {'n_estimators': 1989, 'lr': 0.11143482674061031}. Best is trial 1 with value: 0.20184013801035072.[0m
[32m[I 2022-07-05 15:02:43,027][0m Trial 4 finishe

In [107]:
# Display the hyperparameters of the study's best trial.
study.best_params

{'n_estimators': 85, 'lr': 0.07723786894408977}

In [89]:
# Final classifier with hard-coded hyperparameters.
# NOTE(review): these values differ from the study.best_params output shown
# in this notebook; presumably they come from an earlier tuning run — confirm.
model2_params = {
    'max_depth': 40,
    'num_leaves': 11,
    'n_estimators': 179,
    'learning_rate': 0.06761242685718895,
}
model2 = lgb.LGBMClassifier(**model2_params)
model2.fit(X_train, y_train)

# Accuracy on the 20% hold-out split.
y_pred = model2.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8033352501437608


In [90]:
# 0.80448533640023  (score noted by the author; NOTE(review): source of this number is not shown — confirm)
# Predict the unlabelled rows and wrap the result in a one-column frame.
prediction = pd.DataFrame(model2.predict(X_val), columns=['Transported'])
# Pair each prediction with its PassengerId by row position and write the
# submission file without the index column.
test_id = pd.DataFrame(test_id)
result = test_id.join(prediction)
result.to_csv('submission.csv', index=False)