In [1]:
!pip install flaml
!git clone https://github.com/analokmaus/kuma_utils.git

Collecting flaml
  Downloading FLAML-1.0.1-py3-none-any.whl (157 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 KB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting xgboost<=1.3.3,>=0.90
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.5/157.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost, flaml
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.5.2
    Uninstalling xgboost-1.5.2:
      Successfully uninstalled xgboost-1.5.2
Successfully installed flaml-1.0.1 xgboost-1.3.3
[0mCloning into 'kuma_utils'...
remote: Enumerating objects: 895, done.[K
remote: Counting objects: 100% (100/100), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 895 (delta 93), reused 90 (delta 90), pack-reused 795[K
Receiving objects: 100% (895/895), 669.35 KiB | 2.42 MiB/s, done.

# File and Data Field Descriptions
* train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
    * PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
    * HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
    * CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
    * Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    * Destination - The planet the passenger will be debarking to.
    * Age - The age of the passenger.
    * VIP - Whether the passenger has paid for special VIP service during the voyage.
    * RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
    * Name - The first and last names of the passenger.
    * Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

* test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.
* sample_submission.csv - A submission file in the correct format.
    * PassengerId - Id for each passenger in the test set.
    * Transported - The target. For each passenger, predict either True or False.

In [2]:
import numpy as np
import pandas as pd
import sys
sys.path.append("kuma_utils/")
import seaborn as sns
import plotly.express as px
from kuma_utils.preprocessing.imputer import LGBMImputer
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
# Load the competition train / test splits from the Kaggle input directory.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

# Quick diagnostics: cardinality and percentage of missing values per column.
# They are printed explicitly because only a cell's last expression is displayed.
print(train.nunique().sort_values(ascending=False))
print(round(train.isnull().sum() * 100 / len(train), 2).sort_values(ascending=False))

# PassengerId and Name are not used as model features in this notebook.
train = train.drop(columns=['PassengerId', 'Name'])
test = test.drop(columns=['PassengerId', 'Name'])

##  HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

In [4]:
# Distribution of HomePlanet in the training split.
# (The unused `px.data.tips()` sample-dataset load was removed.)
fig = px.histogram(train, x="HomePlanet", title="HomePlanet distribution (train)")
fig.show()

In [5]:
# Distribution of HomePlanet in the test split; titled so it can be told apart from the train plot.
fig = px.histogram(test, x="HomePlanet", title="HomePlanet distribution (test)")
fig.show()

## CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

In [6]:
# Distribution of CryoSleep in the training split.
fig = px.histogram(train, x="CryoSleep", title="CryoSleep distribution (train)")
fig.show()

In [7]:
# Distribution of CryoSleep in the test split; titled so it can be told apart from the train plot.
fig = px.histogram(test, x="CryoSleep", title="CryoSleep distribution (test)")
fig.show()

## Destination - The planet the passenger will be debarking to.

In [8]:
# Distribution of Destination in the training split.
fig = px.histogram(train, x="Destination", title="Destination distribution (train)")
fig.show()

In [9]:
# Distribution of Destination in the test split; titled so it can be told apart from the train plot.
fig = px.histogram(test, x="Destination", title="Destination distribution (test)")
fig.show()

## VIP - Whether the passenger has paid for special VIP service during the voyage.

In [10]:
# Distribution of VIP in the training split.
fig = px.histogram(train, x="VIP", title="VIP distribution (train)")
fig.show()

In [11]:
# Distribution of VIP in the test split; titled so it can be told apart from the train plot.
fig = px.histogram(test, x="VIP", title="VIP distribution (test)")
fig.show()

## Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [12]:
# Class balance of the target column (only present in the training split).
fig = px.histogram(train, x="Transported", title="Transported distribution (train)")
fig.show()

In [13]:
# Frequency of each destination, then a dtype / non-null overview of the training frame.
destination_counts = train['Destination'].value_counts()
print(destination_counts)
print('===========================')
train.info()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(5)
memory usage: 755.7+ KB


In [14]:
# Cabin takes the form deck/num/side: expand it into three columns and drop the raw field.
cabin_fields = ['deck', 'num', 'side']

train[cabin_fields] = train['Cabin'].str.split('/', expand=True)
train = train.drop(columns=['Cabin'])

test[cabin_fields] = test['Cabin'].str.split('/', expand=True)
test = test.drop(columns=['Cabin'])

# Cardinality per column after the split.
train.nunique().sort_values(ascending=False)

num             1817
FoodCourt       1507
Spa             1327
VRDeck          1306
RoomService     1273
ShoppingMall    1115
Age               80
deck               8
HomePlanet         3
Destination        3
CryoSleep          2
VIP                2
Transported        2
side               2
dtype: int64

In [15]:
# Compare the deck letters present in train and test before encoding them.
train_deck = train['deck']
print(train_deck.value_counts())
print('=======================')
print(train_deck.unique().tolist())
print('=======================')
print(test['deck'].unique().tolist())

F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: deck, dtype: int64
['B', 'F', 'A', 'G', nan, 'E', 'D', 'C', 'T']
['G', 'F', 'C', 'B', 'D', 'E', nan, 'A', 'T']


In [16]:
# Ordinal code for each deck letter observed in the data.
deck_codes = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'T':7}
# Columns cast to float before imputation ('num' comes out of the Cabin string split as text).
numeric_cols = ['Age','RoomService','FoodCourt',
                'ShoppingMall','Spa','VRDeck','deck','num']

train['deck'] = train['deck'].replace(deck_codes)
test['deck'] = test['deck'].replace(deck_codes)
train[numeric_cols] = train[numeric_cols].astype('float')
test[numeric_cols] = test[numeric_cols].astype('float')

# One-hot encode the remaining object columns as "<column>__<category>".
# NOTE(review): get_dummies (dummy_na=False by default) encodes a missing category
# as an all-zero row, so missing HomePlanet / CryoSleep / Destination / VIP / side
# values reach the imputer as zeros rather than NaN -- confirm this is intended.
train = pd.get_dummies(train, prefix_sep='__')
test = pd.get_dummies(test, prefix_sep='__')

# The two frames are encoded independently, so a category absent from one split
# would give them different columns. Align test to the training feature columns:
# indicators missing from test are added as 0, indicators unseen in train are dropped.
test = test.reindex(columns=train.columns.drop('Transported'), fill_value=0)

In [17]:
# Feature column names: every column of the encoded training frame except the target.
col = train.columns.drop('Transported').tolist()
col

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'deck',
 'num',
 'HomePlanet__Earth',
 'HomePlanet__Europa',
 'HomePlanet__Mars',
 'CryoSleep__False',
 'CryoSleep__True',
 'Destination__55 Cancri e',
 'Destination__PSO J318.5-22',
 'Destination__TRAPPIST-1e',
 'VIP__False',
 'VIP__True',
 'side__P',
 'side__S']

In [18]:
%%time
lgbm_imtr = LGBMImputer(n_iter=500)

train_iterimp = lgbm_imtr.fit_transform(train[col])
test_iterimp = lgbm_imtr.transform(test[col])

# Create train test imputed dataframe
train_ = pd.DataFrame(train_iterimp, columns=col)
test = pd.DataFrame(test_iterimp, columns=col)

  0%|          | 0/8 [00:00<?, ?it/s]


'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.



  0%|          | 0/8 [00:00<?, ?it/s]

CPU times: user 27.3 s, sys: 701 ms, total: 28 s
Wall time: 7.18 s


In [19]:
# Re-attach the target (it was excluded from the imputed feature columns); aligned on the row index.
train_ = train_.assign(Transported=train['Transported'])

In [20]:
def undummify(df, prefix_sep="__"):
    """Collapse one-hot indicator columns back into single categorical columns.

    Columns named ``<prefix><prefix_sep><category>`` are grouped by ``<prefix>``.
    Each group is replaced by one column named ``<prefix>`` that holds, per row,
    the ``<category>`` of the indicator with the largest value. Columns whose
    name does not contain ``prefix_sep`` are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column names, some of which are one-hot indicators.
    prefix_sep : str, default "__"
        Separator between the original column name and the category label.

    Returns
    -------
    pandas.DataFrame
        One column per distinct prefix, in order of first appearance. An input
        without columns yields an empty frame with the same index.

    Notes
    -----
    Ties within a group (including rows where every indicator is 0) resolve to
    the first indicator column of that group, because ``idxmax`` returns the
    first maximum.
    """
    # prefix -> True when the column belongs to a one-hot group.
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if not needs_to_collapse:
            series_list.append(df[col])
            continue
        # Match on the exact "<prefix><sep>" start. A substring match
        # (filter(like=...)) would also pick up unrelated columns that merely
        # contain the prefix, e.g. "Age" for prefix "A".
        group_prefix = col + prefix_sep
        group_cols = [name for name in df.columns if name.startswith(group_prefix)]
        undummified = (
            df[group_cols]
            .idxmax(axis=1)
            .map(lambda name: name.split(prefix_sep, maxsplit=1)[1])
            .rename(col)
        )
        series_list.append(undummified)
    if not series_list:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(index=df.index)
    return pd.concat(series_list, axis=1)

In [21]:
# Collapse the "__"-separated indicator columns of the imputed training frame back into categoricals.
train = undummify(train_, prefix_sep="__")
train.head(5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,HomePlanet,CryoSleep,Destination,VIP,side,Transported
0,39.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Europa,False,TRAPPIST-1e,False,P,False
1,24.0,109.0,9.0,25.0,549.0,44.0,5.0,0.0,Earth,False,TRAPPIST-1e,False,S,True
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,Europa,False,TRAPPIST-1e,True,S,False
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,Europa,False,TRAPPIST-1e,False,S,False
4,16.0,303.0,70.0,151.0,565.0,2.0,5.0,1.0,Earth,False,TRAPPIST-1e,False,S,True


In [22]:
# Same collapse for the imputed test frame so its columns match the training features.
test = undummify(test, prefix_sep="__")
test.head(5)

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,HomePlanet,CryoSleep,Destination,VIP,side
0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,Earth,True,TRAPPIST-1e,False,S
1,19.0,0.0,9.0,0.0,2823.0,0.0,5.0,4.0,Earth,False,TRAPPIST-1e,False,S
2,31.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,Europa,True,55 Cancri e,False,S
3,38.0,0.0,6652.0,0.0,181.0,585.0,2.0,1.0,Europa,False,TRAPPIST-1e,False,S
4,20.0,10.0,0.0,635.0,0.0,0.0,5.0,5.0,Earth,False,TRAPPIST-1e,False,S


In [23]:
# Create the FLAML AutoML object; it is trained later via automl.fit(...).
automl = AutoML()

In [24]:
# Separate the target from the features without mutating `train`:
# `train.pop(...)` removed the column in place, so re-running the cell raised KeyError.
y = train['Transported']
X = train.drop(columns=['Transported'])

In [25]:
# Hold out 20% of the labelled rows for validation, shuffled and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [26]:
# Search settings are kept in one dict so they are easy to read and tweak.
automl_settings = {
    "task": "classification",
    "metric": 'ap',
    "time_budget": 300,
}
automl.fit(X_train, y_train, **automl_settings)

[flaml.automl: 04-25 13:36:58] {2105} INFO - task = classification
[flaml.automl: 04-25 13:36:58] {2107} INFO - Data split method: stratified
[flaml.automl: 04-25 13:36:58] {2111} INFO - Evaluation method: cv
[flaml.automl: 04-25 13:36:58] {2188} INFO - Minimizing error metric: 1-ap
[flaml.automl: 04-25 13:36:59] {2281} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl: 04-25 13:36:59] {2567} INFO - iteration 0, current learner lgbm
[flaml.automl: 04-25 13:36:59] {2698} INFO - Estimated sufficient time budget=2112s. Estimated necessary time budget=52s.
[flaml.automl: 04-25 13:36:59] {2750} INFO -  at 0.5s,	estimator lgbm's best error=0.1993,	best estimator lgbm's best error=0.1993
[flaml.automl: 04-25 13:36:59] {2567} INFO - iteration 1, current learner lgbm
[flaml.automl: 04-25 13:36:59] {2750} INFO -  at 0.6s,	estimator lgbm's best error=0.1817,	best estimator lgbm's best error=0.1817
[flaml.automl: 0

In [27]:
# Summary of the search: best learner, its configuration, its score (1 - loss) and its training time.
search_summary = (
    automl.best_estimator,
    automl.best_config,
    1 - automl.best_loss,
    automl.best_config_train_time,
)
for summary_item in search_summary:
    print(summary_item)

catboost
{'early_stopping_rounds': 30, 'learning_rate': 0.008785670659077992, 'n_estimators': 1155}
0.9128008195941084
9.687533855438232


In [28]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of a repr full of "\n".
print(classification_report(y_train, automl.predict(X_train)))

'              precision    recall  f1-score   support\n\n       False       0.83      0.82      0.83      3452\n        True       0.82      0.84      0.83      3502\n\n    accuracy                           0.83      6954\n   macro avg       0.83      0.83      0.83      6954\nweighted avg       0.83      0.83      0.83      6954\n'

In [29]:
# Hold-out performance; printed so the report is rendered as a table rather than a raw repr.
print(classification_report(y_test, automl.predict(X_test)))

'              precision    recall  f1-score   support\n\n       False       0.80      0.80      0.80       863\n        True       0.80      0.81      0.80       876\n\n    accuracy                           0.80      1739\n   macro avg       0.80      0.80      0.80      1739\nweighted avg       0.80      0.80      0.80      1739\n'

In [30]:
# Predict the target for the competition test features with the fitted AutoML model.
y_pred = automl.predict(test)
# Peek at the first five predictions only.
y_pred[:5]

array([ True, False,  True,  True, False])

In [31]:
# Fill the sample submission with the predictions and write the submission file.
# NOTE(review): PassengerId was dropped from `test` earlier, so this relies on
# sample_submission.csv listing passengers in the same order as test.csv -- confirm.
sol = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
# Assign the array directly and fail loudly on a size mismatch. Going through an
# index-aligned intermediate DataFrame would silently write NaN for unmatched rows.
assert len(sol) == len(y_pred), "prediction count does not match sample_submission rows"
sol['Transported'] = y_pred
sol.to_csv('./submission.csv', index=False)