https://www.kaggle.com/competitions/spaceship-titanic/overview

In [1]:
import sys
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
sys.path.insert(0, '../')
from utils import factorize
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


Preprocessing


In [2]:
train_data = pd.read_csv('Data/train.csv')
validation_data = pd.read_csv('Data/test.csv')

val_ids = validation_data['PassengerId']
train_ids = train_data['PassengerId']

entire_data = pd.concat([train_data, validation_data], axis=0, sort=False)


entire_data = factorize(entire_data, exclude = ['PassengerId', 'Transported'])

validation_data = entire_data[entire_data['PassengerId'].isin(val_ids)].drop('Transported',axis=1)
train_data = entire_data[entire_data['PassengerId'].isin(train_ids)]

train_data = train_data.fillna(train_data.mean())
validation_data = validation_data.fillna(validation_data.mean())


META_COLS = ['PassengerId', 'Name']

train_data = train_data.drop(META_COLS, axis=1)

train_y = train_data['Transported']
train_x = train_data.drop('Transported', axis=1)

In [3]:
X_train, X_test, y_train, y_test= train_test_split(train_x, train_y, test_size=0.1, random_state=42)


lr = LogisticRegression()

lr.fit(X_train, y_train)


lr.score(X_train,y_train), lr.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.7732327751501982, 0.7528735632183908)

In [4]:

def write_predictions(model,fp = 'submission.csv'):

    pred = model.predict(validation_data.drop(META_COLS ,axis=1))

    pred_df = pd.DataFrame()

    pred_df['PassengerId'] = validation_data['PassengerId']

    pred_df['Transported'] = pred

    pred_df.to_csv(fp,index=False)

In [5]:

write_predictions(lr, fp='submission_lr.csv')

XGboost

In [6]:
x_train, x_val, y_train0, y_val0 = train_test_split(X_train, y_train, test_size=0.1) 

print(len(y_train0), len(y_val0),len(y_train))
xgb = XGBClassifier()

xgb.fit(X_train, y_train)


train_pred = xgb.predict(X_train)
test_pred = xgb.predict(X_test)



accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred) 

7040 783 7823


(0.9130768247475393, 0.7758620689655172)

Bayesian Optimization to reduce overfitting

In [7]:

def init_xgb_from_params(space):
    
    for key, value in space.items():
        try:
            space[key] = int(space[key])
        except Exception:  
            continue

    clf=XGBClassifier(
                    **space)
    
    return clf

In [8]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.quniform('n_estimators',50,500,50),
        'seed': 0
    }


def objective(space):
    
    clf  = init_xgb_from_params(space)
    evaluation = [( x_train, y_train0), ( x_val, y_val0)]
    
    clf.fit(x_train, y_train0,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK } # just optimziign accuracy for now


trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)




  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]





SCORE:                                                 
0.7597701149425288                                     
  1%|          | 1/100 [00:00<00:34,  2.88trial/s, best loss: -0.7597701149425288]





SCORE:                                                                            
0.7505747126436781                                                                
  2%|▏         | 2/100 [00:00<00:28,  3.45trial/s, best loss: -0.7597701149425288]





SCORE:                                                                            
0.7620689655172413                                                                
  3%|▎         | 3/100 [00:00<00:28,  3.39trial/s, best loss: -0.7620689655172413]





SCORE:                                                                            
0.7620689655172413                                                                
SCORE:                                                                            
0.7505747126436781                                                                
  5%|▌         | 5/100 [00:01<00:20,  4.67trial/s, best loss: -0.7620689655172413]







SCORE:                                                                            
0.7551724137931034                                                                
  6%|▌         | 6/100 [00:01<00:22,  4.13trial/s, best loss: -0.7620689655172413]





SCORE:                                                                            
0.7528735632183908                                                                
SCORE:                                                                            
0.7448275862068966                                                                
  8%|▊         | 8/100 [00:01<00:20,  4.50trial/s, best loss: -0.7620689655172413]







SCORE:                                                                            
0.7597701149425288                                                                
  9%|▉         | 9/100 [00:02<00:26,  3.49trial/s, best loss: -0.7620689655172413]





SCORE:                                                                            
0.767816091954023                                                                 
 10%|█         | 10/100 [00:02<00:29,  3.10trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.764367816091954                                                                 
SCORE:                                                                            
0.7574712643678161                                                                
 12%|█▏        | 12/100 [00:03<00:22,  3.99trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7528735632183908                                                                
SCORE:                                                                            
0.7494252873563219                                                                
 14%|█▍        | 14/100 [00:03<00:16,  5.07trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7482758620689656                                                                
SCORE:                                                                            
0.7494252873563219                                                                
 16%|█▌        | 16/100 [00:03<00:14,  5.63trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7620689655172413                                                                
 17%|█▋        | 17/100 [00:04<00:17,  4.81trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7517241379310344                                                                
SCORE:                                                                            
0.7528735632183908                                                                
 19%|█▉        | 19/100 [00:04<00:14,  5.58trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7666666666666667                                                                
 20%|██        | 20/100 [00:04<00:18,  4.38trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7609195402298851                                                                
 21%|██        | 21/100 [00:05<00:18,  4.27trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.764367816091954                                                                 
 22%|██▏       | 22/100 [00:05<00:20,  3.75trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7574712643678161                                                                
 23%|██▎       | 23/100 [00:05<00:20,  3.74trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7574712643678161                                                                
SCORE:                                                                            
0.7609195402298851                                                                
 25%|██▌       | 25/100 [00:06<00:16,  4.54trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7655172413793103                                                                
 26%|██▌       | 26/100 [00:06<00:16,  4.36trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7551724137931034                                                                
 27%|██▋       | 27/100 [00:06<00:18,  3.97trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7517241379310344                                                                
 28%|██▊       | 28/100 [00:06<00:17,  4.04trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7666666666666667                                                                
SCORE:                                                                            
0.7609195402298851                                                                
 30%|███       | 30/100 [00:07<00:16,  4.34trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7620689655172413                                                                
 31%|███       | 31/100 [00:07<00:17,  3.86trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7563218390804598                                                                
 32%|███▏      | 32/100 [00:07<00:19,  3.55trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7620689655172413                                                                
SCORE:                                                                            
0.7517241379310344                                                                
 34%|███▍      | 34/100 [00:08<00:18,  3.67trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7563218390804598                                                                
 35%|███▌      | 35/100 [00:08<00:18,  3.53trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.764367816091954                                                                 
SCORE:                                                                            
0.7563218390804598                                                                
 37%|███▋      | 37/100 [00:09<00:15,  4.17trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7609195402298851                                                                
 38%|███▊      | 38/100 [00:09<00:14,  4.26trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7586206896551724                                                                
 39%|███▉      | 39/100 [00:09<00:15,  3.90trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7597701149425288                                                                
SCORE:                                                                            
0.7551724137931034                                                                
 41%|████      | 41/100 [00:10<00:12,  4.65trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7609195402298851                                                                
SCORE:                                                                            
0.7551724137931034                                                                
 43%|████▎     | 43/100 [00:10<00:15,  3.79trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7563218390804598                                                                
 44%|████▍     | 44/100 [00:11<00:14,  3.79trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7563218390804598                                                                
SCORE:                                                                            
0.7620689655172413                                                                
 46%|████▌     | 46/100 [00:11<00:11,  4.55trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7632183908045977                                                                
 47%|████▋     | 47/100 [00:11<00:12,  4.30trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7620689655172413                                                                
 48%|████▊     | 48/100 [00:12<00:12,  4.03trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7597701149425288                                                                
 49%|████▉     | 49/100 [00:12<00:13,  3.92trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7597701149425288                                                                
SCORE:                                                                            
0.7517241379310344                                                                
 51%|█████     | 51/100 [00:12<00:10,  4.47trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7563218390804598                                                                
 52%|█████▏    | 52/100 [00:13<00:10,  4.39trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7586206896551724                                                                
SCORE:                                                                            
 53%|█████▎    | 53/100 [00:13<00:11,  4.13trial/s, best loss: -0.767816091954023]





0.7505747126436781                                                                
SCORE:                                                                            
0.7586206896551724                                                                
 55%|█████▌    | 55/100 [00:13<00:08,  5.00trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.764367816091954                                                                 
 56%|█████▌    | 56/100 [00:13<00:10,  4.21trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7609195402298851                                                                
 57%|█████▋    | 57/100 [00:14<00:11,  3.86trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7574712643678161                                                                
 58%|█████▊    | 58/100 [00:14<00:10,  3.83trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.764367816091954                                                                 
SCORE:                                                                            
0.7620689655172413                                                                
 60%|██████    | 60/100 [00:14<00:09,  4.23trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7563218390804598                                                                
 61%|██████    | 61/100 [00:15<00:09,  4.31trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7494252873563219                                                                
 62%|██████▏   | 62/100 [00:15<00:08,  4.24trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7586206896551724                                                                
SCORE:                                                                            
0.7505747126436781                                                                
 64%|██████▍   | 64/100 [00:15<00:07,  4.69trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7528735632183908                                                                
 65%|██████▌   | 65/100 [00:16<00:07,  4.45trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7666666666666667                                                                
 66%|██████▌   | 66/100 [00:16<00:08,  3.87trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7632183908045977                                                                
SCORE:                                                                            
0.7655172413793103                                                                
 68%|██████▊   | 68/100 [00:16<00:07,  4.03trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7632183908045977                                                                
 69%|██████▉   | 69/100 [00:17<00:07,  3.91trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.764367816091954                                                                 
 70%|███████   | 70/100 [00:17<00:07,  3.95trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7666666666666667                                                                
 71%|███████   | 71/100 [00:17<00:07,  3.77trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7632183908045977                                                                
 72%|███████▏  | 72/100 [00:18<00:07,  3.53trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7597701149425288                                                                
SCORE:                                                                            
0.7655172413793103                                                                
 74%|███████▍  | 74/100 [00:18<00:06,  3.96trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7586206896551724                                                                
 75%|███████▌  | 75/100 [00:18<00:06,  4.05trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7528735632183908                                                                
SCORE:                                                                            
0.7620689655172413                                                                
 77%|███████▋  | 77/100 [00:19<00:05,  4.37trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7563218390804598                                                                
 78%|███████▊  | 78/100 [00:19<00:04,  4.42trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7597701149425288                                                                
 79%|███████▉  | 79/100 [00:19<00:05,  4.04trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7597701149425288                                                                
SCORE:                                                                            
0.7655172413793103                                                                
 81%|████████  | 81/100 [00:20<00:04,  4.14trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.764367816091954                                                                 
 82%|████████▏ | 82/100 [00:20<00:04,  3.91trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.764367816091954                                                                 
 83%|████████▎ | 83/100 [00:20<00:04,  3.58trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7609195402298851                                                                
 84%|████████▍ | 84/100 [00:21<00:04,  3.62trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7620689655172413                                                                
 85%|████████▌ | 85/100 [00:21<00:04,  3.51trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7666666666666667                                                                
 86%|████████▌ | 86/100 [00:21<00:04,  3.44trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7609195402298851                                                                
SCORE:                                                                            
0.7551724137931034                                                                
 88%|████████▊ | 88/100 [00:22<00:02,  4.01trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7551724137931034                                                                
 89%|████████▉ | 89/100 [00:22<00:02,  3.84trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7597701149425288                                                                
 90%|█████████ | 90/100 [00:22<00:02,  3.85trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7632183908045977                                                                
SCORE:                                                                            
0.7528735632183908                                                                
 92%|█████████▏| 92/100 [00:23<00:01,  4.50trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7586206896551724                                                                
 93%|█████████▎| 93/100 [00:23<00:01,  4.31trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7620689655172413                                                                
 94%|█████████▍| 94/100 [00:23<00:01,  4.02trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7563218390804598                                                                
SCORE:                                                                            
0.7632183908045977                                                                
 96%|█████████▌| 96/100 [00:24<00:00,  4.24trial/s, best loss: -0.767816091954023]







SCORE:                                                                            
0.7448275862068966                                                                
 97%|█████████▋| 97/100 [00:24<00:00,  4.20trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7609195402298851                                                                
 98%|█████████▊| 98/100 [00:24<00:00,  3.58trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.7574712643678161                                                                
 99%|█████████▉| 99/100 [00:25<00:00,  3.66trial/s, best loss: -0.767816091954023]





SCORE:                                                                            
0.767816091954023                                                                 
100%|██████████| 100/100 [00:25<00:00,  3.94trial/s, best loss: -0.767816091954023]


In [9]:
print(best_hyperparams)
xgb = init_xgb_from_params(best_hyperparams)

xgb.fit(X_train, y_train) # fit on ALL TRAIN DATA
test_pred = xgb.predict(X_test)
train_pred = xgb.predict(X_train)
accuracy_score(y_train, train_pred) ,accuracy_score(y_test, test_pred) 

{'colsample_bytree': 0.5753966092225853, 'gamma': 3.3021835582533443, 'max_depth': 6.0, 'min_child_weight': 2.0, 'n_estimators': 200.0, 'reg_alpha': 81.0, 'reg_lambda': 0.20466227685256955}


(0.7739997443436022, 0.7632183908045977)