In [72]:
import sys
import numpy as np
from pathlib import Path
import pandas as pd
import torch
import openml
import os
import time
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from pathlib import Path
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 

from tabpfn.scripts.transformer_prediction_interface import TabPFNClassifier
from tabpfn.scripts.decision_boundary import DecisionBoundaryDisplay
import re
from sklearn.preprocessing import StandardScaler
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier

In [73]:
# Load the raw training and test tables from the local Data directory.
train_full, test_full = (
    pd.read_csv(csv_path) for csv_path in ('./Data/train.csv', './Data/test.csv')
)

In [74]:
# Work on copies so the frames loaded from disk stay unmodified.
train, test = train_full.copy(), test_full.copy()

In [75]:
def get_title(name):
    """Extract the honorific (e.g. ``Mr``, ``Miss``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period.

    Args:
        name: Passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns:
        The matched title without the trailing period, or ``""`` when
        ``name`` is not a string or contains no such pattern.
    """
    # Missing values reach this function as NaN/None rather than text;
    # treat them as "no title" instead of letting re.search raise.
    if not isinstance(name, str):
        return ""
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    return title_search.group(1) if title_search else ""

def create_extra_features(data):
    """Add engineered columns to ``data`` in place and return it.

    Columns added:
        Name_Words_Count: number of whitespace-separated tokens in ``Name``.
        Has_Cabin: 1 when ``Cabin`` holds a value, 0 when it is missing.
        FamilySize: ``SibSp + Parch + 1``.
        Title: integer code of the honorific found in ``Name``
            (Mr=1, Miss=2, Mrs=3, Master=4, Rare=5, anything else=0).

    Args:
        data: DataFrame with ``Name``, ``Cabin``, ``SibSp`` and ``Parch``
            columns.

    Returns:
        The same DataFrame object, with the new columns attached.
    """
    data['Name_Words_Count'] = data['Name'].str.split().str.len()
    # notna() recognises every missing-value marker (NaN as well as None),
    # unlike a check on the Python type of each cell.
    data['Has_Cabin'] = data['Cabin'].notna().astype('int64')
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # One lookup table replaces the chain of replace() calls: French/alternate
    # spellings share the code of their English equivalent and uncommon
    # honorifics share the "Rare" code.
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona', 'Rare']
    title_codes = {"Mr": 1, "Miss": 2, "Mlle": 2, "Ms": 2,
                   "Mrs": 3, "Mme": 3, "Master": 4}
    title_codes.update(dict.fromkeys(rare_titles, 5))

    # Unknown or absent titles become 0; the cast keeps the column integer
    # even when such a title is present (map() would otherwise yield floats).
    data['Title'] = (
        data['Name'].apply(get_title).map(title_codes).fillna(0).astype('int64')
    )
    return data

# Attach the engineered columns to both the training and the test frame.
train, test = (create_extra_features(frame) for frame in (train, test))

In [76]:
def drop_unecessary_columns(df, columns_to_drop=('PassengerId', 'Ticket', 'Name', 'Cabin')):
    """Remove, in place, columns that are not used as model features.

    Args:
        df: DataFrame to modify in place.
        columns_to_drop: Column labels to remove. Defaults to the identifier
            and free-text columns this notebook discards ('Embarked' was
            noted as a further candidate).

    Returns:
        None; ``df`` itself is modified.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    df.drop(columns=list(columns_to_drop), inplace=True)
    
# Strip the unused columns from both frames (each is modified in place).
for frame in (train, test):
    drop_unecessary_columns(frame)

In [77]:
target = 'Survived'

# Separate the label column from the predictors and hold out 25% of the rows
# for validation, using a fixed seed for the split.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=target),
    train[target],
    test_size=0.25,
    random_state=42,
)

In [78]:
def get_rid_of_nans(df):
    """Impute missing ``Age``, ``Fare`` and ``Embarked`` values in place.

    ``Age`` and ``Fare`` are filled with the mean of the rows sharing the
    same (``Sex``, ``Title``) pair. When such a group has no observed value
    at all, the column-wide mean is used instead. ``Embarked`` is filled
    with its most frequent value.

    Args:
        df: DataFrame with ``Sex``, ``Title``, ``Age``, ``Fare`` and
            ``Embarked`` columns.

    Returns:
        None; ``df`` itself is modified.
    """
    group_keys = ['Sex', 'Title']
    for column in ('Age', 'Fare'):
        # Taken before filling so the fallback reflects observed values only.
        overall_mean = df[column].mean()
        group_means = df.groupby(group_keys)[column].transform('mean')
        df[column] = df[column].fillna(group_means).fillna(overall_mean)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df['Embarked']: that form acts on an intermediate object and does
    # not update df when pandas Copy-on-Write is enabled.
    embarked_modes = df['Embarked'].mode()
    if not embarked_modes.empty:  # an all-missing column has no mode
        df['Embarked'] = df['Embarked'].fillna(embarked_modes.iloc[0])

# Impute each split separately; every frame is modified in place.
for frame in (X_train, X_val, test):
    get_rid_of_nans(frame)

In [79]:
def add_new_features(df: pd.DataFrame) -> None:
    """Add age-based indicator columns to ``df`` in place.

    Columns added:
        is_child: 1 when ``Age`` is below 18, else 0.
        is_adult_man: 1 when ``Sex`` is ``'male'`` and ``Age`` is at least
            18, else 0.

    Rows with a missing ``Age`` get 0 in both columns, because comparisons
    against NaN are False.

    Args:
        df: DataFrame with ``Age`` and ``Sex`` columns.

    Returns:
        None; ``df`` itself is modified.
    """
    # Column-wise comparisons instead of a per-row apply: same values, and
    # they also work when the frame has no rows.
    df['is_child'] = (df['Age'] < 18).astype('int64')
    df['is_adult_man'] = ((df['Sex'] == 'male') & (df['Age'] >= 18)).astype('int64')
    
# Attach the age indicator columns to every split (in place).
for frame in (X_train, X_val, test):
    add_new_features(frame)

In [80]:
def encoding_columns(df):
    """Return a numerically encoded copy of ``df``.

    ``Sex`` becomes an int32 flag (1 for ``'male'``, 0 otherwise) and every
    remaining text/categorical column is one-hot encoded.

    The input frame is left untouched, so calling this twice on the same
    frame gives the same result. (Overwriting ``Sex`` in place would make a
    second call see 0/1 values and encode every row as 0.)

    Args:
        df: DataFrame with a ``Sex`` column.

    Returns:
        A new DataFrame with the encoded columns.
    """
    encoded = df.assign(Sex=(df['Sex'] == 'male').astype('int32'))
    # Pin the indicator dtype: recent pandas versions default to bool, and a
    # frame mixing bool with int/float columns converts to an object array
    # in to_numpy().
    return pd.get_dummies(encoded, dtype='uint8')

X_train = encoding_columns(X_train)
# Each split is one-hot encoded on its own, so a category absent from one
# split would give it a different set of columns. Align validation and test
# to the training layout: missing indicators become 0, unseen ones are
# dropped, and the column order matches.
X_val = encoding_columns(X_val).reindex(columns=X_train.columns, fill_value=0)
test = encoding_columns(test).reindex(columns=X_train.columns, fill_value=0)

In [81]:
float_features = ['Age', 'Fare']
scaler = StandardScaler()
# Learn the mean and standard deviation from the training split only, then
# apply that same transformation to validation and test. Re-fitting on each
# split would scale them differently from the data the model was trained on.
X_train[float_features] = scaler.fit_transform(X_train[float_features])
X_val[float_features] = scaler.transform(X_val[float_features])
test[float_features] = scaler.transform(test[float_features])

In [82]:
# Show the preprocessed training features as the cell output.
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Name_Words_Count,Has_Cabin,FamilySize,Title,is_child,is_adult_man,Embarked_C,Embarked_Q,Embarked_S
298,1,1,0.198765,0,0,-0.032568,3,1,1,1,0,1,0,0,1
884,3,1,-0.344123,0,0,-0.487331,4,0,1,1,0,1,0,0,1
247,2,0,-0.419647,0,2,-0.342854,4,0,3,3,0,0,0,0,1
478,3,1,-0.570694,0,0,-0.478201,4,0,1,1,0,1,0,0,1
305,1,1,-2.162730,1,2,2.314937,4,1,4,4,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,3,0,-0.646217,0,0,-0.475695,4,0,1,2,0,0,0,0,1
270,1,1,0.198765,0,0,-0.022872,3,0,1,1,0,1,0,0,1
860,3,1,0.864253,2,0,-0.350450,4,0,3,1,0,1,0,0,1
435,1,0,-1.174882,1,2,1.703093,4,1,4,2,1,0,0,0,1


In [83]:
# Timer covering both fitting and validation prediction; the duration
# printed below is the sum of the two.
start = time.perf_counter()

# Run TabPFN on the GPU when PyTorch can see one; otherwise fall back to the
# CPU so the cell does not fail on machines without CUDA.
tabpfn_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# define the base models
level0 = [
    ('xgb', xgb.XGBClassifier(objective="binary:logistic", random_state=42)),
    ('lgb', lgb.LGBMClassifier()),
    ('catb', CatBoostClassifier(verbose=0)),
    ('tabpfn', TabPFNClassifier(device=tabpfn_device)),
]
# define meta learner model
level1 = LogisticRegression()
# define the stacking ensemble
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

model.fit(X_train.to_numpy(), y_train.to_numpy())

y_eval = model.predict(X_val.to_numpy())
print(f'Prediction time: {time.perf_counter() - start}, Accuracy : {accuracy_score(y_val, y_eval)}')

Loading models_diff/prior_diff_real_checkpoint_n_0_epoch_100.cpkt
Loading....
Using style prior: True
MODEL BUILDER <module 'tabpfn.priors.differentiable_prior' from '/home/beefsports/miniconda3/envs/geo/lib/python3.9/site-packages/tabpfn/priors/differentiable_prior.py'> <function get_model.<locals>.make_get_batch.<locals>.new_get_batch at 0x7f79705fb4c0>
Using cuda device
init dist
Not using distributed
DataLoader.__dict__ {'num_steps': 8192, 'get_batch_kwargs': {'batch_size': 1, 'eval_pos_seq_len_sampler': <function train.<locals>.eval_pos_seq_len_sampler at 0x7f79705fbca0>, 'seq_len_maximum': 10, 'device': 'cuda', 'num_features': 100, 'hyperparameters': {'lr': 0.0001, 'dropout': 0.0, 'emsize': 512, 'batch_size': 1, 'nlayers': 12, 'num_features': 100, 'nhead': 4, 'nhid_factor': 2, 'bptt': 10, 'eval_positions': [972], 'seq_len_used': 50, 'sampling': 'mixed', 'epochs': 400, 'num_steps': 8192, 'verbose': False, 'mix_activations': True, 'nan_prob_unknown_reason_reason_prior': 1.0, 'categ

In [30]:
# Prediction time: 6.959797382354736, Accuracy : 0.8340807174887892 - with PFN
# Prediction time: 11.511653423309326, Accuracy : 0.8609865470852018 - xgb, catboost, lgb & PFN
# Create submission

# Predict labels for the test features with the fitted ensemble.
pred = model.predict(test.to_numpy())
pred_df = pd.DataFrame(pred, columns=['Survived'])

# Pair each prediction with its passenger id; the two frames line up on
# their row index.
predictions = test_full[['PassengerId']].copy()
predictions['Survived'] = pred_df['Survived']

predictions.to_csv('submission.csv', index=False)

  and should_run_async(code)
