In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,FunctionTransformer,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score

from xgboost import XGBClassifier

In [2]:
# Relative locations of the training and test CSV files.
train_path, test_path = "data/train.csv", "data/test.csv"

In [3]:
# Read both CSV files into DataFrames in one pass.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [4]:
# Separate the 'Transported' label from the predictor columns.
y = train['Transported']
X = train.drop(columns=['Transported'])

In [5]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## BASELINE MODEL

In [6]:
# Partition the feature names by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Only the last expression of a cell is displayed, so echo the categorical names.
categorical_cols

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',
       'Name'],
      dtype='object')

In [7]:
# Numerical columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [8]:
# Baseline estimator: a 100-tree random forest with a fixed seed and n_jobs=4.
rf_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=4,
    random_state=42,
)

In [9]:
# Chain preprocessing and the random forest so both are fitted and applied together.
baseline_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf_model),
])

In [10]:
# Fit baseline model
baseline_pipeline.fit(X_train, y_train)

In [11]:
# Get predictions for validation dataset
baseline_pred = baseline_pipeline.predict(X_valid)

In [12]:
# Accuracy of the baseline on the validation split.
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value is the
# same as before, but the call now matches the documented argument order.
baseline_accuracy = accuracy_score(y_valid, baseline_pred)
baseline_accuracy

0.7826336975273146

In [13]:
# Evaluate distribution of predictions
pd.Series(baseline_pred).value_counts()

False    883
True     856
Name: count, dtype: int64

In [14]:
# Get predictions for test dataset
preds_test = baseline_pipeline.predict(test)

#### Submit Baseline Prediction

In [15]:
sample = pd.read_csv("data/sample_submission.csv")

In [16]:
sample.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [17]:
# Build the submission frame. The IDs are taken from `test` itself (the frame the
# predictions were computed on) so each prediction is paired with its own passenger,
# rather than relying on the sample file happening to list rows in the same order.
submission_df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': preds_test
})

In [18]:
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [19]:
# save submission file
submission_df.to_csv('baseline_submission.csv', index=False)

# IMPROVE_MODEL

#### Feature engineering

In [20]:
def create_cabin_features(row):
    """Split a row's ``Cabin`` string into ``Deck``, ``Number`` and ``Side``.

    Meant to be used with ``DataFrame.apply(create_cabin_features, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A single row containing a ``Cabin`` entry of the form ``deck/number/side``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``Deck``, ``Number`` and ``Side`` set. The three
        values are the raw string pieces of ``Cabin`` (``Number`` is not cast to
        a numeric type). They are all ``NaN`` when ``Cabin`` is missing, is not a
        string, or does not consist of exactly three ``/``-separated parts.
    """
    # Work on a copy: pandas does not support functions that mutate the object
    # passed in by ``apply``.
    out = row.copy()

    cabin = out['Cabin']
    parts = cabin.split('/') if isinstance(cabin, str) else []
    if len(parts) != 3:
        # Missing or malformed cabin: treat every component as unknown.
        parts = [np.nan, np.nan, np.nan]

    # Label-by-label assignment also works when the columns were not pre-created.
    out['Deck'], out['Number'], out['Side'] = parts
    return out

In [21]:
# Seed the three cabin columns with 0, then fill them row by row from 'Cabin'.
X_train = (
    X_train
    .assign(Deck=0, Number=0, Side=0)
    .apply(create_cabin_features, axis=1)
)

In [22]:
# Same cabin split for the validation rows: seed the columns, then fill them per row.
X_valid = (
    X_valid
    .assign(Deck=0, Number=0, Side=0)
    .apply(create_cabin_features, axis=1)
)

In [23]:
def create_spending_features(row, thresholds=None):
    """Add 0/1 "big spender" flags derived from the amenity spending columns.

    Meant to be used with ``DataFrame.apply(create_spending_features, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A single row containing the spending columns named in ``thresholds``.
    thresholds : iterable of (flag_name, source_column, limit), optional
        Each flag is 1 when ``row[source_column] > limit`` and 0 otherwise.
        Defaults to the five flags and cut-offs this notebook has always used.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with one extra entry per flag. A missing (NaN) amount
        yields 0, because ``NaN > limit`` is False.
    """
    if thresholds is None:
        thresholds = (
            ('Bigspenders_RoomService', 'RoomService', 1000),
            # NOTE(review): this cut-off is 10x the others — confirm it is intentional.
            ('Bigspenders_Food', 'FoodCourt', 10000),
            ('Bigspenders_Mall', 'ShoppingMall', 2000),
            ('Bigspenders_Spa', 'Spa', 1000),
            ('Bigspenders_VR', 'VRDeck', 1000),
        )

    # Work on a copy: pandas does not support functions that mutate the object
    # passed in by ``apply``.
    out = row.copy()
    for flag_name, source_column, limit in thresholds:
        out[flag_name] = 1 if row[source_column] > limit else 0
    return out

In [24]:
# Append the big-spender flags to both splits.
X_train, X_valid = (
    split.apply(create_spending_features, axis=1)
    for split in (X_train, X_valid)
)

In [25]:
# Recompute the column groups on the engineered training frame: object columns are
# treated as categorical, everything else as numerical.
categorical_cols = X_train.select_dtypes(include='object').columns
numerical_cols = X_train.select_dtypes(exclude='object').columns

# Only the last expression of a cell is displayed, so echo the categorical names.
categorical_cols

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP',
       'Name', 'Deck', 'Number', 'Side'],
      dtype='object')

In [26]:
# Rebuild the column transformer so it picks up the refreshed column lists.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [27]:
# Gradient-boosted classifier: up to 1000 boosting rounds, tree depth capped at 10, n_jobs=4.
xgb_model = XGBClassifier(
    n_estimators=1000,
    max_depth=10,
    n_jobs=4,
)

In [28]:
# Preprocessing
# Wrap the column transformer in a single-step pipeline (no estimator attached).
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Transform the data
# The preprocessing is fitted on the training split only and then reused for validation.
# NOTE(review): X_train / X_valid are overwritten with the transformed output, so this
# cell is presumably not safe to re-run without re-executing the feature cells above — confirm.
X_train = my_pipeline.fit_transform(X_train)
X_valid = my_pipeline.transform(X_valid)

# Fit the model
# The validation split is passed as eval_set together with early_stopping_rounds=5.
# NOTE(review): the same split is scored in the following cells, so that accuracy may be
# optimistic — consider a separate hold-out.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the estimator
# rather than as a fit() argument — verify against the installed version.
xgb_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)




In [29]:
# get predictions for validation dataset
predictions = xgb_model.predict(X_valid)

In [30]:
# Accuracy of the XGBoost model on the validation split.
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value is the
# same as before, but the call now matches the documented argument order.
accuracy = accuracy_score(y_valid, predictions)
accuracy

0.78205865439908

#### Train with the whole train data

In [31]:
# Intentionally left without code: this cell used to seed the cabin columns and build the
# spending flags on X, but X is re-created from `train` in the very next cell, so that
# work (a row-wise apply over the full training frame) was thrown away.

In [32]:
# Start again from the raw training frame: label in y, predictors in X.
y = train['Transported']
X = train.drop(columns=['Transported'])

In [33]:
# Seed the three cabin columns with 0 before they are filled from 'Cabin'.
X = X.assign(Deck=0, Number=0, Side=0)
X

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Deck,Number,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,0,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,0,0,0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,0,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,0,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,0,0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,0,0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,0,0,0
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,0,0


In [34]:
X = X.apply(create_cabin_features, axis=1)
X

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Deck,Number,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,E,608,S


In [35]:
X = X.apply(create_spending_features, axis=1)
X

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Deck,Number,Side,Bigspenders_RoomService,Bigspenders_Food,Bigspenders_Mall,Bigspenders_Spa,Bigspenders_VR
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,Maham Ofracculy,B,0,P,0,0,0,0,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,44.0,Juanna Vines,F,0,S,0,0,0,0,0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,49.0,Altark Susent,A,0,S,0,0,0,1,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,193.0,Solam Susent,A,0,S,0,0,0,1,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,2.0,Willy Santantines,F,1,S,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,...,74.0,Gravior Noxnuther,A,98,P,0,0,0,1,0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,...,0.0,Kurta Mondalley,G,1499,S,0,0,0,0,0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,...,0.0,Fayey Connon,G,1500,S,0,0,0,0,0
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,...,3235.0,Celeon Hontichre,E,608,S,0,0,0,0,1


In [36]:
# Preprocessing-only pipeline for the full training set; it wraps the same
# `preprocessor` object used above, which is re-fitted here.
final_pipeline = Pipeline([('preprocessor', preprocessor)])

# Replace X with its transformed representation.
X = final_pipeline.fit_transform(X)

# Refit the classifier on every training row.
# NOTE(review): no eval_set / early stopping is passed here, unlike the validated fit
# above — confirm that training all configured rounds is intended.
xgb_model.fit(X, y)

In [37]:
# Seed the cabin columns with 0 and append the big-spender flags to the test rows.
test = (
    test
    .assign(Deck=0, Number=0, Side=0)
    .apply(create_spending_features, axis=1)
)

In [38]:
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Deck,Number,Side,Bigspenders_RoomService,Bigspenders_Food,Bigspenders_Mall,Bigspenders_Spa,Bigspenders_VR
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,...,0.0,Nelly Carsoning,0,0,0,0,0,0,0,0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,...,0.0,Lerome Peckers,0,0,0,0,0,0,1,0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,...,0.0,Sabih Unhearfus,0,0,0,0,0,0,0,0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,...,585.0,Meratz Caltilter,0,0,0,0,0,0,0,0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,...,0.0,Brence Harperez,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,...,0.0,Jeron Peter,0,0,0,0,0,0,0,0
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,...,144.0,Matty Scheron,0,0,0,0,0,0,0,0
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,...,0.0,Jayrin Pore,0,0,0,0,0,0,0,0
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,...,523.0,Kitakan Conale,0,0,0,0,0,0,0,0


In [39]:
test = test.apply(create_cabin_features, axis=1)
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Deck,Number,Side,Bigspenders_RoomService,Bigspenders_Food,Bigspenders_Mall,Bigspenders_Spa,Bigspenders_VR
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,...,0.0,Nelly Carsoning,G,3,S,0,0,0,0,0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,...,0.0,Lerome Peckers,F,4,S,0,0,0,1,0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,...,0.0,Sabih Unhearfus,C,0,S,0,0,0,0,0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,...,585.0,Meratz Caltilter,C,1,S,0,0,0,0,0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,...,0.0,Brence Harperez,F,5,S,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,...,0.0,Jeron Peter,G,1496,S,0,0,0,0,0
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,...,144.0,Matty Scheron,,,,0,0,0,0,0
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,...,0.0,Jayrin Pore,D,296,P,0,0,0,0,0
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,...,523.0,Kitakan Conale,D,297,P,0,0,0,0,0


In [40]:
# apply pipeline transformations to test dataset
test = final_pipeline.transform(test)

In [41]:
# get predictions for test dataset
preds_test = xgb_model.predict(test)

In [42]:
# Convert the 0/1 class predictions to booleans with one vectorised comparison
# (no Python-level loop; the result is a bool array even when there are no rows).
preds_test = np.asarray(preds_test) == 1

In [43]:
# Assemble the submission frame. IDs come from the sample file because `test` has
# already been replaced by the transformed feature matrix at this point.
passenger_ids = sample['PassengerId']
submission_df = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': preds_test})

submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [44]:
# save submission file
submission_df.to_csv('submission.csv', index=False)

### SET-UP GRIDSEARCH

In [45]:
#from sklearn.model_selection import GridSearchCV

#model = XGBClassifier()

#pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

#param_grid = {
#    'model__max_depth': [2, 3, 5, 7, 10],
#    'model__n_estimators': [10, 100, 500, 1000],
#}

#grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=4, scoring='roc_auc')

#grid.fit(X_train, y_train)

#mean_score = grid.cv_results_["mean_test_score"][grid.best_index_]
#std_score = grid.cv_results_["std_test_score"][grid.best_index_]

#grid.best_params_, mean_score, std_score

#print(f"Best parameters: {grid.best_params_}")
#print(f"Mean CV score: {mean_score: .6f}")
#print(f"Standard deviation of CV score: {std_score: .6f}")