In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

1.   Split the train data into two parts
2.   Train multiple first-level models on Part 1
3.   Use those models to make predictions on Part 2
4.   Use those models to make predictions on the test data
5.   Train a new (second-level) model on Part 2, using the first-level predictions as features
6.   Make predictions on the test data using the second-level model

In [None]:
# Load the raw training data; 'train.csv' is resolved relative to the notebook's working directory.
train = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Count missing values per column to see which columns need dropping or imputing.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Remove the columns that are not used as model features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
train = train.drop(
    columns=['Cabin', 'PassengerId', 'Name', 'Ticket', 'Fare'],
    errors='ignore',
)

In [None]:
# Impute missing ages with the mean age of the training set.
train = train.assign(Age=train['Age'].fillna(train['Age'].mean()))

In [None]:
# Re-check the per-column missing-value counts after dropping columns and filling Age.
train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    2
dtype: int64

In [None]:
# Most frequent embarkation port; used to decide how to fill the missing Embarked values.
train['Embarked'].mode()

0    S
dtype: object

In [None]:
# Fill missing Embarked values with the most frequent port, computed from the
# data instead of hard-coding 'S', so the cell stays correct if the data changes.
# mode() can return several values on a tie; the first one is used.
embarked_mode = train['Embarked'].mode().iloc[0]
train['Embarked'] = train['Embarked'].fillna(embarked_mode)

In [None]:
# Confirm the per-column missing-value counts after filling Embarked.
train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [None]:
# Preview the cleaned frame before encoding the categorical columns.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [None]:
# One-hot encode the two categorical columns; each category becomes its own
# indicator column (Sex_*, Embarked_*).
train = pd.get_dummies(
    data=train,
    columns=['Sex', 'Embarked'],
)

In [None]:
# List the column names produced by the one-hot encoding.
train.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [None]:
# Preview the encoded training frame.
train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1


In [None]:
# Apply to the test set the same preprocessing that was applied to train.
test = pd.read_csv('test.csv')
test = test.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket', 'Fare'])

# Impute with statistics learned from the training data, not from the test set,
# so both sets are transformed identically. train['Age'] was itself mean-filled,
# which leaves its mean unchanged.
test['Age'] = test['Age'].fillna(train['Age'].mean())
# Same fill value that was used for the training set's Embarked column.
test['Embarked'] = test['Embarked'].fillna('S')

test = pd.get_dummies(test, columns=['Sex', 'Embarked'])
# Align the encoded columns with train: a category that is absent from the test
# set would otherwise produce no indicator column and break test[features] later.
test = test.reindex(columns=train.columns.drop('Survived'), fill_value=0)
test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,0,1,0,1,0
1,3,47.0,1,0,1,0,0,0,1
2,2,62.0,0,0,0,1,0,1,0
3,3,27.0,0,0,0,1,0,0,1
4,3,22.0,1,1,1,0,0,0,1


## Step 1: Split train into 2 sets

In [None]:
# Split train into two halves: train_1 fits the first-level models, train_2 fits
# the second-level model. A fixed random_state makes the split (and everything
# downstream) reproducible under Restart & Run All; stratifying keeps the
# survival rate the same in both halves.
train_1, train_2 = train_test_split(
    train,
    test_size=0.5,
    random_state=42,
    stratify=train['Survived'],
)

## Step 2: Train multiple models on train_1

In [None]:
# Input columns fed to every first-level model, and the target column.
features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Sex_female',
    'Sex_male',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
]
label = 'Survived'

In [None]:
# Fit the three first-level models on train_1 only.
# Each model is seeded so that re-running the notebook reproduces the same
# fitted models (and therefore the same second-level weights further down).
SEED = 42
X_train_1, y_train_1 = train_1[features], train_1[label]

gb = GradientBoostingClassifier(random_state=SEED)
gb.fit(X_train_1, y_train_1)

rf = RandomForestClassifier(random_state=SEED)
rf.fit(X_train_1, y_train_1)

xgb = XGBClassifier(random_state=SEED)
xgb.fit(X_train_1, y_train_1)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## Step 3: Make predictions on train_2

In [None]:
# "emb" is short for "embedding": the predictions of the 3 first-level models
# become the feature columns of these two frames.
# Both start out empty and are filled column by column in the following cells.
train_2_emb, test_emb = pd.DataFrame(), pd.DataFrame()

In [None]:
# Collect each first-level model's class predictions on the held-out half
# (train_2); every model contributes one column of second-level features.
for column, estimator in [('gb_pred', gb), ('rf_pred', rf), ('xgb_pred', xgb)]:
    train_2_emb[column] = estimator.predict(train_2[features])

In [None]:
# Preview the first-level predictions that serve as second-level features.
train_2_emb.head()

Unnamed: 0,gb_pred,rf_pred,xgb_pred
0,0,0,0
1,0,0,0
2,0,0,0
3,1,1,1
4,0,0,0


In [None]:
# One column per first-level model.
train_2_emb.columns

Index(['gb_pred', 'rf_pred', 'xgb_pred'], dtype='object')

## Step 4: Make predictions on test

In [None]:
# Build the same three prediction columns for the test set, using the models
# that were fitted on train_1.
base_models = {'gb_pred': gb, 'rf_pred': rf, 'xgb_pred': xgb}
for column, estimator in base_models.items():
    test_emb[column] = estimator.predict(test[features])

In [None]:
# Preview the first-level predictions for the test set.
test_emb.head()

Unnamed: 0,gb_pred,rf_pred,xgb_pred
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


## Step 5: Train a new model on train_2 using predictions as features

In [None]:
# Second-level (stacking) model: its inputs are the first-level models'
# predictions rather than the original passenger features.
emb_features = ['gb_pred', 'rf_pred', 'xgb_pred']
model = LogisticRegression()
# Features: the 3 models' predictions on train_2. Target: the true label of train_2.
# The rows of train_2_emb were generated from train_2 in order, so row i of the
# features belongs to row i of the labels.
model.fit(X=train_2_emb[emb_features], y=train_2[label])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Inspect the second-level model's learned weights, in the order of emb_features
# (gb_pred, rf_pred, xgb_pred). A higher weight means the second-level model
# relies more on that first-level model.
# In the recorded output RandomForest has the largest weight, followed by XGBoost
# and then GradientBoosting; this ranking depends on the particular split and
# fitted models, so it may differ in another run.
model.coef_

array([[0.65185596, 1.24644208, 1.06351645]])

In [None]:
# Final stacked predictions: the second-level model applied to the first-level
# models' predictions on the test set.
preds = model.predict(test_emb[emb_features])

In [None]:
# Display the predicted Survived class (0/1) for every test row.
preds

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,

In [None]:
# Save the predictions keyed by PassengerId.
# PassengerId was dropped from `test` during preprocessing, so it is re-read from
# the source file; row order is unchanged, so ids and predictions line up.
# index=False avoids writing the positional index as an extra unnamed column.
passenger_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': preds})
submission.to_csv('test23.csv', index=False)