In [2]:
#https://www.kaggle.com/c/tabular-playground-series-apr-2021/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


In [2]:
plt.rc('figure', figsize=(12.0, 6.0))

In [3]:
####################################
# Importing data and merging
####################################

# Reading dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
ids = test['PassengerId']
# Adding a column in each dataset before merging
train['Type'] = 'train'
test['Type'] = 'test'

# Merging train and test
data = train.append(test)

####################################
# Missing values and new features
####################################
    
# New feature : Family_size
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
data.loc[:,'FsizeD'] = 'Alone'
data.loc[(data['Family_Size'] > 1),'FsizeD'] = 'Small'
data.loc[(data['Family_Size'] > 4),'FsizeD'] = 'Big'

# Replacing missing Fare by median/Pclass 
fa = data[data["Pclass"] == 3]
data['Fare'].fillna(fa['Fare'].median(), inplace = True)

#  New feature : Child
data.loc[:,'Child'] = 1
data.loc[(data['Age'] >= 18),'Child'] =0

# New feature : Family Survival 
# data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])
# DEFAULT_SURVIVAL_VALUE = 0.5

# data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
# for grp, grp_df in data[['Survived','Fare', 'Ticket', 'PassengerId',
#                            'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
                               
#     if (len(grp_df) != 1):
#         # A Family group is found.
#         for ind, row in grp_df.iterrows():
#             smax = grp_df.drop(ind)['Survived'].max()
#             smin = grp_df.drop(ind)['Survived'].min()
#             passID = row['PassengerId']
#             if (smax == 1.0):
#                 data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
#             elif (smin == 0.0):
#                 data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                
# for _, grp_df in data.groupby('Ticket'):
#     if (len(grp_df) != 1):
#         for ind, row in grp_df.iterrows():
#             if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
#                 smax = grp_df.drop(ind)['Survived'].max()
#                 smin = grp_df.drop(ind)['Survived'].min()
#                 passID = row['PassengerId']
#                 if (smax == 1.0):
#                     data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
#                 elif (smin == 0.0):
#                     data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                    
####################################
# Encoding and pre-modeling
####################################                  

# dropping useless features
data = data.drop(columns = ['Age','Cabin','Embarked', 'Name',
                            'Parch', 'SibSp','Ticket', 'Family_Size'])

# Encoding features
target_col = ["Survived"]
id_dataset = ["Type"]
cat_cols   = data.nunique()[data.nunique() < 12].keys().tolist()
cat_cols   = [x for x in cat_cols ]
# numerical columns
num_cols   = [x for x in data.columns if x not in cat_cols + target_col + id_dataset]
# Binary columns with 2 values
bin_cols   = data.nunique()[data.nunique() == 2].keys().tolist()
# Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]
# Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
# Duplicating columns for multi value columns
data = pd.get_dummies(data = data,columns = multi_cols )
# Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns = num_cols)
# dropping original values merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index = True,right_index = True,how = "left")
data = data.drop(columns = ['PassengerId'],axis = 1)

# Target = 1st column
cols = data.columns.tolist()
cols.insert(0, cols.pop(cols.index('Survived')))
data = data.reindex(columns= cols)

# Cutting train and test
train = data[data['Type'] == 1].drop(columns = ['Type'])
test = data[data['Type'] == 0].drop(columns = ['Type'])

In [4]:
train.shape, test.shape

((100000, 10), (100000, 10))

In [5]:
train.isna().any().sum(), test.isna().any().sum()

(0, 0)

In [6]:
train.head()

Unnamed: 0,Survived,Sex,Child,Pclass_1,Pclass_2,Pclass_3,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Fare
0,1,1,1,1,0,0,0,0,1,-0.259156
1,0,1,1,0,0,1,1,0,0,-0.463749
2,0,1,1,0,0,1,0,0,1,0.39587
3,0,1,0,0,0,1,1,0,0,-0.468349
4,1,1,0,0,0,1,1,0,0,-0.546685


pretty neat

## Predicting using simple classification models

In [7]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [8]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived', axis=1), train.Survived, test_size=0.2, random_state=0, stratify=train.Survived)

### Simple function to train and display metrics of generalised models using GridSearch to find good parameters

In [9]:
def FitModel(algorithm,gridSearchParams,cv):
    grid = GridSearchCV(
        estimator=algorithm,
        param_grid=gridSearchParams,
        cv=cv,  verbose=1, n_jobs=-1)
    
    grid_result = grid.fit(X_train, y_train)
    best_params = grid_result.best_params_
    
    print('Best Params :',best_params)
    
    pred = grid.predict(X_test) 
    print(accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))
    
    return grid_result # returning the model

pd.options.display.float_format = '{:.2f}'.format

### Starting with Logistic Regression

In [10]:
lr = FitModel(LogisticRegression(), {}, 5)
#76

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params : {}
0.75935
              precision    recall  f1-score   support

           0       0.79      0.78      0.79     11445
           1       0.71      0.73      0.72      8555

    accuracy                           0.76     20000
   macro avg       0.75      0.76      0.75     20000
weighted avg       0.76      0.76      0.76     20000

[[8941 2504]
 [2309 6246]]


## AdaBoostClassifier

In [54]:
%%time
params = {'n_estimators': [30,50,100,500], 'learning_rate': [0.1,0.5,1.0]}
ada = FitModel(AdaBoostClassifier(), params, 5)
#75 Best Params : {'learning_rate': 0.1, 'n_estimators': 30}

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params : {'learning_rate': 0.1, 'n_estimators': 30}
0.75764
              precision    recall  f1-score   support

           0       0.80      0.78      0.79     14307
           1       0.71      0.73      0.72     10693

    accuracy                           0.76     25000
   macro avg       0.75      0.75      0.75     25000
weighted avg       0.76      0.76      0.76     25000

[[11098  3209]
 [ 2850  7843]]
CPU times: user 1.76 s, sys: 68 ms, total: 1.83 s
Wall time: 2min 51s


GridSearchCV(cv=5, estimator=AdaBoostClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.5, 1.0],
                         'n_estimators': [30, 50, 100, 500]},
             verbose=1)

## RandomForestClassifier
StackingClassifier, VotingClassifier

In [57]:
%%time
params = {'n_estimators': [30,50,100,500]}
rf = FitModel(RandomForestClassifier(), params, 5)
#69 Best Params : {'n_estimators': 500}

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Params : {'n_estimators': 500}
0.68852
              precision    recall  f1-score   support

           0       0.72      0.74      0.73     14307
           1       0.64      0.62      0.63     10693

    accuracy                           0.69     25000
   macro avg       0.68      0.68      0.68     25000
weighted avg       0.69      0.69      0.69     25000

[[10614  3693]
 [ 4094  6599]]
CPU times: user 54 s, sys: 540 ms, total: 54.5 s
Wall time: 3min 36s


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'n_estimators': [30, 50, 100, 500]}, verbose=1)

## XGBoostClassifier

In [14]:
from xgboost import XGBClassifier

In [63]:
%%time
params={"learning_rate"    : [0.1] ,
 "max_depth"        : [5],
 "min_child_weight" : [3],
 "gamma"            : [0.3],
 "colsample_bytree" : [0.5] }
xgb = FitModel(XGBClassifier(), params, 4)
# 0.76 Best Params : {'colsample_bytree': 0.5, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3}

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Params : {'colsample_bytree': 0.5, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3}
0.76204
              precision    recall  f1-score   support

           0       0.80      0.77      0.79     14307
           1       0.71      0.75      0.73     10693

    accuracy                           0.76     25000
   macro avg       0.76      0.76      0.76     25000
weighted avg       0.76      0.76      0.76     25000

[[11067  3240]
 [ 2709  7984]]
CPU times: user 3.48 s, sys: 28.1 ms, total: 3.51 s
Wall time: 3min 20s


GridSearchCV(cv=4,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

## VotingClassifier + StackingClassifier + First submissions (for each model)

In [15]:
ada_clf = AdaBoostClassifier(n_estimators=30, learning_rate=0.1)
lr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
xgb_clf = XGBClassifier(learning_rate=0.1, max_depth=5, min_child_weight=3, gamma=0.3, colsample_bytree=0.5)
estimators = [('lr',lr_clf),('ada',ada_clf),('rf',rf_clf),('xgb',xgb_clf)]
voting_clf = VotingClassifier(estimators=estimators)
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=ExtraTreesClassifier())

In [50]:
def generate_submission(predictions, csv_name):
    df = pd.DataFrame(columns=['PassengerId','Survived'])
    df['PassengerId'] = ids
    df['Survived'] = pd.Series(predictions)
    df.to_csv(csv_name, header=True, index=False)
    return df.head(3)

In [16]:
for name, clf in estimators+[('voting',voting_clf), ('stacking',stacking_clf)]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,
          accuracy_score(y_test, y_pred))
    y_pred = clf.predict(test.drop('Survived', axis=1))
    generate_submission(y_pred, f'{name}.csv')

StackingClassifier 0.73945


## KerasClassifier (ROOM FOR IMPROVEMENT)

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [53]:
def create_model():
    dim=X_train.shape[1]
    model = Sequential()
    model.add(Dense(dim, activation='relu'))
    
    model.add(Dense(dim/2, activation='relu'))
    
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [54]:
model = create_model()

In [38]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((80000, 9), (80000,), (20000, 9), (20000,))

In [55]:
model.fit(X_train, y_train, epochs=40, validation_data=(X_test,y_test), callbacks=[EarlyStopping(patience=4, monitor='accuracy')])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40


<tensorflow.python.keras.callbacks.History at 0x7f8cc0e0b668>

In [57]:
pred = model.predict(test.drop('Survived', axis=1)).flatten()
pred = pred>0.5

In [58]:
pred = pred.astype(int)
generate_submission(pred, 'keras.csv')

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,0
2,100002,1


In [59]:
!kaggle competitions submit -c tabular-playground-series-apr-2021 -f keras.csv -m "Keras 2"

100%|████████████████████████████████████████| 879k/879k [00:00<00:00, 1.91MB/s]
Successfully submitted to Tabular Playground Series - Apr 2021

## SageMaker XGBoost + Hiperparameter tunning job

In [None]:
from sagemaker.xgboost.estimator import XGBoost
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import boto3

region = boto3.Session().region_name
session = sagemaker.Session()

role = get_execution_role()
bucket = session.default_bucket()

## SageMaker Autopilot