## Imports

In [24]:
# pip install --upgrade scikit-learn

In [1]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import re
import requests
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.4.1
Sklearn  1.0.2


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [5]:
# DATA_PATH = "../../Datasets/Tabular/titanic/"

In [2]:
DATA_PATH = './data/'

## Load data

In [3]:
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


## Check missings

In [111]:
# df.isnull().sum()
f = df.isnull().sum()
print(f)

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
Title         0
dtype: int64


In [6]:
# df_test.isnull().sum()
df_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

# Exercise 1 (2pts):
Extract the title (Mr, Mrs, ... ) from the "Name" column.

Tips:
- split(',')[1] to get the 2nd part, and remove the surnamename
- split('.')[0] to get the 1str part, and remove the name

In [7]:
df.head(2)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
# CODE HERE get_Title_from_Name funtion
# Create this function using lambda (not def)
# get_Title_from_Name = x.split(',')[0]



df['Title'] = df['Name'].apply(lambda i: (i.split(',')[1].split('.')[0]).lstrip())
df_test['Title'] = df_test['Name'].apply(lambda i: (i.split(',')[1].split('.')[0]).lstrip())


In [9]:
assert df['Title'].values[0] == "Mr"
assert df['Title'].values[1] == "Mrs"
assert df['Title'].values[2] == "Miss"

assert df_test['Title'].values[0] == "Mr"
assert df_test['Title'].values[1] == "Mrs"
assert df_test['Title'].values[414] == "Dona"

# Exercise 2 (1pts):
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [10]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [11]:
# Use map to apply the prevous dict https://datagy.io/pandas-map-apply/

df["Title"] = df["Title"].map(title_dictionary)
df_test["Title"] = df_test["Title"].map(title_dictionary)



In [12]:
assert df['Title'].values[886] == "Officer"
assert df_test['Title'].values[417] == "Master"

# Exercise OPTINAL (0pts):
Try to extract some information from the feature **Ticket**. Search on Internet if that colum has some kind of information.

In [22]:
df_test.tail()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S,Mr
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,Mr
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,Mr
1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C,Master


# Exercise OPTIONAL (0pts):
Try to extract some information from the feature **Cabin**. Search on Internet if that colum has some kind of information.

PassengerId
892      NaN
893      NaN
894      NaN
895      NaN
896      NaN
        ... 
1305     NaN
1306    C105
1307     NaN
1308     NaN
1309     NaN
Name: Cabin, Length: 418, dtype: object

# Preprocessing
For X data, notice that...
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [23]:
df.Ticket.nunique()

681

In [81]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

x_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin']) # # X_TEST DATA (NEW DATA)

In [82]:
x.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,1,0,7.25,S,Mr
2,1,female,38.0,1,0,71.2833,C,Mrs


In [39]:
df.Parch.unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

In [84]:
cat_vars  = ['Sex', 'Embarked', 'Title']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


# Exercise 3 (2pts):
Create a **ColumnTransformer for Tree Models**. You need to create 2 pipelines (one for numerical and other for categories). Remember:
- Categorical pipeline: Some SimpleImputer -> Some Encoder
- Numerical pipeline: Some SimpleImputer -> NO Encoder

In [85]:
"""
num_preprocessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
])

cat_preporcessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
  # Some Encoder here. Remember to handle_unknown
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preporcessing, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro
""";

### BEGIN SOLUTION
num_4_treeModels = pipeline.Pipeline(steps=[ ( 'imputer',impute.SimpleImputer( )  ) ])
cat_4_treeModels = pipeline.Pipeline(steps=[ ( 'imputer',impute.SimpleImputer(strategy='most_frequent' )  ),('ordinal', preprocessing.OrdinalEncoder(handle_unknown= 'use_encoded_value', unknown_value = -1))  ])
tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') 


tree_prepro

In [52]:
assert type(tree_prepro)      is compose._column_transformer.ColumnTransformer
assert type(num_4_treeModels) is pipeline.Pipeline
assert type(cat_4_treeModels) is pipeline.Pipeline
assert len(num_4_treeModels) == 1
assert len(cat_4_treeModels) == 2

# Exercise 4 (1pts):
1. Complete the dictionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [56]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

In [104]:

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(random_state = 0),
  "Extra Trees": ExtraTreesClassifier(random_state = 0),
  "Random Forest": RandomForestClassifier(random_state = 0),
  "AdaBoost": AdaBoostClassifier(random_state = 0),
  "Skl GBM": GradientBoostingClassifier(random_state = 0),
  "Skl HistGBM": HistGradientBoostingClassifier(random_state = 0),
  "XGBoost": XGBClassifier(random_state = 0),
  "LightGBM": LGBMClassifier(random_state = 0),
  "CatBoost":CatBoostClassifier(random_state = 0)
  }
tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["XGBoost"]





In [105]:
for pipe in tree_classifiers.values():
    assert type(pipe) is pipeline.Pipeline

# Exercise 5 (3pts):
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [106]:

x_train, x_val, y_train, y_val = model_selection.train_test_split(   
    x,y,test_size=0.2, random_state = 0,stratify=y
)


results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

"""
for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    
    # GET PREDICTIONS USING x_val
    pred = # CODE HERE

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
"""

### BEGIN SOLUTION

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    
    model.fit(x_train,y_train)
    
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,




  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,


Learning rate set to 0.008911
0:	learn: 0.6896726	total: 1.32ms	remaining: 1.32s
1:	learn: 0.6836959	total: 4.23ms	remaining: 2.11s
2:	learn: 0.6800766	total: 6.45ms	remaining: 2.14s
3:	learn: 0.6746137	total: 8.4ms	remaining: 2.09s
4:	learn: 0.6698224	total: 9.52ms	remaining: 1.89s
5:	learn: 0.6646849	total: 11.2ms	remaining: 1.85s
6:	learn: 0.6581636	total: 13.3ms	remaining: 1.89s
7:	learn: 0.6523196	total: 17.8ms	remaining: 2.2s
8:	learn: 0.6466238	total: 19.9ms	remaining: 2.19s
9:	learn: 0.6419766	total: 21.9ms	remaining: 2.17s
10:	learn: 0.6365860	total: 23.8ms	remaining: 2.14s
11:	learn: 0.6315233	total: 25.7ms	remaining: 2.12s
12:	learn: 0.6262742	total: 27.6ms	remaining: 2.1s
13:	learn: 0.6214949	total: 30.9ms	remaining: 2.17s
14:	learn: 0.6168132	total: 32.7ms	remaining: 2.15s
15:	learn: 0.6117856	total: 34.8ms	remaining: 2.14s
16:	learn: 0.6068928	total: 36.9ms	remaining: 2.13s
17:	learn: 0.6027087	total: 39ms	remaining: 2.13s
18:	learn: 0.5977712	total: 41ms	remaining: 2.11s

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Skl GBM,84.916201,82.59552,0.460714
2,XGBoost,81.564246,78.787879,0.247848
3,CatBoost,81.564246,79.057971,4.827963
4,Skl HistGBM,81.005587,78.603426,2.78868
5,LightGBM,81.005587,78.333333,0.325308
6,AdaBoost,79.888268,77.964427,0.334792
7,Random Forest,78.212291,75.520422,0.462712
8,Extra Trees,77.653631,75.065876,0.560653
9,Decision Tree,77.094972,75.421607,0.057964


In [107]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 6 (3pts):
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [108]:
skf = model_selection.StratifiedKFold(
    n_splits=10,shuffle=True,random_state=0
)
skf

StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

In [109]:

skf = model_selection.StratifiedKFold(
    n_splits=10,shuffle=True,random_state=0
)




results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

"""
for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred =  # CODE HERE

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
"""

for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = model_selection.cross_val_predict(model, x, y, cv=skf)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,








































  results = results.append({"Model":    model_name,
  results = results.append({"Model":    model_name,


Learning rate set to 0.009371
0:	learn: 0.6897445	total: 1.2ms	remaining: 1.2s
1:	learn: 0.6859893	total: 2.31ms	remaining: 1.15s
2:	learn: 0.6793726	total: 4.5ms	remaining: 1.5s
3:	learn: 0.6732200	total: 7.82ms	remaining: 1.95s
4:	learn: 0.6683890	total: 9.09ms	remaining: 1.81s
5:	learn: 0.6620413	total: 11.4ms	remaining: 1.9s
6:	learn: 0.6558555	total: 14.3ms	remaining: 2.02s
7:	learn: 0.6513085	total: 16.3ms	remaining: 2.02s
8:	learn: 0.6454314	total: 18.9ms	remaining: 2.08s
9:	learn: 0.6398634	total: 23.9ms	remaining: 2.37s
10:	learn: 0.6345910	total: 26.6ms	remaining: 2.39s
11:	learn: 0.6291396	total: 28.8ms	remaining: 2.37s
12:	learn: 0.6237670	total: 31.1ms	remaining: 2.36s
13:	learn: 0.6194304	total: 32.9ms	remaining: 2.31s
14:	learn: 0.6156077	total: 34.3ms	remaining: 2.25s
15:	learn: 0.6132333	total: 35.9ms	remaining: 2.21s
16:	learn: 0.6084108	total: 38.2ms	remaining: 2.21s
17:	learn: 0.6042160	total: 41.5ms	remaining: 2.27s
18:	learn: 0.5992437	total: 45.4ms	remaining: 2.3

  results = results.append({"Model":    model_name,


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Skl GBM,83.277217,81.027706,2.866889
2,CatBoost,83.277217,81.027706,40.317116
3,Skl HistGBM,82.491582,80.831176,15.353731
4,LightGBM,82.491582,80.8863,3.071099
5,XGBoost,81.930415,80.430927,5.815322
6,Random Forest,81.369248,79.920429,4.263973
7,AdaBoost,81.144781,79.903653,2.182306
8,Extra Trees,80.695847,79.153485,5.93878
9,Decision Tree,79.124579,77.823315,0.708351


In [110]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 7.1
Train with all data the best model

In [48]:
# best_model = tree_classifiers["SELECT MY BEST MODEL HERE"]

# Fit best model with all data



# Exercise 7.2 (2pts)
With your best model, generate the predicitions for test data (x_test)

In [49]:
# test_pred = # Get the predictions for x_test



In [50]:
assert len(test_pred) == 418
assert np.unique(test_pred).tolist() == [0,1]

# Exercise 7.3

Submit to kaggle.

- You can use the kaggle command line app. Check https://github.com/Kaggle/kaggle-api

In [51]:
"""sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()"""

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [52]:
'''sub.to_csv("sub.csv")'''

In [53]:
!kaggle competitions submit -c titanic -f sub.csv -m "My submission message"

'kaggle' is not recognized as an internal or external command,
operable program or batch file.
