## Imports

In [None]:
# pip install --upgrade scikit-learn

In [2]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.2.4
Sklearn  0.24.1


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [3]:
CLOUD = False

if CLOUD:
    import os
    os.environ['KAGGLE_USERNAME'] = "your_kaggle_username"
    os.environ['KAGGLE_KEY']      = "your_kaggle_api_key"  # See https://www.kaggle.com/docs/api
    !pip install --upgrade kaggle
    !kaggle competitions download -c titanic
    DATA_PATH = "./"

else:
    DATA_PATH = "../../Datasets/Tabular/titanic/"

## Load data

In [4]:
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)
df.head()

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Check missings

In [5]:
df.isnull().sum() / len(df) #Embark is missing in full data (Calculating percentage missing)

Survived    0.000000
Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.198653
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.000000
Cabin       0.771044
Embarked    0.002245
dtype: float64

In [6]:
df_test.isnull().sum() / len(df) #Fare is missing in test data (calculating the percentage missing)

Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.096521
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.001122
Cabin       0.367003
Embarked    0.000000
dtype: float64

# Exercise 1 (2pts):
Extract the title (Mr, Mrs, ... ) from the "Name" column.

Tips:
- split(',')[1] to get the 2nd part, and remove the surnamename
- split('.')[0] to get the 1str part, and remove the name

In [7]:
# CODE HERE get_Title_from_Name funtion
# Create this function using lambda (not def)

get_Title_from_Name = lambda x: x.split(',')[1].split('.')[0].strip()

# YOUR CODE HERE

#raise NotImplementedError()

df['Title']      = df['Name'].map(get_Title_from_Name)
df_test['Title'] = df_test['Name'].map(get_Title_from_Name)

In [8]:
assert df['Title'].values[0] == "Mr"
assert df['Title'].values[1] == "Mrs"
assert df['Title'].values[2] == "Miss"

assert df_test['Title'].values[0] == "Mr"
assert df_test['Title'].values[1] == "Mrs"
assert df_test['Title'].values[414] == "Dona"

# Exercise 2 (1pts):
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [9]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [10]:
# Use map to apply the prevous dict


# YOUR CODE HERE

df["Title"] = df.Title.map(title_dictionary)
df_test["Title"] = df_test.Title.map(title_dictionary)
#raise NotImplementedError()

In [11]:
assert df['Title'].values[886] == "Officer"
assert df_test['Title'].values[417] == "Master"

# Exercise OPTINAL (0pts):
Try to extract some information from the feature **Ticket**. Search on Internet if that colum has some kind of information.

In [45]:
df_test['Ticket']

# Exercise OPTIONAL (0pts):
Try to extract some information from the feature **Cabin**. Search on Internet if that colum has some kind of information.

In [None]:
df_test['Cabin']

# Preprocessing
For X data, notice that...
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [12]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

x_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin']) # # X_TEST DATA (NEW DATA)

In [13]:
cat_vars  = ['Sex', 'Embarked', 'Title']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


# Exercise 3 (2pts):
Create a **ColumnTransformer for Tree Models**. You need to create 2 pipelines (one for numerical and other for categories). Remember:
- Categorical pipeline: Some SimpleImputer -> Some Encoder
- Numerical pipeline: Some SimpleImputer -> NO Encoder

In [26]:
# YOUR CODE HERE

num_4_treeModels = pipeline.Pipeline(steps=[
  ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999))
])

cat_4_treeModels = pipeline.Pipeline(steps=[
   ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('OrdinalEncoder', preprocessing.OrdinalEncoder(handle_unknown='use_encoder_value')) # specify unknowns 
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro
#raise NotImplementedError()

In [27]:
assert type(tree_prepro)      is compose._column_transformer.ColumnTransformer
assert type(num_4_treeModels) is pipeline.Pipeline
assert type(cat_4_treeModels) is pipeline.Pipeline
assert len(num_4_treeModels) == 1
assert len(cat_4_treeModels) == 2

# Exercise 4 (1pts):
1. Complete the diccionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [28]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

In [24]:

# YOUR CODE HERE

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":   ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost":      AdaBoostClassifier(n_estimators=100),
  "Skl GBM":       GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM":   HistGradientBoostingClassifier(max_iter=100),
  "XGBoost":       XGBClassifier(n_estimators=100),
  "LightGBM":      LGBMClassifier(n_estimators=100),
  "CatBoost":      CatBoostClassifier(n_estimators=100),
} 


#raise NotImplementedError()

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

tree_classifiers["LightGBM"]

In [29]:
for pipe in tree_classifiers.values():
    assert type(pipe) is pipeline.Pipeline

# Exercise 5 (3pts):
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [30]:
"""
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    # CODE HERE
)
"""
# YOUR CODE HERE
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    x, y,
    test_size=0.2,
    stratify = y,   
    random_state=4  
)
#raise NotImplementedError()


results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})


for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)

# YOUR CODE HERE
#raise NotImplementedError()


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Learning rate set to 0.073611
0:	learn: 0.6682963	total: 1.73ms	remaining: 171ms
1:	learn: 0.6468979	total: 3.11ms	remaining: 152ms
2:	learn: 0.6244752	total: 4.82ms	remaining: 156ms
3:	learn: 0.6043446	total: 5.76ms	remaining: 138ms
4:	learn: 0.5887822	total: 6.54ms	remaining: 124ms
5:	learn: 0.5727134	total: 8.09ms	remaining: 127ms
6:	learn: 0.5584710	total: 9.16ms	remaining: 122ms
7:	learn: 0.5442384	total: 10.2ms	remaining: 117ms
8:	learn: 0.5313387	total: 11.3ms	remaining: 114ms
9:	learn: 0.5206598	total: 13.1ms	remaining: 118ms
10:	learn: 0.5105335	total: 14.9ms	remaining: 120ms
11:	learn: 0.5015384	total: 17.1ms	remaining: 125ms
12:	learn: 0.4950929	total: 26.1ms	remaining: 175ms
13:	learn: 0.4877274	total: 30.8ms	remaining: 189ms
14:	learn: 0.4819678	total: 31.9ms	remaining: 181ms
15:	learn: 0.4749300	total: 33.2ms	remaining: 174ms
16:	learn: 0.4687799	total: 34.3ms	remaining: 168ms
17:	learn: 0.4639781	total: 35.6ms	remaining: 162ms
18:	learn: 0.4587636	total: 36.7ms	remaining

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,XGBoost,88.268156,86.943347,0.434039
2,LightGBM,85.47486,84.400527,0.190929
3,AdaBoost,84.916201,84.216074,0.385639
4,Skl HistGBM,84.916201,83.675889,1.217274
5,Skl GBM,84.357542,83.221344,0.190249
6,CatBoost,84.357542,82.681159,0.34823
7,Random Forest,83.240223,81.772069,0.316593
8,Decision Tree,81.564246,81.218709,0.125943
9,Extra Trees,80.446927,79.229249,0.343031


In [33]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 6 (3pts):
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [34]:
skf = model_selection.StratifiedKFold(
    n_splits=10, shuffle=True, random_state=0
)

# YOUR CODE HERE
#raise NotImplementedError()

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = model_selection.cross_val_predict(model, x, y, cv=skf)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              

# YOUR CODE HERE
#raise NotImplementedError()


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

s
17:	learn: 0.4497651	total: 27.2ms	remaining: 124ms
18:	learn: 0.4444018	total: 28.5ms	remaining: 121ms
19:	learn: 0.4390476	total: 29.5ms	remaining: 118ms
20:	learn: 0.4351984	total: 30.5ms	remaining: 115ms
21:	learn: 0.4315537	total: 33.8ms	remaining: 120ms
22:	learn: 0.4293651	total: 35.9ms	remaining: 120ms
23:	learn: 0.4248839	total: 39.4ms	remaining: 125ms
24:	learn: 0.4232964	total: 40.4ms	remaining: 121ms
25:	learn: 0.4200897	total: 41.8ms	remaining: 119ms
26:	learn: 0.4171460	total: 43.9ms	remaining: 119ms
27:	learn: 0.4154904	total: 45.4ms	remaining: 117ms
28:	learn: 0.4128861	total: 47.5ms	remaining: 116ms
29:	learn: 0.4099351	total: 48.6ms	remaining: 113ms
30:	learn: 0.4074888	total: 49.6ms	remaining: 110ms
31:	learn: 0.4050633	total: 50.5ms	remaining: 107ms
32:	learn: 0.4025727	total: 51.6ms	remaining: 105ms
33:	learn: 0.4002960	total: 53.3ms	remaining: 103ms
34:	learn: 0.3981457	total: 54.5ms	remaining: 101ms
35:	learn: 0.3957543	total: 55.9ms	remaining: 99.3ms
36:	learn

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Skl GBM,83.72615,81.99837,2.828816
2,Skl HistGBM,82.828283,81.324897,16.088139
3,LightGBM,82.603816,81.142748,1.942335
4,CatBoost,82.379349,79.968364,3.024142
5,Random Forest,82.154882,80.888697,3.212516
6,XGBoost,81.818182,80.394977,3.37767
7,AdaBoost,81.369248,80.361423,5.785335
8,Decision Tree,81.257015,80.215224,0.528466
9,Extra Trees,80.246914,78.84431,2.98773


In [35]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 7.1
Train with all data the best model

In [36]:
best_model = tree_classifiers["Skl GBM"]

# Fit best model with all data
best_model.fit(x,y)

#raise NotImplementedError()

# Exercise 7.2 (2pts)
With your best model, generate the predicitions for test data (x_test)

In [37]:
test_pred = best_model.predict(x_test)

# YOUR CODE HERE
#raise NotImplementedError()

In [38]:
assert len(test_pred) == 418
assert np.unique(test_pred).tolist() == [0,1]

# Exercise 7.3

Submit to kaggle.

- You can use the kaggle command line app. Check https://github.com/Kaggle/kaggle-api

In [39]:
sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [40]:
sub.to_csv("sub.csv")

In [41]:
!kaggle competitions submit -c titanic -f sub.csv -m "My submission message"

zsh:1: command not found: kaggle
