# 1. Imports

In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from category_encoders import TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
from sklearn import set_config
set_config(transform_output='pandas')

# 2. Dataset Initialization

In [6]:
df_train = pd.read_csv('./data/train.csv')

In [7]:
df_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [8]:
df_test = pd.read_csv('./data/test.csv')

In [9]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


We should keep:
Pclass, Age, Title, Sex

## Estrazione Dati

Creare Ptitle: Mr, Mrs, Ms, ... in base al nome - raggruppamento (VIP)

# Esercizio

In [10]:
class AddTitle(TransformerMixin, BaseEstimator):
    def _init_(self):
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns.values
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X['Title'] = X['Name'].str.findall(r',\s*([^\.]*)\s*\.').str[0]
        X['Title'] = X['Title'].apply(
            lambda x: x if x not in {'Dr', 'Rev', 'Col', 'Major', 'Capt', 'the Countess', 'Mlle', 'Ms', 'Lady', 'Sir', 'Mme', 'Don', 'Jonkheer'} else 'VIP')
        return X

    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            return pd.Series(list(self.feature_names_in_) + ['Title']).astype(object)
        else:
            return pd.Series(list(input_features) + ['Title']).astype(object)

In [11]:
age_imputer = ColumnTransformer([
    ('age_imputer', SimpleImputer(strategy='median', missing_values=pd.NA), ['Age'])
], 
remainder='passthrough',  # passthough columns not listed in the transformers
verbose_feature_names_out=False  # set to False to get rid of the transformer's name in the output
)

In [12]:
obtain_title_transformer = FunctionTransformer(
    lambda x: x.str.extract(r',\s*([^\.]*)\s*\.', expand=False).str.strip()
)

In [13]:
VIP_title_grouping = FunctionTransformer(
    lambda x: x.replace(['Dr', 'Rev', 'Col', 'Major', 'Capt', 'the Countess', 'Mlle', 'Ms', 'Lady', 'Sir', 'Mme', 'Don', 'Jonkheer'], 'VIP')
)

In [14]:
title_transformer = ColumnTransformer([
    ('title_transformer', obtain_title_transformer, ['Name']),
    ('VIP_title_grouping', VIP_title_grouping, ['Name'])
], 
remainder='passthrough',  # passthough columns not listed in the transformers
verbose_feature_names_out=False  # set to False to get rid of the transformer's name in the output
)

In [15]:
# replace male with 0 and female with 1
male_transformer = FunctionTransformer(
    lambda x: x.replace('male', '0')
)
female_transformer = FunctionTransformer(
    lambda x: x.replace('female', '1')
)

In [16]:
sex_transformer = ColumnTransformer([
    ('male_transformer', male_transformer, ["Sex"]),
    ('female_transformer', female_transformer, ["Sex"])
],
remainder='passthrough',  # passthough columns not listed in the transformers
verbose_feature_names_out=False  # set to False to get rid of the transformer's name in the output
)

In [17]:
drop_columns_transformer = ColumnTransformer([
    ('drop_columns_transformer', 'drop', ['SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])
],
remainder='passthrough',  # passthough columns not listed in the transformers
verbose_feature_names_out=False  # set to False to get rid of the transformer's name in the output
)

In [18]:
one_hot_encoder = ColumnTransformer([
    ('one_hot_encoder', OneHotEncoder(sparse_output=False), [["Name"]]),
],
remainder='passthrough',  # passthough columns not listed in the transformers
verbose_feature_names_out=False  # set to False to get rid of the transformer's name in the output
)

In [19]:
target_encoder = ColumnTransformer([
    ('target_encoder', TargetEncoder(), ["Name"]),
],
remainder='passthrough',  # passthough columns not listed in the transformers
verbose_feature_names_out=False  # set to False to get rid of the transformer's name in the output
)

In [20]:
pipeline = Pipeline([
    ('age_imputer', age_imputer),
    ('title_transformer', AddTitle()),
    ('sex_transformer', sex_transformer),
    ('drop_columns_transformer', drop_columns_transformer),
    ('one_hot_encoder', one_hot_encoder),
    # ('target_encoder', target_encoder)
])

In [21]:
y_train = df_train['Survived']
X_train = df_train.drop('Survived', axis=1)

In [22]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
pipeline.fit_transform(X_train, y_train)

TypeError: unhashable type: 'list'

# Codifica delle categoriche (onehot vs target-encoding per title)

# Funzioni e oggetti utili

Trasformazione di default per tutta la matrice
1. ColumnTransformer
2. FunctionTransformer

Cross validation