Target Encoding vs. One-Hot Encoding (categorical encoder comparison)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier
import gc

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Display the NumPy version used for this run.
np.__version__

'1.26.4'

In [2]:
def replace_non_alpha_with_nan(df, categories):
    """Replace unexpected category codes with NaN in the given columns.

    Every column listed in ``categories`` is checked against a hand-curated
    whitelist of single-letter codes. Any value that is not on the whitelist
    (noise strings, numbers stored as text, rare codes, existing NaN) is set
    to ``np.nan``.

    The whitelist is looked up by column *name*, with hyphens and underscores
    treated as equivalent (``'cap-shape'`` matches ``'cap_shape'``), so the
    result does not depend on the order of ``categories``. Columns that have
    no whitelist entry are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical columns. It is not modified.
    categories : iterable of str
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.
    """
    # * Customized feature engineering: accepted codes per feature
    allowed_values = {
        'cap_shape': ['x', 'f', 's', 'b', 'o', 'p', 'c'],
        'cap_surface': ['t', 's', 'y', 'h', 'g', 'd', 'k', 'e', 'i', 'w', 'l'],
        'cap_color': ['n', 'y', 'w', 'g', 'e', 'o', 'p', 'r', 'u', 'b', 'k', 'l'],
        'does_bruise_or_bleed': ['f', 't'],
        'gill_attachment': ['a', 'd', 'x', 'e', 's', 'p', 'f'],
        'gill_spacing': ['c', 'd', 'f'],
        'gill_color': ['w', 'n', 'y', 'p', 'g', 'o', 'k', 'f', 'r', 'e', 'b', 'u'],
        'stem_root': ['b', 's', 'r', 'c', 'f'],
        'stem_surface': ['s', 'y', 'i', 't', 'g', 'k', 'h', 'f'],
        'stem_color': ['w', 'n', 'y', 'g', 'o', 'e', 'u', 'p', 'k', 'r', 'l', 'b', 'f'],
        'veil_type': ['u'],
        'veil_color': ['w', 'y', 'n', 'u', 'k', 'e'],
        'has_ring': ['f', 't'],
        'ring_type': ['f', 'e', 'z', 'l', 'r', 'p', 'g', 'm'],
        'spore_print_color': ['k', 'p', 'w', 'n', 'r', 'u', 'g'],
        'habitat': ['d', 'g', 'l', 'm', 'h', 'w', 'p', 'u'],
        'season': ['a', 'u', 'w', 's']
    }

    # Work on a copy so that frames the caller still holds (e.g. a "before"
    # snapshot) are not silently rewritten.
    cleaned = df.copy()

    for col in categories:
        # Match by name rather than by position: the dictionary keys use
        # underscores while the dataset columns use hyphens.
        whitelist = allowed_values.get(str(col).replace('-', '_'))
        if whitelist is None:
            continue
        # isin() is False for NaN as well, so missing values stay NaN.
        cleaned.loc[~cleaned[col].isin(whitelist), col] = np.nan

    return cleaned

In [3]:
def encode_train_and_test_data(df_train, target, df_test, df_validation, kaggle_test, num_cols, cat_cols, variable):
    """Fit the preprocessing on the training split and apply it to every split.

    Numeric columns go through ``KNNImputer(n_neighbors=3)``; categorical
    columns are one-hot encoded (unknown categories are ignored at transform
    time). The preprocessor is fitted on ``df_train`` only and then reused
    for the other three frames.

    Parameters
    ----------
    df_train, df_test, df_validation, kaggle_test : pandas.DataFrame
        Frames that each contain ``num_cols + cat_cols``.
    target : array-like
        Not used by the current one-hot configuration; only the
        commented-out target-encoding variant passed it to ``fit_transform``.
    num_cols, cat_cols : list of str
        Numeric and categorical column names.
    variable : float
        Not used by the current one-hot configuration; it was the
        ``smoothing`` value of the commented-out ``TargetEncoder`` step.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, validation, kaggle_test)`` after encoding, all with
        the preprocessor's output feature names as columns.

    Side effects
    ------------
    Stores the output feature names in the module-level ``smoothing`` dict
    under ``'feature names'`` and prints the feature names and the
    transformed training array.
    """
    impute_numeric = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=3)),
    ])

    # Alternatives tried here earlier: SimpleImputer(most_frequent),
    # OrdinalEncoder and TargetEncoder(smoothing=variable).
    encode_categorical = Pipeline(steps=[
        ('OneHot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', impute_numeric, num_cols),
            ('cat', encode_categorical, cat_cols),
        ],
        remainder='passthrough',
    )

    selected = num_cols + cat_cols

    # Fit on the training split only; the other splits are just transformed.
    train_te = preprocessor.fit_transform(df_train[selected])
    test_te = preprocessor.transform(df_test[selected])
    val_te = preprocessor.transform(df_validation[selected])

    feature_names_out = preprocessor.get_feature_names_out()
    smoothing['feature names'] = feature_names_out

    print(f"All Columns {feature_names_out}")
    print(f"Train Transformed = {train_te}")

    def _as_frame(values):
        # Wrap a transformed array with the encoded feature names.
        return pd.DataFrame(values, columns=feature_names_out)

    df_train_final = _as_frame(train_te)
    df_test_final = _as_frame(test_te)
    df_validation_final = _as_frame(val_te)

    kaggle_test_final = _as_frame(preprocessor.transform(kaggle_test[selected]))

    return df_train_final, df_test_final, df_validation_final, kaggle_test_final

In [4]:
import itertools
from sklearn.preprocessing import LabelEncoder

def find_train_combinations(train, cat_cols, num_cols):
    """Append selected pairwise interaction features to the training frame.

    An interaction feature is the element-wise product of two columns and is
    named ``'<col1> x <col2>'``, where the pair order follows
    ``cat_cols + num_cols``. Only the features listed in ``new_cols`` (for
    now a single hard-coded feature) are computed and appended.

    The mean absolute correlation of the numeric columns with the encoded
    ``class`` column is printed as a reference threshold. It was used by an
    earlier, now disabled, automatic selection of interaction features.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame including the ``class`` column. It is not modified.
    cat_cols, num_cols : list of str
        Column names to build pairs from (``'class'`` is skipped).

    Returns
    -------
    pandas.DataFrame
        ``train`` with the selected interaction columns appended.

    Raises
    ------
    KeyError
        If a feature in ``new_cols`` cannot be built from the given columns.
    """
    all_columns = cat_cols + num_cols

    # Encode the target on a copy instead of overwriting train['class'] and
    # restoring it afterwards; sort=True yields codes in sorted label order.
    class_codes, _ = pd.factorize(train['class'], sort=True)
    encoded = train.assign(**{'class': class_codes})

    # numeric_only=True keeps this working when string columns are present.
    corr_with_class = encoded.corr(numeric_only=True)['class']
    threshold = corr_with_class.abs().mean()
    print(f" Mean Correlation of Original Data {threshold}")

    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)
    print(train.columns)

    # Interaction features to keep. The disabled automatic selection kept
    # every product whose |corr with class| exceeded `threshold`.
    new_cols = ['habitat x cap-diameter']
    wanted = set(new_cols)

    # Build only the requested products, then create the frame once rather
    # than joining one column at a time.
    products = {}
    for col1, col2 in itertools.combinations(filtered_cols, 2):
        name = f'{col1} x {col2}'
        if name in wanted:
            products[name] = train[col1] * train[col2]

    train_combinations = pd.DataFrame(products, index=train.index)

    return train.join(train_combinations[new_cols])


In [5]:
def find_test_combinations(test, cat_cols, num_cols, train_columns):
    """Rebuild on ``test`` the interaction features present in the training frame.

    Interaction features are element-wise products named
    ``'<col1> x <col2>'``, where the pair order follows
    ``cat_cols + num_cols``. Only products whose name appears in
    ``train_columns`` are computed.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame to extend. It is not modified.
    cat_cols, num_cols : list of str
        Column names to build pairs from (``'class'`` is skipped).
    train_columns : pandas.Index or list of str
        Columns of the training frame; ``'class'`` is removed if present.

    Returns
    -------
    pandas.DataFrame
        ``test`` plus the required interaction columns, restricted to and
        ordered like ``train_columns`` without ``'class'``.

    Raises
    ------
    KeyError
        If a requested column is neither in ``test`` nor buildable as a
        product of the given columns.
    """
    all_columns = cat_cols + num_cols

    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)

    # ? Remove 'class' feature from test set (tolerate it being absent)
    wanted_columns = pd.Index(train_columns).drop('class', errors='ignore')
    wanted_names = set(wanted_columns)

    # Compute only the products that will be returned, and build the frame in
    # one go rather than joining column by column.
    products = {}
    for col1, col2 in itertools.combinations(filtered_cols, 2):
        name = f'{col1} x {col2}'
        if name in wanted_names:
            products[name] = test[col1] * test[col2]

    test_combinations = pd.DataFrame(products, index=test.index)

    return test.join(test_combinations)[wanted_columns]

In [6]:
def aggregate(df):
    """Attach the per-``cap-shape`` median of ``cap-diameter`` to every row.

    The medians are computed from ``df`` itself and left-merged back on
    ``cap-shape`` as the column ``'cap-shape x cap-diameter median'``.
    The merged result is a new frame with a fresh 0..n-1 index.
    """
    median_by_shape = (
        df.groupby(['cap-shape'])['cap-diameter']
        .median()
        .rename("cap-shape x cap-diameter median")
        .reset_index()
    )

    return df.merge(median_by_shape, how='left', on="cap-shape")

In [7]:

def model(variable):
    """Run one end-to-end experiment: load, clean, encode, train, score, submit.

    Steps
    -----
    1. Read ``train.csv`` / ``test.csv`` and drop the ``id`` column.
    2. Split the training data 80/10/10. Note the naming: ``X_test`` /
       ``y_test`` is the set XGBoost monitors while fitting (``eval_set``),
       whereas ``X_val`` / ``y_val`` is the held-out set used for the MCC
       that is reported and stored.
    3. Clean the categorical columns with ``replace_non_alpha_with_nan``.
    4. Encode all frames with ``encode_train_and_test_data``.
    5. Fit an ``XGBClassifier`` and score it with the Matthews correlation
       coefficient.
    6. Predict the Kaggle test set and write ``version_9_submission.csv``.

    Parameters
    ----------
    variable : float
        Experiment parameter forwarded to ``encode_train_and_test_data``;
        also the key under which the score is stored in ``smoothing``.

    Side effects
    ------------
    Writes intermediate frames and the score into the module-level
    ``smoothing`` dict, prints progress information, and writes the
    submission CSV. Returns ``None``.
    """
    from sklearn.metrics import matthews_corrcoef
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder

    train = pd.read_csv('train.csv')
    # (An earlier experiment concatenated 'secondary_data.csv' here.)
    test = pd.read_csv('test.csv')
    train = train.drop('id', axis = 1)
    test = test.drop('id', axis = 1)

    y = train['class']
    X = train.drop('class', axis = 1)

    # 80 % train, then the remaining 20 % is halved into validation / test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

    # Store a real snapshot: without the copy this entry would reference the
    # same object as the cleaned frame if the cleaning step edits in place.
    smoothing['X_train before'] = X_train.copy()

    cat_cols = [col for col in train.select_dtypes('object').columns if col != 'class']
    num_cols = [col for col in train.select_dtypes('number').columns]
    print(f'Categorical columns:\n {cat_cols}\n')
    print(f'Numeric columns:\n {num_cols}')

    X_train = replace_non_alpha_with_nan(X_train, cat_cols)
    X_test = replace_non_alpha_with_nan(X_test, cat_cols)
    X_val = replace_non_alpha_with_nan(X_val, cat_cols)
    test = replace_non_alpha_with_nan(test, cat_cols)

    smoothing['X_train'] = X_train

    # Encode the target; the encoder is fitted on the training labels only.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    y_val = le.transform(y_val)

    # Disabled experiment: aggregate() on each of the four frames.

    print(f"X_train columns  {X_train.columns}")

    X_train, X_test, X_val, test = encode_train_and_test_data(X_train, y_train, X_test, X_val, test, num_cols, cat_cols, variable)

    smoothing['X_train after encoding'] = X_train

    # Disabled experiment: find_train_combinations / find_test_combinations.

    def handle_missing_data(df_transformed):
        # Fill any NaN left after encoding with the sentinel -10 and report
        # the remaining per-column null counts.
        df_transformed = df_transformed.fillna(-10)

        print("Missing values after imputation:")
        print(df_transformed.isnull().sum())
        return df_transformed

    X_train = handle_missing_data(X_train)
    X_test = handle_missing_data(X_test)
    X_val = handle_missing_data(X_val)
    test = handle_missing_data(test)

    def mcc_metric(y_pred, dmatrix):
        # Custom evaluation metric: MCC of the 0.5-thresholded predictions.
        y_true = dmatrix.get_label()
        y_pred = (y_pred > 0.5).astype(int)
        mcc = matthews_corrcoef(y_true, y_pred)
        return 'mcc', mcc

    # NOTE(review): the captured run output contains a device-mismatch hint
    # from XGBoost (model on 'cuda', inputs are pandas frames) - confirm
    # whether that matters for this setup.
    model = XGBClassifier(
        device = 'cuda',
        colsample_bytree = 0.6,
        max_depth = 14,
        min_child_weight = 7,
        random_state = 42,
        n_estimators = 200,
    )
    # NOTE(review): passing eval_metric to fit() works with the XGBoost
    # version used for the recorded run; check the XGBoost docs before
    # upgrading, as newer releases may expect it on the constructor.
    XGB = model.fit(
        X_train,
        y_train,
        eval_set = [(X_test, y_test)],
        eval_metric = mcc_metric # * Monitoring only; does not affect model training
        )

    # Final score on the split that was not used as eval_set.
    y_pred = XGB.predict(X_val)
    score = matthews_corrcoef(y_val, y_pred)
    print('MCC', score)
    smoothing[variable] = score

    # info() prints by itself and returns None, so it is not wrapped in print().
    test.info()

    # predict() returns encoded class labels (not probabilities); map them
    # back to the original labels for the submission file.
    test_pred = XGB.predict(test)
    test_pred_class = le.inverse_transform(test_pred)
    submission = pd.read_csv('sample_submission.csv')

    submission['class'] = test_pred_class
    submission.to_csv('version_9_submission.csv', index = False)

In [8]:
# Shared scratch dict: the functions above write intermediate frames, the
# encoded feature names and the scores into it, so it must exist before
# model() is called.
smoothing = {}

# Earlier sweep over the `variable` argument (kept for reference):
# for i in np.linspace(5,20,4):
#     model(i)
# model()
# Single run; the resulting MCC is stored under smoothing[9.0].
model(9.0)

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']
X_train columns  Index(['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')
All Columns ['num__cap-diameter' 'num__stem-height' 'num__stem-width'
 'cat__cap-shape_b' 'cat__cap-shape_c' 'cat__cap-shape_f'
 'cat__cap-shape_o' 'cat__cap-shape_p' 'cat__cap-shape_s'
 'cat__cap-shape_x' 'cat__cap-shape_nan' 'cat__cap-surface_d'
 'cat__cap-surface_e' 'cat__cap-surface_g'



[0]	validation_0-logloss:0.47682	validation_0-mcc:0.89619
[1]	validation_0-logloss:0.33956	validation_0-mcc:0.96131
[2]	validation_0-logloss:0.28013	validation_0-mcc:0.96287
[3]	validation_0-logloss:0.22134	validation_0-mcc:0.96903
[4]	validation_0-logloss:0.16872	validation_0-mcc:0.97399
[5]	validation_0-logloss:0.13979	validation_0-mcc:0.97618
[6]	validation_0-logloss:0.11106	validation_0-mcc:0.97829
[7]	validation_0-logloss:0.09386	validation_0-mcc:0.97930
[8]	validation_0-logloss:0.07996	validation_0-mcc:0.98013
[9]	validation_0-logloss:0.07107	validation_0-mcc:0.98056
[10]	validation_0-logloss:0.06334	validation_0-mcc:0.98088
[11]	validation_0-logloss:0.05820	validation_0-mcc:0.98119
[12]	validation_0-logloss:0.05475	validation_0-mcc:0.98137
[13]	validation_0-logloss:0.05180	validation_0-mcc:0.98157
[14]	validation_0-logloss:0.04851	validation_0-mcc:0.98198
[15]	validation_0-logloss:0.04647	validation_0-mcc:0.98216
[16]	validation_0-logloss:0.04498	validation_0-mcc:0.98223
[17]	va

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




MCC 0.9840858983233973
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077964 entries, 0 to 2077963
Columns: 135 entries, num__cap-diameter to cat__season_w
dtypes: float64(135)
memory usage: 2.1 GB
None


In [9]:
# Check whether the median feature produced by aggregate() is among the
# encoded feature names recorded by the last model() run.
median_feature = 'cap-shape x cap-diameter median'
if median_feature in smoothing['feature names']:
    print('hello')

In [10]:
# MCC stored by model(9.0), computed on the X_val / y_val split.
smoothing[9.0]

0.9840858983233973

In [11]:

# * NEW BENCHMARK FOR ONE HOT ENCODING
# 0.9843645395934322
# One-hot encoding fitted on train only (fit_transform), splitting missing values and noise: 0.9843650325175951

In [12]:
# dict(sorted(smoothing.items(), key=lambda item: item[1]))

One-Hot Encoding

Missing values (NaN) in a categorical column get their own indicator feature after one-hot encoding (e.g. `cat__cap-shape_nan`).

In [13]:
# 1. MCC 0.9841636261609926
# 2. MCC 0.9842990664421013

In [14]:

# * Target Encoding
#  {np.float64(2.0): np.float64(0.984050532687097),
#  np.float64(7.0): np.float64(0.9841408416248388),
#  np.float64(1.0): np.float64(0.984153302705847),
#  np.float64(8.0): np.float64(0.9842243023588968),
#  np.float64(6.0): np.float64(0.9842284725563117),
#  np.float64(10.0): np.float64(0.984234247240041),
#  np.float64(3.0): np.float64(0.9842414180099538),
#  np.float64(4.0): np.float64(0.984298975666255),
#  np.float64(5.0): np.float64(0.9843180598293413),
#  np.float64(9.0): np.float64(0.9843415677666381)}

In [15]:

# * One Hot Encoding
# * One Hot Encoding = MCC 0.9845601293900792

In [16]:


# * With train, test, and validation set 
# * One Hot Encoding = MCC 0.9845601293900792
# * Target Encoding Smoothing = 9.0 cv = 5: MCC 0.9843784324361466
# * Target Encoding Smoothing = 9.0 cv = 10 : MCC 0.9844689041665822
# * Target Encoding Category_encoders Smoothing = 9.0 : MCC 0.984527270385196
# * Ordinal Encoding : MCC 0.9844821290074579