Target Encoding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier
import gc

pd.set_option('display.max_columns', None)
smoothing = {}



In [2]:
def replace_non_alpha_with_nan(df, categories):
    # cols_to_filter = ['cap-shape', 'cap-surface', 'cap-color', 
    #                   'does-bruise-or-bleed', 'gill-attachment', 
    #                   'gill-spacing', 'gill-color', 'stem-surface', 
    #                   'stem-color', 'has-ring', 'ring-type', 'habitat', 'stem-root', 'veil-type', 'veil-color', 'spore-print-color']

    cols_to_filter = categories
    

    alphabet_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 
                     'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

    # col_values = {}
    # for col in cols_to_filter:
    #     value_counts = train[col].value_counts()# ? ONLY Based on trained dataset
    #     col_values[col] = value_counts[value_counts > 10].index.values.tolist()


    # def filter_alpha(value, value_list_no_outliers):
    #     if isinstance(value, str):
    #         return value if len(value) == 1 and value in value_list_no_outliers and value in alphabet_list else np.nan # if value is a single character
        
    #     return np.nan
    
    # for col in cols_to_filter:
    #     df[col] = df[col].apply(lambda x : filter_alpha(x, col_values[col]))


    # * Customized feature engineering
    features_dict = {
        'cap_shape': ['x', 'f', 's', 'b', 'o', 'p', 'c'],
        'cap_surface': ['t', 's', 'y', 'h', 'g', 'd', 'k', 'e', 'i', 'w', 'l'],
        'cap_color': ['n', 'y', 'w', 'g', 'e', 'o', 'p', 'r', 'u', 'b', 'k', 'l'],
        'does_bruise_or_bleed': ['f', 't'],
        'gill_attachment': ['a', 'd', 'x', 'e', 's', 'p', 'f'],
        'gill_spacing': ['c', 'd', 'f'],
        'gill_color': ['w', 'n', 'y', 'p', 'g', 'o', 'k', 'f', 'r', 'e', 'b', 'u'],
        'stem_root': ['b', 's', 'r', 'c', 'f'],
        'stem_surface': ['s', 'y', 'i', 't', 'g', 'k', 'h', 'f'],
        'stem_color': ['w', 'n', 'y', 'g', 'o', 'e', 'u', 'p', 'k', 'r', 'l', 'b', 'f'],
        'veil_type': ['u'],
        'veil_color': ['w', 'y', 'n', 'u', 'k', 'e'],
        'has_ring': ['f', 't'],
        'ring_type': ['f', 'e', 'z', 'l', 'r', 'p', 'g', 'm'],
        'spore_print_color': ['k', 'p', 'w', 'n', 'r', 'u', 'g'],
        'habitat': ['d', 'g', 'l', 'm', 'h', 'w', 'p', 'u'],
        'season': ['a', 'u', 'w', 's']
    }


    for classes, cols  in zip(features_dict.keys(), cols_to_filter):
        df.loc[(~df[cols].isin(features_dict[classes])) & pd.notna(df[cols]), cols] = 'missing'

    return df

In [3]:
def encode_train_and_test_data(df_train, target, df_test, df_validation, kaggle_test, num_cols, cat_cols, variable):

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors = 3))
    ])

    categorical_transformer = Pipeline(steps = [
        # ('imputer', SimpleImputer(strategy = 'most_frequent')),
        # ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        # ('Target', TargetEncoder(smoothing=variable)),
        ('OneHot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))

        # ('adjust', FunctionTransformer(lambda x : x + 1)) # * Adjust function
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ],
        remainder = 'passthrough'
    )



    # * Target Encoding
    # train_te = preprocessor.fit_transform(df_train[all_columns], target)
    # test_te = preprocessor.transform(df_test[all_columns])
    # val_te = preprocessor.transform(df_validation[all_columns])

    all_columns = num_cols + cat_cols


    train_te = preprocessor.fit_transform(df_train[all_columns])
    test_te = preprocessor.transform(df_test[all_columns]) # ? Only fit_transform on train data or else might cause inconsistent feature_names_out
    val_te = preprocessor.transform(df_validation[all_columns])


    feature_names_out = preprocessor.get_feature_names_out()

    smoothing['feature names'] = feature_names_out

    feature_columns = feature_names_out

    # print(f"All Columns {feature_names_out}")

    # print(f"Train Transformed = {train_te}")

    df_train_transformed = pd.DataFrame(train_te, columns = feature_columns)
    df_test_transformed = pd.DataFrame(test_te, columns = feature_columns)
    df_validation_transformed = pd.DataFrame(val_te, columns = feature_columns)

    # print(preprocessor['cat'].get_feature_names_out())

    # df_transformed = pd.DataFrame(preprocessor.fit_transform(df[num_cols + cat_cols], df['class']), columns = num_cols + cat_cols)

    df_train_final = df_train[['cap-shape x cap-diameter median']].join(df_train_transformed)
    df_test_final = df_test[['cap-shape x cap-diameter median']].join(df_test_transformed)
    df_validation_final = df_validation[['cap-shape x cap-diameter median']].join(df_validation_transformed)


    kaggle_test_te = preprocessor.transform(kaggle_test[all_columns])
    kaggle_test_transformed = pd.DataFrame(kaggle_test_te, columns = feature_names_out)
    kaggle_test_final = kaggle_test[['cap-shape x cap-diameter median']].join(kaggle_test_transformed)


    return df_train_final, df_test_final, df_validation_final, kaggle_test_final

In [4]:
import itertools
from sklearn.preprocessing import LabelEncoder

def find_train_combinations(train, cat_cols, num_cols):
    
    
    all_columns = cat_cols + num_cols

    # ? returning features from train_combinations with correlations greater than the mean of the original
    
    ord_enc = LabelEncoder()
    train['class'] = ord_enc.fit_transform(train['class'])

    corr_matrix = train.corr()
    
    threshold = abs(corr_matrix['class']).sort_values(ascending=False).mean()
    print(f" Mean Correlation of Original Data {threshold}")


    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)
    print(train.columns)
    combinations = itertools.combinations(filtered_cols, 2)
    print(combinations)

    train_combinations = train['class'].to_frame()

    for col1, col2 in combinations:
       combination = train[col1] * train[col2]
       train_combinations = train_combinations.join(combination.rename(f'{col1} x {col2}'))
    
    # ? returning features from train_combinations with correlations greater than the mean of the original

    # corr_combinations = train_combinations.corr()
    # abs_values = abs(corr_combinations['class'])
    # new_cols = abs_values.loc[abs_values > threshold].index.tolist()
    # if 'class' in new_cols:
    #     new_cols.remove('class')
    new_cols = ['habitat x cap-diameter']
    
    
    train['class'] = ord_enc.inverse_transform(train['class'])
    return train.join(train_combinations[new_cols])


In [5]:
def find_test_combinations(test, cat_cols, num_cols, train_columns):
    

    all_columns = cat_cols + num_cols

    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)
    combinations = itertools.combinations(filtered_cols, 2)

    test_combinations = pd.DataFrame(index = test.index)

    for col1, col2 in combinations:
       combination = test[col1] * test[col2]
       test_combinations = test_combinations.join(combination.rename(f'{col1} x {col2}'))
    
    # ? Remove 'class' feature from test set
    train_columns = train_columns.drop('class')

    test = test.join(test_combinations)

    return test[train_columns]

In [6]:
def aggregate(df):


    merging = df.groupby(['cap-shape'])['cap-diameter'].median().reset_index()

    names = {"cap-diameter" : "cap-shape x cap-diameter median"}

    merging = merging.rename(columns = names)


    return pd.merge(df, merging, on = "cap-shape", how = 'left')

In [7]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

y = train['class']
X = train.drop('class', axis = 1),

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

ValueError: Found input variables with inconsistent numbers of samples: [1, 3116945]

In [None]:
cat_cols = [col for col in train.select_dtypes('object').columns if col != 'class']
num_cols = [col for col in train.select_dtypes('number').columns]
print(f'Categorical columns:\n {cat_cols}\n')
print(f'Numeric columns:\n {num_cols}')

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']


In [None]:
X_train = replace_non_alpha_with_nan(X_train, cat_cols)
X_test = replace_non_alpha_with_nan(X_test, cat_cols)
X_val = replace_non_alpha_with_nan(X_val, cat_cols)
test = replace_non_alpha_with_nan(test, cat_cols)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)
y_val = le.fit_transform(y_val)

In [None]:
X_train = aggregate(X_train)
X_test = aggregate(X_test)
X_val = aggregate(X_val)
test = aggregate(test)

In [None]:

print(f"X_train columns  {X_train.columns}")
X_train, X_test, X_val, test = encode_train_and_test_data(X_train, y_train, X_test, X_val, test, num_cols, cat_cols, 9.0)

X_train columns  Index(['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season', 'cap-shape x cap-diameter median'],
      dtype='object')


In [None]:
smoothing['feature names']

array(['num__cap-diameter', 'num__stem-height', 'num__stem-width',
       'cat__cap-shape_b', 'cat__cap-shape_c', 'cat__cap-shape_f',
       'cat__cap-shape_missing', 'cat__cap-shape_o', 'cat__cap-shape_p',
       'cat__cap-shape_s', 'cat__cap-shape_x', 'cat__cap-shape_nan',
       'cat__cap-surface_d', 'cat__cap-surface_e', 'cat__cap-surface_g',
       'cat__cap-surface_h', 'cat__cap-surface_i', 'cat__cap-surface_k',
       'cat__cap-surface_l', 'cat__cap-surface_missing',
       'cat__cap-surface_s', 'cat__cap-surface_t', 'cat__cap-surface_w',
       'cat__cap-surface_y', 'cat__cap-surface_nan', 'cat__cap-color_b',
       'cat__cap-color_e', 'cat__cap-color_g', 'cat__cap-color_k',
       'cat__cap-color_l', 'cat__cap-color_missing', 'cat__cap-color_n',
       'cat__cap-color_o', 'cat__cap-color_p', 'cat__cap-color_r',
       'cat__cap-color_u', 'cat__cap-color_w', 'cat__cap-color_y',
       'cat__cap-color_nan', 'cat__does-bruise-or-bleed_f',
       'cat__does-bruise-or-bleed_missing

In [None]:










# smoothing['Series'] = X_train['cat__cap-shape_nan']

# train = find_train_combinations(train, cat_cols, num_cols) # ! Change train_new back to train after testing
# test = find_test_combinations(test, cat_cols, num_cols, train.columns)
def handle_missing_data(df_transformed):
    
    df_transformed = df_transformed.fillna(-10)
    print("Missing values after imputation:")
    print(df_transformed.isnull().sum())
    return df_transformed

X_train = handle_missing_data(X_train)
X_test = handle_missing_data(X_test)
X_val = handle_missing_data(X_val)
test = handle_missing_data(test)


from sklearn.metrics import matthews_corrcoef
def mcc_metric(y_pred, dmatrix):
    y_true = dmatrix.get_label()
    y_pred = (y_pred > 0.5).astype(int) 
    mcc = matthews_corrcoef(y_true, y_pred)
    return 'mcc', mcc

model = XGBClassifier(
    colsample_bytree = 0.6,
    max_depth = 14,
    min_child_weight = 7,
    random_state = 42,
    n_estimators = 200,
)
XGB = model.fit(
    X_train, 
    y_train, 
    eval_set = [(X_test, y_test)],
    eval_metric = mcc_metric # * Only Visual does not affect model training
    )


y_pred = XGB.predict(X_val)
score = matthews_corrcoef(y_val, y_pred)
print('MCC', score)
smoothing['score'] = score
print(test.info())


test_pred_prob = XGB.predict(test)
test_pred_class = le.inverse_transform(test_pred_prob)
submission = pd.read_csv('sample_submission.csv')
submission['class'] = test_pred_class
submission.to_csv('version_9_submission.csv', index = False)

Missing values after imputation:
cap-shape x cap-diameter median    0
num__cap-diameter                  0
num__stem-height                   0
num__stem-width                    0
cat__cap-shape_b                   0
                                  ..
cat__habitat_nan                   0
cat__season_a                      0
cat__season_s                      0
cat__season_u                      0
cat__season_w                      0
Length: 152, dtype: int64
Missing values after imputation:
cap-shape x cap-diameter median    0
num__cap-diameter                  0
num__stem-height                   0
num__stem-width                    0
cat__cap-shape_b                   0
                                  ..
cat__habitat_nan                   0
cat__season_a                      0
cat__season_s                      0
cat__season_u                      0
cat__season_w                      0
Length: 152, dtype: int64
Missing values after imputation:
cap-shape x cap-diameter median    



[0]	validation_0-logloss:0.48448	validation_0-mcc:0.82754
[1]	validation_0-logloss:0.41901	validation_0-mcc:0.82925
[2]	validation_0-logloss:0.32638	validation_0-mcc:0.90093
[3]	validation_0-logloss:0.25288	validation_0-mcc:0.94453
[4]	validation_0-logloss:0.21479	validation_0-mcc:0.93127
[5]	validation_0-logloss:0.18986	validation_0-mcc:0.93402
[6]	validation_0-logloss:0.15349	validation_0-mcc:0.94973
[7]	validation_0-logloss:0.13043	validation_0-mcc:0.95820
[8]	validation_0-logloss:0.11155	validation_0-mcc:0.96202
[9]	validation_0-logloss:0.10538	validation_0-mcc:0.95984
[10]	validation_0-logloss:0.09868	validation_0-mcc:0.95920
[11]	validation_0-logloss:0.08810	validation_0-mcc:0.96522
[12]	validation_0-logloss:0.07909	validation_0-mcc:0.96846
[13]	validation_0-logloss:0.07427	validation_0-mcc:0.96929
[14]	validation_0-logloss:0.07141	validation_0-mcc:0.96932
[15]	validation_0-logloss:0.06549	validation_0-mcc:0.97255
[16]	validation_0-logloss:0.06054	validation_0-mcc:0.97464
[17]	va

In [None]:
np.linspace(0,10,21)

array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,
        5.5,  6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. ])

In [None]:
smoothing['score']

np.float64(0.9818728560746158)

In [None]:
test

Unnamed: 0,cap-shape x cap-diameter median,num__cap-diameter,num__stem-height,num__stem-width,cat__cap-shape_b,cat__cap-shape_c,cat__cap-shape_f,cat__cap-shape_missing,cat__cap-shape_o,cat__cap-shape_p,cat__cap-shape_s,cat__cap-shape_x,cat__cap-shape_nan,cat__cap-surface_d,cat__cap-surface_e,cat__cap-surface_g,cat__cap-surface_h,cat__cap-surface_i,cat__cap-surface_k,cat__cap-surface_l,cat__cap-surface_missing,cat__cap-surface_s,cat__cap-surface_t,cat__cap-surface_w,cat__cap-surface_y,cat__cap-surface_nan,cat__cap-color_b,cat__cap-color_e,cat__cap-color_g,cat__cap-color_k,cat__cap-color_l,cat__cap-color_missing,cat__cap-color_n,cat__cap-color_o,cat__cap-color_p,cat__cap-color_r,cat__cap-color_u,cat__cap-color_w,cat__cap-color_y,cat__cap-color_nan,cat__does-bruise-or-bleed_f,cat__does-bruise-or-bleed_missing,cat__does-bruise-or-bleed_t,cat__does-bruise-or-bleed_nan,cat__gill-attachment_a,cat__gill-attachment_d,cat__gill-attachment_e,cat__gill-attachment_f,cat__gill-attachment_missing,cat__gill-attachment_p,cat__gill-attachment_s,cat__gill-attachment_x,cat__gill-attachment_nan,cat__gill-spacing_c,cat__gill-spacing_d,cat__gill-spacing_f,cat__gill-spacing_missing,cat__gill-spacing_nan,cat__gill-color_b,cat__gill-color_e,cat__gill-color_f,cat__gill-color_g,cat__gill-color_k,cat__gill-color_missing,cat__gill-color_n,cat__gill-color_o,cat__gill-color_p,cat__gill-color_r,cat__gill-color_u,cat__gill-color_w,cat__gill-color_y,cat__gill-color_nan,cat__stem-root_b,cat__stem-root_c,cat__stem-root_f,cat__stem-root_missing,cat__stem-root_r,cat__stem-root_s,cat__stem-root_nan,cat__stem-surface_f,cat__stem-surface_g,cat__stem-surface_h,cat__stem-surface_i,cat__stem-surface_k,cat__stem-surface_missing,cat__stem-surface_s,cat__stem-surface_t,cat__stem-surface_y,cat__stem-surface_nan,cat__stem-color_b,cat__stem-color_e,cat__stem-color_f,cat__stem-color_g,cat__stem-color_k,cat__stem-color_l,cat__stem-color_missing,cat__stem-color_n,cat__stem-color_o,cat__stem-color_p,cat__stem-color_r,cat__stem-color_u,cat__stem-color_w,cat__stem-color_y,cat__stem-color_nan,cat__veil-type_missing,cat__veil-type_u,cat__veil-type_nan,cat__veil-color_e,cat__veil-color_k,cat__veil-color_missing,cat__veil-color_n,cat__veil-color_u,cat__veil-color_w,cat__veil-color_y,cat__veil-color_nan,cat__has-ring_f,cat__has-ring_missing,cat__has-ring_t,cat__has-ring_nan,cat__ring-type_e,cat__ring-type_f,cat__ring-type_g,cat__ring-type_l,cat__ring-type_m,cat__ring-type_missing,cat__ring-type_p,cat__ring-type_r,cat__ring-type_z,cat__ring-type_nan,cat__spore-print-color_g,cat__spore-print-color_k,cat__spore-print-color_missing,cat__spore-print-color_n,cat__spore-print-color_p,cat__spore-print-color_r,cat__spore-print-color_u,cat__spore-print-color_w,cat__spore-print-color_nan,cat__habitat_d,cat__habitat_g,cat__habitat_h,cat__habitat_l,cat__habitat_m,cat__habitat_missing,cat__habitat_p,cat__habitat_u,cat__habitat_w,cat__habitat_nan,cat__season_a,cat__season_s,cat__season_u,cat__season_w
0,6.06,8.64,11.13,17.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,6.54,6.90,1.27,10.75,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3.17,2.00,6.18,3.14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,6.06,3.47,4.98,8.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6.06,6.17,6.73,13.70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,6.06,0.88,2.67,1.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2077960,6.06,3.12,2.69,7.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2077961,6.06,5.73,6.16,9.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2077962,3.17,5.03,6.00,3.46,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:

# for i in np.linspace(5,20,4):
#     model(i)
# model()
# model(9.0)

In [None]:
if 'cap-shape x cap-diameter median' in smoothing['feature names']:
    print('hello')

In [None]:
# smoothing[9.0]

In [None]:
# dict(sorted(smoothing.items(), key=lambda item: item[1]))

One Hot Encoding 

NaNs in categorical columns have their own feature after one hot encoding

In [None]:
# 1. MCC 0.9841636261609926
# 2. MCC 0.9842990664421013

In [None]:

# * Target Encoding
#  {np.float64(2.0): np.float64(0.984050532687097),
#  np.float64(7.0): np.float64(0.9841408416248388),
#  np.float64(1.0): np.float64(0.984153302705847),
#  np.float64(8.0): np.float64(0.9842243023588968),
#  np.float64(6.0): np.float64(0.9842284725563117),
#  np.float64(10.0): np.float64(0.984234247240041),
#  np.float64(3.0): np.float64(0.9842414180099538),
#  np.float64(4.0): np.float64(0.984298975666255),
#  np.float64(5.0): np.float64(0.9843180598293413),
#  np.float64(9.0): np.float64(0.9843415677666381)}

In [None]:

# * One Hot Encoding

In [None]:


# * With train, test, and validation set 
# * One Hot Encoding = MCC 0.9845601293900792
# * Target Encoding Smoothing = 9.0 cv = 5: MCC 0.9843784324361466
# * Target Encoding Smoothing = 9.0 cv = 10 : MCC 0.9844689041665822
# * Target Encoding Category_encoders Smoothing = 9.0 : MCC 0.984527270385196
# * Ordinal Encoding : MCC 0.9844821290074579

In [None]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

y = train['class']
X = train.drop('class', axis = 1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 42)


In [None]:
X_test

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
1745452,4.00,f,t,n,f,a,d,y,7.55,7.06,,k,o,,,f,f,,d,s
288331,9.30,o,,n,f,,,w,5.00,26.53,,,n,,,f,f,,d,u
421615,8.74,f,,w,f,d,c,y,2.72,18.36,,,w,,,f,f,,d,w
2794259,3.75,s,d,g,t,d,c,p,4.27,5.97,,,g,,,f,f,,d,u
78948,15.30,s,t,k,f,a,d,y,4.83,22.90,,,w,,,f,f,w,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1584520,5.57,b,h,k,f,s,d,w,5.56,11.97,,s,g,,,f,f,,d,a
2921070,5.62,f,y,n,f,s,,g,5.08,8.44,,,w,,,f,f,,d,a
2053944,6.23,f,y,n,f,e,c,p,3.63,11.73,,,w,,,t,l,,w,a
44159,7.30,x,,g,t,p,,w,11.47,26.34,,y,w,,,f,f,,d,u


In [None]:
X_train

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
586767,4.95,x,,p,f,,c,w,3.80,12.36,,,y,,,f,f,,d,a
2665706,1.47,b,y,n,f,,c,k,5.23,3.09,,,w,,,f,f,,d,u
1270574,10.29,x,e,l,f,x,c,w,7.17,20.57,,,w,,,f,f,,d,a
2935630,7.45,s,d,n,t,d,c,n,6.20,8.83,,,e,,,f,f,,d,a
3113991,5.44,x,,e,f,,c,r,3.23,12.29,,,y,,,f,f,,d,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1614808,11.93,x,e,y,f,x,c,w,7.61,24.34,,,w,,,f,f,,d,u
912625,6.87,x,e,e,f,a,,g,8.31,10.35,,,y,,w,t,z,,d,u
2152985,4.09,f,t,n,f,d,,o,6.00,7.38,,t,w,,,f,f,,d,a
2818983,3.58,f,s,w,f,d,d,w,3.31,5.68,,,w,,,f,f,,g,a


In [None]:
X

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.80,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w
2,6.94,f,s,b,f,x,c,w,6.85,9.93,,s,n,,,f,f,,l,w
3,3.88,f,y,g,f,s,,g,4.16,6.53,,,w,,,f,f,,d,u
4,5.85,x,l,w,f,d,,w,3.37,8.36,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,9.29,f,,n,t,,,w,12.14,18.81,b,,w,u,w,t,g,,d,u
3116941,10.88,s,,w,t,d,c,p,6.65,26.97,,,w,,,f,f,,d,u
3116942,7.82,x,e,e,f,a,,w,9.51,11.06,,,y,,w,t,z,,d,a
3116943,9.45,p,i,n,t,e,,p,9.13,17.77,,y,w,,,t,p,,d,u
