Target Encoding

In [63]:
import gc
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
# from sklearn.preprocessing import TargetEncoder
from xgboost import XGBClassifier

# Notebook-wide display setting: show every column of wide DataFrames
# instead of truncating the middle ones.
pd.set_option('display.max_columns', None)



In [64]:
def replace_non_alpha_with_nan(df, categories):
    """Replace unexpected category codes with NaN in the given columns.

    Each column named in ``categories`` is compared against the whitelist of
    codes accepted for that feature; every value outside the whitelist
    (including values that are already missing) ends up as ``np.nan``.

    Columns are matched to their whitelist by *name* (a hyphen in the column
    name is treated like an underscore), so the order in which the columns are
    passed does not matter.  A column that has no whitelist entry is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns to clean.
    categories : iterable of str
        Names of the columns to filter.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``.  The frame passed in is not modified, which
        also avoids chained-assignment warnings when ``df`` is a slice of a
        larger frame.

    Raises
    ------
    KeyError
        If a whitelisted column named in ``categories`` is not present in ``df``.
    """
    # * Customized feature engineering: accepted single-letter codes per feature
    features_dict = {
        'cap_shape': ['x', 'f', 's', 'b', 'o', 'p', 'c'],
        'cap_surface': ['t', 's', 'y', 'h', 'g', 'd', 'k', 'e', 'i', 'w', 'l'],
        'cap_color': ['n', 'y', 'w', 'g', 'e', 'o', 'p', 'r', 'u', 'b', 'k', 'l'],
        'does_bruise_or_bleed': ['f', 't'],
        'gill_attachment': ['a', 'd', 'x', 'e', 's', 'p', 'f'],
        'gill_spacing': ['c', 'd', 'f'],
        'gill_color': ['w', 'n', 'y', 'p', 'g', 'o', 'k', 'f', 'r', 'e', 'b', 'u'],
        'stem_root': ['b', 's', 'r', 'c', 'f'],
        'stem_surface': ['s', 'y', 'i', 't', 'g', 'k', 'h', 'f'],
        'stem_color': ['w', 'n', 'y', 'g', 'o', 'e', 'u', 'p', 'k', 'r', 'l', 'b', 'f'],
        'veil_type': ['u'],
        'veil_color': ['w', 'y', 'n', 'u', 'k', 'e'],
        'has_ring': ['f', 't'],
        'ring_type': ['f', 'e', 'z', 'l', 'r', 'p', 'g', 'm'],
        'spore_print_color': ['k', 'p', 'w', 'n', 'r', 'u', 'g'],
        'habitat': ['d', 'g', 'l', 'm', 'h', 'w', 'p', 'u'],
        'season': ['a', 'u', 'w', 's']
    }

    cleaned = df.copy()

    for col in categories:
        # Look the whitelist up by name rather than by position so that a
        # reordered or partial column list can never be paired with the
        # whitelist of a different feature.
        allowed = features_dict.get(col.replace('-', '_'))
        if allowed is None:
            continue
        # Keep whitelisted values; everything else becomes NaN.
        cleaned[col] = cleaned[col].where(cleaned[col].isin(allowed))

    return cleaned

In [65]:
def encode_train_and_test_data(df_train, target, df_test, kaggle_test, num_cols, cat_cols, variable):
    """Fit the preprocessing on the training split and apply it to all frames.

    Numeric columns are imputed with a 3-nearest-neighbour imputer and
    categorical columns are one-hot encoded (categories not seen while fitting
    are ignored).  The transformer is fitted on ``df_train`` only and then
    applied as-is to ``df_test`` and ``kaggle_test``.

    ``target`` and ``variable`` are accepted but are not used by the current
    one-hot configuration.

    Side effects: the output feature names are stored in the module-level
    ``smoothing`` dict under ``'feature names'``, and both the feature names
    and the transformed training array are printed.

    Parameters
    ----------
    df_train, df_test, kaggle_test : pandas.DataFrame
        Frames containing at least ``num_cols + cat_cols``.
    target : unused
    num_cols, cat_cols : list of str
        Numeric and categorical column names.
    variable : unused

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, kaggle_test)`` after transformation.  The frames are
        rebuilt from the transformed arrays, so they carry the transformer's
        feature names as columns and a fresh default row index.
    """
    selected_columns = num_cols + cat_cols

    impute_numeric = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=3)),
    ])
    encode_categorical = Pipeline(steps=[
        ('OneHot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', impute_numeric, num_cols),
            ('cat', encode_categorical, cat_cols),
        ],
        remainder='passthrough',
    )

    # Fit on the training split only; the other frames reuse the fitted state.
    train_te = preprocessor.fit_transform(df_train[selected_columns])
    test_te = preprocessor.transform(df_test[selected_columns])

    feature_names_out = preprocessor.get_feature_names_out()
    smoothing['feature names'] = feature_names_out

    print(f"All Columns {feature_names_out}")
    print(f"Train Transformed = {train_te}")

    def as_frame(values):
        # Wrap a transformed array with the transformer's output column names.
        return pd.DataFrame(values, columns=feature_names_out)

    df_train_final = as_frame(train_te)
    df_test_final = as_frame(test_te)
    kaggle_test_final = as_frame(preprocessor.transform(kaggle_test[selected_columns]))

    return df_train_final, df_test_final, kaggle_test_final

In [66]:
import itertools
from sklearn.preprocessing import LabelEncoder

def find_train_combinations(train, cat_cols, num_cols):
    """Append the selected pairwise interaction feature(s) to the training frame.

    Candidate interactions are the element-wise products ``col1 * col2`` of
    every pair drawn (in order) from ``cat_cols + num_cols``; each is named
    ``'<col1> x <col2>'``.  Only the interactions listed in ``new_cols`` below
    are actually computed and returned, so unrelated (for example string)
    columns are never multiplied.

    As a diagnostic, the mean absolute correlation of the numeric columns with
    the integer-coded ``class`` column is printed.  That mean includes the
    target's correlation with itself.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; must contain a ``class`` column and the columns that
        make up the selected interactions.
    cat_cols, num_cols : list of str
        Column names from which the candidate pairs are drawn.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``train`` followed by the selected
        interaction columns.  ``train`` itself is not modified.

    Raises
    ------
    KeyError
        If ``class`` is missing, or a selected interaction cannot be formed
        from the supplied column lists.
    """
    all_columns = cat_cols + num_cols

    # Integer-code the target on a scratch frame so the caller's `class`
    # column is never rewritten; sort=True gives codes in sorted label order.
    class_codes = pd.factorize(train['class'], sort=True)[0]
    encoded = train.assign(**{'class': class_codes})

    # numeric_only=True keeps this working when non-numeric columns are present.
    corr_matrix = encoded.corr(numeric_only=True)

    threshold = corr_matrix['class'].abs().mean()
    print(f" Mean Correlation of Original Data {threshold}")

    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)
    print(train.columns)

    # ? Interactions kept from the correlation screening (currently hard-coded)
    new_cols = ['habitat x cap-diameter']

    # Walk the candidate pairs by name only and multiply just the wanted ones;
    # building every product would be quadratic in the number of columns.
    products = {}
    for col1, col2 in itertools.combinations(filtered_cols, 2):
        name = f'{col1} x {col2}'
        if name in new_cols:
            products[name] = train[col1] * train[col2]

    missing = [name for name in new_cols if name not in products]
    if missing:
        raise KeyError(f"interaction column(s) not available from the given columns: {missing}")

    return train.assign(**{name: products[name] for name in new_cols})


In [67]:
def find_test_combinations(test, cat_cols, num_cols, train_columns):
    """Rebuild on ``test`` the interaction columns present in the training frame.

    Interaction columns follow the ``'<col1> x <col2>'`` naming and are the
    element-wise product of the two columns, with pairs drawn (in order) from
    ``cat_cols + num_cols``.  Only interactions that appear in
    ``train_columns`` and are not already in ``test`` are computed, so columns
    that are not part of a requested interaction are never multiplied.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame to extend; it is not modified.
    cat_cols, num_cols : list of str
        Column names from which the candidate pairs are drawn.
    train_columns : pandas.Index or sequence of str
        Columns of the training frame.  A ``'class'`` entry, if present, is
        dropped because the test set has no target column.

    Returns
    -------
    pandas.DataFrame
        ``test`` extended with the required interactions, restricted to and
        ordered like ``train_columns`` (without ``'class'``).

    Raises
    ------
    KeyError
        If a training column is neither in ``test`` nor derivable as an
        interaction of the supplied columns.
    """
    all_columns = cat_cols + num_cols

    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)

    # ? Remove 'class' feature from test set
    train_columns = pd.Index(train_columns).drop('class', errors='ignore')

    # Only interactions the training frame actually contains need to be built.
    needed = set(train_columns).difference(test.columns)

    products = {}
    for col1, col2 in itertools.combinations(filtered_cols, 2):
        name = f'{col1} x {col2}'
        if name in needed:
            products[name] = test[col1] * test[col2]

    return test.assign(**products)[train_columns]

In [68]:
def aggregate(df, reference=None):
    """Add group-mean features derived from categorical columns.

    Two columns are added:

    * ``cap-diameter-shape`` - mean ``cap-diameter`` of the row's ``cap-shape`` group
    * ``stem-root-height``   - mean ``stem-height`` of the row's ``stem-root`` group

    Rows whose grouping value is missing, or does not occur in the frame the
    means are computed from, get NaN for the corresponding feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; must contain the four source columns.
    reference : pandas.DataFrame, optional
        Frame from which the group means are computed.  Defaults to ``df``
        itself.  Passing the training frame here when transforming
        validation/test data gives every split the same value per category.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the two feature columns added.  ``df`` is not
        modified, which also avoids chained-assignment warnings when it is a
        slice of a larger frame.
    """
    stats_source = df if reference is None else reference
    out = df.copy()

    # (new feature, grouping column, averaged column)
    feature_specs = (
        ('cap-diameter-shape', 'cap-shape', 'cap-diameter'),
        ('stem-root-height', 'stem-root', 'stem-height'),
    )
    for new_col, group_col, value_col in feature_specs:
        group_means = stats_source.groupby(group_col)[value_col].mean()
        # Map each row's category to its group mean (NaN when there is none).
        out[new_col] = out[group_col].map(group_means)

    return out

In [69]:
def get_cols(df):
    """Split the columns of ``df`` into categorical and numeric names.

    Categorical columns are the object-dtype columns other than ``class``;
    numeric columns are all number-dtype columns.  Both lists are printed.

    Returns
    -------
    tuple of list
        ``(cat_cols, num_cols)`` in the frame's column order.
    """
    object_columns = df.select_dtypes('object').columns
    number_columns = df.select_dtypes('number').columns

    cat_cols = list(filter(lambda name: name != 'class', object_columns))
    num_cols = list(number_columns)

    print(f'Categorical columns:\n {cat_cols}\n')
    print(f'Numeric columns:\n {num_cols}')

    return cat_cols, num_cols

In [70]:
from sklearn.feature_selection import mutual_info_classif

def mutual_information(X, y):
    """Compute mutual-information scores between each feature and the target.

    The scores are sorted in descending order, stored in the module-level
    ``smoothing`` dict under ``'mutual information'`` and also returned so the
    caller does not have to go through the global dict.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Target labels.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by feature name, highest first.
    """
    # The estimator is randomised; a fixed seed makes the ranking reproducible
    # across runs.
    raw_scores = mutual_info_classif(X, y, random_state=42)

    mi_scores = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)

    smoothing['mutual information'] = mi_scores
    return mi_scores

In [71]:

def model(variable):
    """Run the full pipeline: load, clean, encode, train, evaluate, submit.

    Steps:

    1. Read ``train.csv`` / ``test.csv`` from the working directory and drop
       the ``id`` column.
    2. Hold out 20% of the training data for validation (fixed seed).
    3. Replace unexpected category codes with NaN, add the group-mean
       features, and encode the frames.
    4. Fill any remaining missing values with ``-10``.
    5. Fit an ``XGBClassifier``, reporting MCC on the validation split after
       every boosting round.
    6. Print the final validation MCC and write the predictions for the
       Kaggle test set to ``version_9_submission.csv`` (the layout comes from
       ``sample_submission.csv``).

    Intermediate frames and the final score are stored in the module-level
    ``smoothing`` dict, which must exist before this function is called.

    Parameters
    ----------
    variable : hashable
        Forwarded to ``encode_train_and_test_data`` and used as the key under
        which the validation MCC is stored in ``smoothing``.

    Returns
    -------
    None
    """
    train = pd.read_csv('train.csv').drop('id', axis = 1)
    test = pd.read_csv('test.csv').drop('id', axis = 1)

    y = train['class']
    X = train.drop('class', axis = 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    # Snapshots are copied: the cleaning steps below may work in place, and an
    # alias would silently stop being the "before" state.
    smoothing['X_train before'] = X_train.copy()

    cat_cols, num_cols = get_cols(train)

    X_train = replace_non_alpha_with_nan(X_train, cat_cols)
    X_test = replace_non_alpha_with_nan(X_test, cat_cols)
    test = replace_non_alpha_with_nan(test, cat_cols)

    smoothing['X_train'] = X_train.copy()

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    X_train = aggregate(X_train)
    X_test = aggregate(X_test)
    test = aggregate(test)

    # Column lists are recomputed because aggregate() added numeric features.
    cat_cols, num_cols = get_cols(X_train)

    print(f"X_train columns  {X_train.columns}")

    X_train, X_test, test = encode_train_and_test_data(X_train, y_train, X_test, test, num_cols, cat_cols, variable)

    smoothing['X_train after encoding'] = X_train

    def handle_missing_data(df_transformed):
        # Replace whatever is still missing with a sentinel value and report
        # the remaining per-column null counts.
        df_transformed = df_transformed.fillna(-10)

        print("Missing values after imputation:")
        print(df_transformed.isnull().sum())
        return df_transformed

    X_train = handle_missing_data(X_train)
    X_test = handle_missing_data(X_test)
    test = handle_missing_data(test)

    def mcc(y_true, y_prob):
        # sklearn-style custom metric (y_true, y_pred); the function name is
        # what appears in the training log.  Scores are thresholded at 0.5.
        return matthews_corrcoef(y_true, (y_prob > 0.5).astype(int))

    classifier = XGBClassifier(
        colsample_bytree = 0.6,
        max_depth = 14,
        min_child_weight = 7,
        random_state = 42,
        n_estimators = 200,
        # * Monitoring only - no early stopping is configured, so this does
        # * not affect model training.  Passed here rather than to fit(),
        # * where the argument is deprecated.
        eval_metric = mcc,
    )
    XGB = classifier.fit(
        X_train,
        y_train,
        eval_set = [(X_test, y_test)],
    )

    y_pred = XGB.predict(X_test)
    score = matthews_corrcoef(y_test, y_pred)
    print('MCC', score)
    smoothing[variable] = score

    # predict() returns encoded class labels (not probabilities); map them
    # back to the original label values for the submission file.
    test_pred_encoded = XGB.predict(test)
    test_pred_class = le.inverse_transform(test_pred_encoded)
    submission = pd.read_csv('sample_submission.csv')

    submission['class'] = test_pred_class
    submission.to_csv('version_9_submission.csv', index = False)

In [72]:
# Scratch check: 21 evenly spaced values from 0 to 10, i.e. a step of 0.5.
# NOTE(review): presumably a candidate grid for the `variable` argument of model() - confirm.
np.linspace(0,10,21)

array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,
        5.5,  6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. ])

In [73]:
# Module-level scratch dict. The functions defined above write intermediate
# frames, feature names and scores into it, so it has to exist before
# model() is called.
smoothing = {}

# for i in np.linspace(5,20,4):
#     model(i)
# model()
# Single run; the argument is also the key under which the validation MCC
# is stored in `smoothing`.
model(9.0)

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']
Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width', 'cap-diameter-shape']
X_train columns  Index(['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', '



[0]	validation_0-logloss:0.46278	validation_0-mcc:0.93474
[1]	validation_0-logloss:0.33116	validation_0-mcc:0.96542
[2]	validation_0-logloss:0.24195	validation_0-mcc:0.97427
[3]	validation_0-logloss:0.18825	validation_0-mcc:0.97719
[4]	validation_0-logloss:0.15418	validation_0-mcc:0.97782
[5]	validation_0-logloss:0.12243	validation_0-mcc:0.97908
[6]	validation_0-logloss:0.10474	validation_0-mcc:0.97965
[7]	validation_0-logloss:0.09019	validation_0-mcc:0.97992
[8]	validation_0-logloss:0.07843	validation_0-mcc:0.98040
[9]	validation_0-logloss:0.06799	validation_0-mcc:0.98087
[10]	validation_0-logloss:0.06224	validation_0-mcc:0.98110
[11]	validation_0-logloss:0.05635	validation_0-mcc:0.98142
[12]	validation_0-logloss:0.05212	validation_0-mcc:0.98166
[13]	validation_0-logloss:0.05007	validation_0-mcc:0.98190
[14]	validation_0-logloss:0.04663	validation_0-mcc:0.98212
[15]	validation_0-logloss:0.04540	validation_0-mcc:0.98231
[16]	validation_0-logloss:0.04330	validation_0-mcc:0.98262
[17]	va

KeyboardInterrupt: 

In [None]:
# * One Hot Encoding
# * 0.9840663425977435
# * with stem-area MCC 0.9839074927182325
# * with stem-area + cap-surface-shape

In [None]:

# ? Notes
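# No difference was observed between fit_transform and transform on the test data during preprocessing,
# but transform is the correct call: fit_transform would re-fit the imputer/encoder on the test data
# instead of reusing what was learned from the training data.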

In [None]:
# smoothing['mutual information'].reset_index().to_csv('mutual information')

KeyError: 'mutual information'