Target Encoding

In [283]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier
import gc

pd.set_option('display.max_columns', None)



In [284]:
def replace_non_alpha_with_nan(df, categories):
    """Replace categorical values outside a per-feature whitelist with NaN.

    Each column named in ``categories`` is matched to its whitelist *by name*
    (dataset columns are hyphenated, e.g. ``'cap-shape'``; whitelist keys use
    underscores, e.g. ``'cap_shape'``). The previous implementation paired
    columns and whitelists by position, which silently applied the wrong
    whitelist whenever ``categories`` was reordered or was only a subset.

    Args:
        df: DataFrame to clean. It is modified in place.
        categories: Iterable of column names in ``df`` to filter.

    Returns:
        The same ``df`` object, with every value that is not in the column's
        whitelist (including already-missing values) set to ``np.nan``.
        Columns without a whitelist entry are left untouched.
    """
    # * Customized feature engineering: valid single-letter codes per feature
    features_dict = {
        'cap_shape': ['x', 'f', 's', 'b', 'o', 'p', 'c'],
        'cap_surface': ['t', 's', 'y', 'h', 'g', 'd', 'k', 'e', 'i', 'w', 'l'],
        'cap_color': ['n', 'y', 'w', 'g', 'e', 'o', 'p', 'r', 'u', 'b', 'k', 'l'],
        'does_bruise_or_bleed': ['f', 't'],
        'gill_attachment': ['a', 'd', 'x', 'e', 's', 'p', 'f'],
        'gill_spacing': ['c', 'd', 'f'],
        'gill_color': ['w', 'n', 'y', 'p', 'g', 'o', 'k', 'f', 'r', 'e', 'b', 'u'],
        'stem_root': ['b', 's', 'r', 'c', 'f'],
        'stem_surface': ['s', 'y', 'i', 't', 'g', 'k', 'h', 'f'],
        'stem_color': ['w', 'n', 'y', 'g', 'o', 'e', 'u', 'p', 'k', 'r', 'l', 'b', 'f'],
        'veil_type': ['u'],
        'veil_color': ['w', 'y', 'n', 'u', 'k', 'e'],
        'has_ring': ['f', 't'],
        'ring_type': ['f', 'e', 'z', 'l', 'r', 'p', 'g', 'm'],
        'spore_print_color': ['k', 'p', 'w', 'n', 'r', 'u', 'g'],
        'habitat': ['d', 'g', 'l', 'm', 'h', 'w', 'p', 'u'],
        'season': ['a', 'u', 'w', 's'],
    }

    for col in categories:
        allowed = features_dict.get(str(col).replace('-', '_'))
        if allowed is None:
            # No whitelist is known for this column, so do not guess one.
            continue
        df.loc[~df[col].isin(allowed), col] = np.nan

    return df

In [285]:
def encode_train_and_test_data(df_train, target, df_test, kaggle_test, num_cols, cat_cols, variable):
    """Impute numeric columns and pass categorical columns through unchanged.

    A single ColumnTransformer is fitted on the training frame and then
    applied to the hold-out and Kaggle test frames. Numeric columns go through
    a 3-nearest-neighbour imputer; categorical columns go through an identity
    step (the target / ordinal / one-hot encoders are currently disabled).

    Args:
        df_train: Training features; the preprocessor is fitted on this frame.
        target: Currently unused (only needed by the disabled target encoder).
        df_test: Hold-out features, transformed with the fitted preprocessor.
        kaggle_test: Kaggle test features, transformed the same way.
        num_cols: List of numeric column names.
        cat_cols: List of categorical column names.
        variable: Currently unused (was the smoothing value of the disabled
            target encoder).

    Returns:
        Tuple ``(train, holdout, kaggle)`` of DataFrames whose columns are
        ``num_cols + cat_cols`` and whose numeric columns are float64. The
        frames are rebuilt from arrays, so they carry a fresh default index.

    Side effects:
        Stores the raw training frame under ``smoothing['info']`` and the
        transformed one under ``smoothing['info2']``; prints the transformed
        training array.
    """
    smoothing['info'] = df_train

    # * Output column order of the transformer: numeric block first, then categorical
    all_columns = num_cols + cat_cols

    def _identity(values):
        # * Adjust function: placeholder step that leaves categorical data as-is
        return values

    def _as_frame(matrix):
        # Rebuild a labelled frame from the transformer output and restore
        # float64 on the numeric columns.
        frame = pd.DataFrame(matrix, columns = all_columns)
        frame[num_cols] = frame[num_cols].astype('float64')
        return frame

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', Pipeline(steps = [('imputer', KNNImputer(n_neighbors = 3))]), num_cols),
            ('cat', Pipeline(steps = [('adjust', FunctionTransformer(_identity))]), cat_cols),
        ],
        remainder = 'passthrough',
    )

    # Fit on the training split only; the other frames reuse the fitted state.
    train_te = preprocessor.fit_transform(df_train[all_columns])
    test_te = preprocessor.transform(df_test[all_columns])

    print(f"Train Transformed = {train_te}")

    df_train_final = _as_frame(train_te)
    df_test_final = _as_frame(test_te)

    smoothing['info2'] = df_train_final

    kaggle_test_final = _as_frame(preprocessor.transform(kaggle_test[all_columns]))

    return df_train_final, df_test_final, kaggle_test_final

In [286]:
import itertools
from sklearn.preprocessing import LabelEncoder

def find_train_combinations(train, cat_cols, num_cols):
    """Append pairwise product features to the training frame.

    The ``class`` column is label-encoded temporarily so it can take part in
    the correlation matrix, and is decoded again before returning. Every pair
    of feature columns is multiplied element-wise, but only the hard-coded
    selection ``'habitat x cap-diameter'`` is joined onto the result (the
    correlation-threshold selection is currently disabled).

    Args:
        train: Training frame including a ``class`` column. Its ``class``
            column is overwritten and then restored in place.
        cat_cols: List of categorical column names.
        num_cols: List of numeric column names.

    Returns:
        ``train`` joined with the selected product column(s).
    """
    candidate_cols = [name for name in cat_cols + num_cols if name != 'class']

    # ? The mean absolute correlation with the target is the selection threshold
    label_codec = LabelEncoder()
    train['class'] = label_codec.fit_transform(train['class'])

    class_corr = train.corr()['class']
    threshold = abs(class_corr).sort_values(ascending=False).mean()
    print(f" Mean Correlation of Original Data {threshold}")

    print(candidate_cols)
    print(train.columns)
    pairs = itertools.combinations(candidate_cols, 2)
    print(pairs)

    # Start from the target so the products can later be correlated against it.
    products = train['class'].to_frame()
    for left, right in pairs:
        interaction = (train[left] * train[right]).rename(f'{left} x {right}')
        products = products.join(interaction)

    # ? Threshold-based selection is switched off; a fixed feature is used instead
    new_cols = ['habitat x cap-diameter']

    # Restore the original class labels before handing the frame back.
    train['class'] = label_codec.inverse_transform(train['class'])
    return train.join(products[new_cols])


In [287]:
def find_test_combinations(test, cat_cols, num_cols, train_columns):
    """Rebuild the training-time product features on a test frame.

    Args:
        test: Test frame containing every column in ``cat_cols + num_cols``.
        cat_cols: List of categorical column names.
        num_cols: List of numeric column names.
        train_columns: Column index of the engineered training frame; it must
            contain ``'class'``, which is dropped because the test set has no
            target.

    Returns:
        ``test`` extended with all pairwise products, restricted to and
        ordered like ``train_columns`` without ``'class'``.
    """
    feature_names = [name for name in cat_cols + num_cols if name != 'class']
    print(feature_names)

    interactions = pd.DataFrame(index = test.index)
    for left, right in itertools.combinations(feature_names, 2):
        product = (test[left] * test[right]).rename(f'{left} x {right}')
        interactions = interactions.join(product)

    # ? Remove 'class' feature from test set
    wanted = train_columns.drop('class')

    return test.join(interactions)[wanted]

In [288]:
def aggregate(df):
    """Add group-mean features to ``df`` in place and return it.

    Adds:
        cap-diameter-shape: mean ``cap-diameter`` within each ``cap-shape``.
        stem-root-height: mean ``stem-height`` within each ``stem-root``.
    """
    # (new column, grouping column, averaged column)
    group_means = (
        ('cap-diameter-shape', 'cap-shape', 'cap-diameter'),
        ('stem-root-height', 'stem-root', 'stem-height'),
    )

    for new_col, key_col, value_col in group_means:
        df[new_col] = df.groupby([key_col])[value_col].transform('mean')

    return df

In [289]:
def get_cols(df):
    """Split the columns of ``df`` into categorical and numeric name lists.

    Only columns with pandas ``category`` dtype count as categorical, and the
    target column ``class`` is excluded from that list. Both lists are printed.

    Returns:
        Tuple ``(cat_cols, num_cols)`` of column-name lists.
    """
    categorical = df.select_dtypes('category').columns
    numeric = df.select_dtypes('number').columns

    cat_cols = [name for name in categorical if name != 'class']
    num_cols = list(numeric)

    print(f'Categorical columns:\n {cat_cols}\n')
    print(f'Numeric columns:\n {num_cols}')

    return cat_cols, num_cols

In [290]:
from sklearn.feature_selection import mutual_info_classif

def mutual_information(X, y):
    """Rank features by mutual information with the target.

    The scores are stored, sorted from highest to lowest, in the module-level
    ``smoothing`` dict under ``'mutual information'``. Nothing is returned.

    Args:
        X: Feature DataFrame; its column names label the scores.
        y: Target values passed to ``mutual_info_classif``.
    """
    ranked = pd.Series(
        mutual_info_classif(X, y),
        name = "MI Scores",
        index = X.columns,
    ).sort_values(ascending = False)

    smoothing['mutual information'] = ranked

In [291]:
def clean_cols(df):
    """Convert every object-dtype column of ``df`` to ``category`` in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    object_cols = [name for name in df.columns if df[name].dtype == 'object']

    for name in object_cols:
        df[name] = df[name].astype('category')

    return df

In [292]:
from sklearn.impute import SimpleImputer

def handle_missing_data(df):
    """Fill missing categorical values with the literal string 'missing'.

    All ``category`` and ``object`` columns are imputed with a constant; the
    numeric columns are not touched here.

    Returns:
        The same ``df`` object, modified in place. The per-column null counts
        are printed before returning.
    """
    text_cols = df.select_dtypes(include = ['category', 'object']).columns

    constant_filler = SimpleImputer(strategy='constant', fill_value = 'missing')
    df[text_cols] = constant_filler.fit_transform(df[text_cols])

    # * KNN Imputer on num_cols used during encoding function
    print(df.isnull().sum())
    return df
    

In [293]:

def model(variable):
    """Train the XGBoost mushroom classifier, score it and write a submission.

    Pipeline: read ``train.csv`` / ``test.csv``, drop ``id``, convert object
    columns to ``category``, split 80/20, blank out unexpected category codes,
    label-encode the target, impute (KNN for numeric, constant ``'missing'``
    for categorical), fit an ``XGBClassifier`` with native categorical
    support, report the Matthews correlation coefficient on the hold-out
    split, and write predictions for the Kaggle test set to
    ``version_9_submission.csv`` (based on ``sample_submission.csv``).

    Args:
        variable: Experiment tag. It is forwarded to
            ``encode_train_and_test_data`` and used as the key under which the
            hold-out MCC is stored in ``smoothing``.

    Side effects:
        Reads and writes CSV files in the working directory, prints progress
        information and stores intermediate frames in the module-level
        ``smoothing`` dict for later inspection.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder

    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    train = train.drop('id', axis = 1)
    test = test.drop('id', axis = 1)

    train = clean_cols(train)
    test = clean_cols(test)

    print(train.info())

    y = train['class']
    X = train.drop('class', axis = 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    # Store a copy: the filtering below mutates X_train in place, so keeping a
    # reference would make this "before" snapshot identical to the "after" one.
    smoothing['X_train before'] = X_train.copy()

    cat_cols, num_cols = get_cols(train)

    X_train = replace_non_alpha_with_nan(X_train, cat_cols)
    X_test = replace_non_alpha_with_nan(X_test, cat_cols)
    test = replace_non_alpha_with_nan(test, cat_cols)

    smoothing['X_train'] = X_train

    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    cat_cols, num_cols = get_cols(X_train)

    print(f"X_train columns  {X_train.columns}")
    smoothing['cat_cols'] = cat_cols
    smoothing['num_cols'] = num_cols

    X_train, X_test, test = encode_train_and_test_data(X_train, y_train, X_test,  test, num_cols, cat_cols, variable)

    smoothing['X_train after encoding'] = X_train

    smoothing['cat_cols2'] = cat_cols
    smoothing['num_cols2'] = num_cols

    X_train = handle_missing_data(X_train)
    X_test = handle_missing_data(X_test)
    test = handle_missing_data(test)

    smoothing['X_train imputed'] = X_train
    smoothing['X_test imputed'] = X_test
    smoothing['test imputed'] = test

    X_train = clean_cols(X_train)
    X_test = clean_cols(X_test)
    test = clean_cols(test)

    # Each frame was converted to `category` independently, so the integer
    # codes behind a label can differ whenever the sets of observed labels
    # differ. Re-cast the evaluation frames to the training dtypes so a label
    # always maps to the same code; labels unseen in training become NaN.
    for col in cat_cols:
        train_dtype = X_train[col].dtype
        if isinstance(train_dtype, pd.CategoricalDtype):
            X_test[col] = X_test[col].astype(train_dtype)
            test[col] = test[col].astype(train_dtype)

    smoothing['X_train final'] = X_train
    smoothing['X_test final'] = X_test
    smoothing['test final'] = test

    def mcc_metric(y_true, y_pred):
        # * Only Visual does not affect model training
        # scikit-learn style signature, as expected for a callable
        # `eval_metric` given to the estimator constructor; y_pred holds
        # positive-class probabilities, thresholded at 0.5.
        return matthews_corrcoef(y_true, (y_pred > 0.5).astype(int))

    classifier = XGBClassifier(
        device = 'cuda',
        colsample_bytree = 0.6,
        max_depth = 14,
        min_child_weight = 7,
        random_state = 42,
        n_estimators = 200,
        enable_categorical = True,
        # `eval_metric` belongs on the estimator: passing it to fit() was
        # deprecated and is rejected by recent XGBoost releases.
        eval_metric = mcc_metric,
    )
    XGB = classifier.fit(
        X_train,
        y_train,
        eval_set = [(X_test, y_test)],
    )

    y_pred = XGB.predict(X_test)
    score = matthews_corrcoef(y_test, y_pred)
    print('MCC', score)
    smoothing[variable] = score

    # predict() returns encoded class labels (not probabilities); map them
    # back to the original class names for the submission file.
    test_pred_encoded = XGB.predict(test)
    test_pred_class = le.inverse_transform(test_pred_encoded)
    submission = pd.read_csv('sample_submission.csv')

    submission['class'] = test_pred_class
    submission.to_csv('version_9_submission.csv', index = False)

In [294]:
# Scratch cell: 21 evenly spaced values from 0 to 10 (step 0.5).
# NOTE(review): presumably candidate values for model()'s `variable` — confirm.
np.linspace(0,10,21)

array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,
        5.5,  6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. ])

In [295]:
# Module-level debug store: the functions above write intermediate frames,
# column lists and the hold-out MCC (keyed by `variable`) into this dict.
# It must exist before model() is called.
smoothing = {}

# for i in np.linspace(5,20,4):
#     model(i)
# model()
# Single run; the argument is only used as the experiment tag / result key.
model(9.0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype   
---  ------                -----   
 0   class                 category
 1   cap-diameter          float64 
 2   cap-shape             category
 3   cap-surface           category
 4   cap-color             category
 5   does-bruise-or-bleed  category
 6   gill-attachment       category
 7   gill-spacing          category
 8   gill-color            category
 9   stem-height           float64 
 10  stem-width            float64 
 11  stem-root             category
 12  stem-surface          category
 13  stem-color            category
 14  veil-type             category
 15  veil-color            category
 16  has-ring              category
 17  ring-type             category
 18  spore-print-color     category
 19  habitat               category
 20  season                category
dtypes: category(18), float64(3)
memory usage: 124.9 MB
Non



AttributeError: `np.NaN` was removed in the NumPy 2.0 release. Use `np.nan` instead.

In [None]:
# * One Hot Encoding
# * 0.9840663425977435
# * with stem-area MCC 0.9839074927182325
# * with stem-area + cap-surface-shape

In [None]:

# ? Notes
# No difference between fit_transform and transform for test data in preprocessing, but transform is probably the recommended call

In [None]:
# smoothing['mutual information'].reset_index().to_csv('mutual information')

KeyError: 'mutual information'

In [None]:
# Inspect the training frame captured on entry to encode_train_and_test_data.
smoothing['info'].info()

<class 'pandas.core.frame.DataFrame'>
Index: 2493556 entries, 1252551 to 2219110
Data columns (total 20 columns):
 #   Column                Dtype   
---  ------                -----   
 0   cap-diameter          float64 
 1   cap-shape             category
 2   cap-surface           category
 3   cap-color             category
 4   does-bruise-or-bleed  category
 5   gill-attachment       category
 6   gill-spacing          category
 7   gill-color            category
 8   stem-height           float64 
 9   stem-width            float64 
 10  stem-root             category
 11  stem-surface          category
 12  stem-color            category
 13  veil-type             category
 14  veil-color            category
 15  has-ring              category
 16  ring-type             category
 17  spore-print-color     category
 18  habitat               category
 19  season                category
dtypes: category(17), float64(3)
memory usage: 116.6 MB


In [None]:
# Show any rows of the encoded training frame that still contain missing values.
df = smoothing['X_train after encoding']
df[df.isna().any(axis = 1)]

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season


In [None]:
# Display the hold-out features as stored by model() after imputation and category conversion.
smoothing['X_test final']

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,7.82,4.49,13.30,s,t,r,t,d,missing,g,missing,t,g,missing,missing,f,f,missing,d,a
1,3.20,3.76,17.16,o,missing,g,f,f,f,f,missing,g,n,missing,missing,f,f,missing,d,s
2,9.36,10.12,22.83,f,missing,n,t,missing,missing,w,b,missing,w,missing,w,t,g,missing,d,a
3,6.52,5.82,8.03,s,d,e,t,d,c,w,missing,missing,e,missing,missing,f,f,missing,d,a
4,3.13,9.92,6.00,b,t,n,f,missing,missing,n,missing,missing,w,missing,missing,t,missing,k,g,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623384,2.39,6.68,2.29,x,g,e,f,a,missing,p,missing,missing,w,missing,missing,f,f,missing,d,a
623385,8.53,6.77,24.69,f,y,y,f,missing,missing,w,b,missing,n,missing,w,t,e,missing,d,a
623386,1.50,3.88,1.36,x,k,n,f,missing,missing,n,missing,missing,n,missing,missing,f,f,missing,d,w
623387,8.26,8.33,16.13,x,h,y,t,x,c,y,missing,missing,w,missing,missing,f,f,missing,d,a


In [None]:
# Display the Kaggle test features as stored by model() after imputation and category conversion.
smoothing['test final']

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.64,11.13,17.12,x,missing,n,t,missing,missing,w,b,missing,w,u,w,t,g,missing,d,a
1,6.90,1.27,10.75,o,t,o,f,missing,c,y,missing,missing,n,missing,missing,f,f,missing,d,a
2,2.00,6.18,3.14,b,g,n,f,missing,c,n,missing,missing,n,missing,missing,f,f,missing,d,s
3,3.47,4.98,8.51,x,t,n,f,s,c,n,missing,missing,w,missing,n,t,z,missing,d,u
4,6.17,6.73,13.70,x,h,y,f,p,missing,y,missing,missing,y,missing,y,t,missing,missing,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,0.88,2.67,1.35,x,g,w,f,a,d,w,missing,missing,e,missing,missing,f,f,missing,d,u
2077960,3.12,2.69,7.38,x,s,w,f,d,c,w,missing,missing,w,missing,missing,f,f,missing,g,a
2077961,5.73,6.16,9.74,x,e,e,f,a,missing,w,missing,missing,y,missing,w,t,z,missing,d,a
2077962,5.03,6.00,3.46,b,g,n,f,a,d,g,missing,s,g,missing,missing,f,f,missing,d,a
