Feature Engineering Multiplying categorical variables together

Testing different imputing methods

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
def replace_non_alpha_with_nan(df):
    cols_to_filter = ['cap-shape', 'cap-surface', 'cap-color', 
                      'does-bruise-or-bleed', 'gill-attachment', 
                      'gill-spacing', 'gill-color', 'stem-surface', 
                      'stem-color', 'has-ring', 'ring-type', 'habitat', 'stem-root', 'veil-type', 'veil-color', 'spore-print-color']

    alphabet_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 
                     'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

    col_values = {}
    for col in cols_to_filter:
        value_counts = train[col].value_counts()
        col_values[col] = value_counts[value_counts > 10].index.values.tolist()


    def filter_alpha(value, value_list_no_outliers):
        if isinstance(value, str):
            return value if len(value) == 1 and value in value_list_no_outliers and value in alphabet_list else np.nan # if value is a single character
        
        return np.nan
    
    for col in cols_to_filter:
        df[col] = df[col].apply(lambda x : filter_alpha(x, col_values[col]))

    return df

In [6]:
train = replace_non_alpha_with_nan(train)
test = replace_non_alpha_with_nan(test)

In [7]:
cat_cols = [col for col in train.select_dtypes('object').columns if col != 'class']
num_cols = [col for col in train.select_dtypes('number').columns]
print(f'Categorical columns:\n {cat_cols}\n')
print(f'Numeric columns:\n {num_cols}')

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

def encode_data(df, num_cols, cat_cols):

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors = 3))
    ])

    categorical_transformer = Pipeline(steps = [
        # ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ]
    )

    df_transformed = pd.DataFrame(preprocessor.fit_transform(df[num_cols + cat_cols]), columns = num_cols + cat_cols)

    df_final = df[['class']].join(df_transformed)

    return df_final

In [9]:
train = encode_data(train, num_cols, cat_cols)

In [10]:
def encode_data(df, num_cols, cat_cols):

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors=3))
    ])

    categorical_transformer = Pipeline(steps = [
        # ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1))
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ]
    )

    df_transformed = pd.DataFrame(preprocessor.fit_transform(df[num_cols + cat_cols]), columns = num_cols + cat_cols)

    df_final = df_transformed
    return df_final

In [11]:
test = encode_data(test, num_cols, cat_cols)

In [12]:
import itertools
from sklearn.preprocessing import LabelEncoder

def find_train_combinations(train, cat_cols, num_cols):
    
    all_columns = cat_cols + num_cols

    # ? returning features from train_combinations with correlations greater than the mean of the original
    
    ord_enc = LabelEncoder()
    train['class'] = ord_enc.fit_transform(train['class'])

    corr_matrix = train.corr()
    
    threshold = abs(corr_matrix['class']).sort_values(ascending=False).mean()
    print(f" Mean Correlation of Original Data {threshold}")


    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)
    print(train.columns)
    combinations = itertools.combinations(filtered_cols, 2)
    print(combinations)

    train_combinations = train['class'].to_frame()

    for col1, col2 in combinations:
       combination = train[col1] * train[col2]
       train_combinations = train_combinations.join(combination.rename(f'{col1} x {col2}'))
    
    # ? returning features from train_combinations with correlations greater than the mean of the original

    # corr_combinations = train_combinations.corr()
    # abs_values = abs(corr_combinations['class'])
    # new_cols = abs_values.loc[abs_values > threshold].index.tolist()
    # if 'class' in new_cols:
    #     new_cols.remove('class')
    new_cols = ['habitat x cap-diameter']
    
    
    train['class'] = ord_enc.inverse_transform(train['class'])
    return train.join(train_combinations[new_cols])


In [13]:
train = find_train_combinations(train, cat_cols, num_cols) # ! Change train_new back to train after testing

 Mean Correlation of Original Data 0.15629186050265248
['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season', 'cap-diameter', 'stem-height', 'stem-width']
Index(['class', 'cap-diameter', 'stem-height', 'stem-width', 'cap-shape',
       'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment',
       'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')
<itertools.combinations object at 0x000001B3926431A0>


In [14]:
def find_test_combinations(test, cat_cols, num_cols, train_columns):

    all_columns = cat_cols + num_cols

    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)
    combinations = itertools.combinations(filtered_cols, 2)

    test_combinations = pd.DataFrame(index = test.index)

    for col1, col2 in combinations:
       combination = test[col1] * test[col2]
       test_combinations = test_combinations.join(combination.rename(f'{col1} x {col2}'))
    
    # ? Remove 'class' feature from test set
    train_columns = train_columns.drop('class')

    test = test.join(test_combinations)

    return test[train_columns]


test = find_test_combinations(test, cat_cols, num_cols, train.columns)

['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season', 'cap-diameter', 'stem-height', 'stem-width']


In [15]:
def handle_missing_data(df_transformed):
    
    df_transformed = df_transformed.fillna(-10)

    print("Missing values after imputation:")
    print(df_transformed.isnull().sum())
    return df_transformed

In [16]:
train = handle_missing_data(train)
test = handle_missing_data(test)

Missing values after imputation:
class                     0
cap-diameter              0
stem-height               0
stem-width                0
cap-shape                 0
cap-surface               0
cap-color                 0
does-bruise-or-bleed      0
gill-attachment           0
gill-spacing              0
gill-color                0
stem-root                 0
stem-surface              0
stem-color                0
veil-type                 0
veil-color                0
has-ring                  0
ring-type                 0
spore-print-color         0
habitat                   0
season                    0
habitat x cap-diameter    0
dtype: int64
Missing values after imputation:
cap-diameter              0
stem-height               0
stem-width                0
cap-shape                 0
cap-surface               0
cap-color                 0
does-bruise-or-bleed      0
gill-attachment           0
gill-spacing              0
gill-color                0
stem-root                

In [17]:
test

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,habitat x cap-diameter
0,8.64,11.13,17.12,18.0,-10.0,12.0,2.0,-10.0,-10.0,18.0,0.0,-10.0,18.0,0.0,9.0,7.0,6.0,-10.0,3.0,0.0,25.92
1,6.90,1.27,10.75,11.0,17.0,13.0,1.0,-10.0,2.0,20.0,-10.0,-10.0,11.0,-10.0,-10.0,1.0,5.0,-10.0,3.0,0.0,20.70
2,2.00,6.18,3.14,1.0,6.0,12.0,1.0,-10.0,2.0,11.0,-10.0,-10.0,11.0,-10.0,-10.0,1.0,5.0,-10.0,3.0,1.0,6.00
3,3.47,4.98,8.51,18.0,17.0,12.0,1.0,15.0,2.0,11.0,-10.0,-10.0,18.0,-10.0,3.0,7.0,20.0,-10.0,3.0,2.0,10.41
4,6.17,6.73,13.70,18.0,7.0,21.0,1.0,14.0,-10.0,20.0,-10.0,-10.0,20.0,-10.0,10.0,7.0,-10.0,-10.0,3.0,2.0,18.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2077959,0.88,2.67,1.35,18.0,6.0,19.0,1.0,0.0,3.0,18.0,-10.0,-10.0,3.0,-10.0,-10.0,1.0,5.0,-10.0,3.0,2.0,2.64
2077960,3.12,2.69,7.38,18.0,16.0,19.0,1.0,3.0,2.0,18.0,-10.0,-10.0,18.0,-10.0,-10.0,1.0,5.0,-10.0,6.0,0.0,18.72
2077961,5.73,6.16,9.74,18.0,4.0,4.0,1.0,0.0,-10.0,18.0,-10.0,-10.0,20.0,-10.0,9.0,7.0,20.0,-10.0,3.0,0.0,17.19
2077962,5.03,6.00,3.46,1.0,6.0,12.0,1.0,0.0,3.0,6.0,-10.0,15.0,5.0,-10.0,-10.0,1.0,5.0,-10.0,3.0,0.0,15.09


In [18]:
train

Unnamed: 0,class,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season,habitat x cap-diameter
0,e,8.80,4.51,15.39,5.0,16.0,18.0,1.0,0.0,2.0,18.0,-10.0,-10.0,18.0,-10.0,-10.0,1.0,5.0,-10.0,3.0,0.0,26.40
1,p,4.51,4.79,6.48,18.0,7.0,13.0,1.0,0.0,2.0,11.0,-10.0,20.0,12.0,-10.0,-10.0,7.0,20.0,-10.0,3.0,3.0,13.53
2,e,6.94,6.85,9.93,5.0,16.0,1.0,1.0,19.0,2.0,18.0,-10.0,15.0,11.0,-10.0,-10.0,1.0,5.0,-10.0,9.0,3.0,62.46
3,e,3.88,4.16,6.53,5.0,21.0,6.0,1.0,15.0,-10.0,6.0,-10.0,-10.0,18.0,-10.0,-10.0,1.0,5.0,-10.0,3.0,2.0,11.64
4,e,5.85,3.37,8.36,18.0,10.0,19.0,1.0,3.0,-10.0,18.0,-10.0,-10.0,18.0,-10.0,-10.0,1.0,5.0,-10.0,6.0,0.0,35.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,e,9.29,12.14,18.81,5.0,-10.0,12.0,2.0,-10.0,-10.0,18.0,0.0,-10.0,18.0,0.0,9.0,7.0,6.0,-10.0,3.0,2.0,27.87
3116941,e,10.88,6.65,26.97,14.0,-10.0,19.0,2.0,3.0,2.0,13.0,-10.0,-10.0,18.0,-10.0,-10.0,1.0,5.0,-10.0,3.0,2.0,32.64
3116942,p,7.82,9.51,11.06,18.0,4.0,4.0,1.0,0.0,-10.0,18.0,-10.0,-10.0,20.0,-10.0,9.0,7.0,20.0,-10.0,3.0,0.0,23.46
3116943,e,9.45,9.13,17.77,12.0,8.0,12.0,2.0,4.0,-10.0,13.0,-10.0,20.0,18.0,-10.0,-10.0,7.0,12.0,-10.0,3.0,2.0,28.35


In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['class'] = le.fit_transform(train['class'])

In [20]:
y = train['class']
X = train.drop('class', axis = 1)

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [22]:
from sklearn.metrics import matthews_corrcoef

def mcc_metric(y_pred, dmatrix):
    y_true = dmatrix.get_label()
    y_pred = (y_pred > 0.5).astype(int) 
    mcc = matthews_corrcoef(y_true, y_pred)
    return 'mcc', mcc

In [23]:
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier
import gc

model = XGBClassifier(

    colsample_bytree = 0.6,
    max_depth = 14,
    min_child_weight = 7,
    random_state = 42,
    n_estimators = 200,
)

In [24]:
XGB = model.fit(
    X_train, 
    y_train, 
    eval_set = [(X_test, y_test)],
    eval_metric = mcc_metric)



[0]	validation_0-logloss:0.48047	validation_0-mcc:0.88345
[1]	validation_0-logloss:0.34029	validation_0-mcc:0.95509
[2]	validation_0-logloss:0.25212	validation_0-mcc:0.97165
[3]	validation_0-logloss:0.18999	validation_0-mcc:0.97681
[4]	validation_0-logloss:0.14579	validation_0-mcc:0.97960
[5]	validation_0-logloss:0.11553	validation_0-mcc:0.98061
[6]	validation_0-logloss:0.09405	validation_0-mcc:0.98116
[7]	validation_0-logloss:0.07948	validation_0-mcc:0.98154
[8]	validation_0-logloss:0.06783	validation_0-mcc:0.98188
[9]	validation_0-logloss:0.06033	validation_0-mcc:0.98204
[10]	validation_0-logloss:0.05573	validation_0-mcc:0.98210
[11]	validation_0-logloss:0.05049	validation_0-mcc:0.98244
[12]	validation_0-logloss:0.04753	validation_0-mcc:0.98257
[13]	validation_0-logloss:0.04544	validation_0-mcc:0.98263
[14]	validation_0-logloss:0.04371	validation_0-mcc:0.98274
[15]	validation_0-logloss:0.04173	validation_0-mcc:0.98294
[16]	validation_0-logloss:0.04098	validation_0-mcc:0.98299
[17]	va

In [25]:
y_pred = XGB.predict(X_test)

In [26]:
score = matthews_corrcoef(y_test, y_pred)
print('MCC', score)

MCC 0.983700368604822


In [27]:
test_pred_prob = XGB.predict(test)

In [28]:
test_pred_class = le.inverse_transform(test_pred_prob)

In [29]:
submission = pd.read_csv('sample_submission.csv')

submission['class'] = test_pred_class

In [30]:
submission.to_csv('submission.csv', index = False)