Feature Engineering: multiply features together to create new combinations.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Remove the 'id' column from both frames so it is not treated as a feature.
train = train.drop(columns = 'id')
test = test.drop(columns = 'id')

In [5]:
# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [6]:
def replace_non_alpha_with_nan(df, reference = None, min_count = 10):
    """Replace noisy categorical values with NaN.

    For each categorical column listed below, a value is kept only if it is
    a single lowercase ASCII letter AND it occurs more than ``min_count``
    times in the same column of ``reference``. Every other value (multi-
    character strings, digits, rare letters, non-strings, missing values)
    becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned. It must
        contain every column in ``cols_to_filter``.
    reference : pandas.DataFrame, optional
        Frame whose value counts decide which categories are frequent enough
        to keep. When omitted, the notebook-level ``train`` frame is used,
        which is what this function has always done; pass it explicitly to
        avoid depending on global state.
    min_count : int, default 10
        A value must appear strictly more than this many times in
        ``reference`` to be kept.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns filtered.
    """
    cols_to_filter = ['cap-shape', 'cap-surface', 'cap-color',
                      'does-bruise-or-bleed', 'gill-attachment',
                      'gill-spacing', 'gill-color', 'stem-surface',
                      'stem-color', 'has-ring', 'ring-type', 'habitat', 'stem-root', 'veil-type', 'veil-color', 'spore-print-color']

    alphabet = set('abcdefghijklmnopqrstuvwxyz')

    if reference is None:
        # Backward-compatible default: fall back to the global training frame.
        reference = train

    for col in cols_to_filter:
        counts = reference[col].value_counts()
        frequent_values = counts[counts > min_count].index
        allowed = [
            value for value in frequent_values
            if isinstance(value, str) and len(value) == 1 and value in alphabet
        ]
        # Vectorised mask instead of a per-element Python callback. Unlike
        # ``apply``, ``where`` keeps the column's dtype even when every value
        # is filtered out, so the column is not silently turned into float64.
        df[col] = df[col].where(df[col].isin(allowed))

    return df

In [7]:
# Order matters: replace_non_alpha_with_nan reads the notebook-level `train`
# to decide which category values are frequent enough to keep, so by the time
# `test` is cleaned those counts come from the already-cleaned training frame.
train = replace_non_alpha_with_nan(train)
test = replace_non_alpha_with_nan(test)

In [8]:
# Categorical features are the object-dtype columns except the target;
# numeric features are every numeric-dtype column.
cat_cols = [name for name in train.select_dtypes('object').columns if name != 'class']
num_cols = train.select_dtypes('number').columns.tolist()
print(f'Categorical columns:\n {cat_cols}\n')
print(f'Numeric columns:\n {num_cols}')

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']


In [9]:
# NOTE: disabled experiment, kept for reference only; nothing in this cell runs.
# It sketches pairwise products of encoded columns. It would not run as
# written in this notebook: `train_preprocessed` is not defined in the cells
# shown, and `cat_cols` is a plain list of column names (no `.columns`).
# train_combinations = train_preprocessed['cat__class'].to_frame()

# for i, col in enumerate(cat_cols.columns):
#     if col != 'cat__class':
#         for j, col2 in enumerate(cat_cols[i+1:], start=i+1):
#             if col2 != 'cat__class':
#                 combination = cat_cols[col] * cat_cols[col2]
#                 train_combinations = train_combinations.join(combination.rename(f'{col} x {col2}'))

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

def handle_missing_data(df, num_cols, cat_cols, return_preprocessor = False):
    """Impute numeric features and ordinal-encode categorical features.

    Numeric columns are imputed with a 3-nearest-neighbour imputer.
    Categorical columns are ordinal-encoded only: the most-frequent imputer
    is commented out, so missing categorical values are still NaN in the
    result (the printed per-column counts make this visible).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing 'class' plus every column in ``num_cols`` and
        ``cat_cols``. It is not modified.
    num_cols, cat_cols : sequence of str
        Names of the numeric and categorical feature columns.
    return_preprocessor : bool, default False
        When True, also return the fitted ColumnTransformer so the identical
        imputation and encoding can be applied to other data with
        ``.transform``.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, ColumnTransformer)
        Frame with 'class' first, followed by the transformed features in
        ``num_cols + cat_cols`` order, carrying the same index as ``df``.
    """
    # Explicit lists so Index/tuple inputs concatenate instead of broadcasting.
    feature_cols = list(num_cols) + list(cat_cols)

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors = 3))
    ])

    categorical_transformer = Pipeline(steps = [
        # ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, list(num_cols)),
            ('cat', categorical_transformer, list(cat_cols))
        ]
    )

    # Keep the caller's index: a fresh RangeIndex would misalign the target
    # with the features whenever ``df`` does not use the default index.
    df_transformed = pd.DataFrame(
        preprocessor.fit_transform(df[feature_cols]),
        columns = feature_cols,
        index = df.index,
    )

    print("Missing values after imputation:")
    print(df_transformed.isnull().sum())

    # Attach the target positionally; rows are in the same order as ``df``.
    df_transformed.insert(0, 'class', df['class'].to_numpy())
    df_final = df_transformed

    if return_preprocessor:
        return df_final, preprocessor
    return df_final

In [11]:
# Re-binds `train` to the encoded frame ('class' plus the transformed features).
# Re-running this cell without reloading would operate on already-encoded data.
train = handle_missing_data(train, num_cols, cat_cols)

Missing values after imputation:
cap-diameter                  0
stem-height                   0
stem-width                    0
cap-shape                   103
cap-surface              671104
cap-color                    94
does-bruise-or-bleed         92
gill-attachment          524011
gill-spacing            1258510
gill-color                  133
stem-root               2757121
stem-surface            1980917
stem-color                   91
veil-type               2957561
veil-color              2740997
has-ring                     70
ring-type                128920
spore-print-color       2849729
habitat                     100
season                        0
dtype: int64


In [12]:
def handle_missing_testset(df, num_cols, cat_cols, preprocessor = None):
    """Impute numeric features and ordinal-encode categorical features of a
    frame that has no 'class' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing every column in ``num_cols`` and ``cat_cols``.
        It is not modified.
    num_cols, cat_cols : sequence of str
        Names of the numeric and categorical feature columns.
    preprocessor : fitted transformer, optional
        A ColumnTransformer (or any object with ``.transform``) that was
        already fitted on the training features in ``num_cols + cat_cols``
        order. When given, it is applied as-is, so ordinal codes and imputed
        values are consistent with the training data. When omitted, a new
        imputer and encoder are fitted on ``df`` itself (legacy behaviour).
        In that case the integer code assigned to a category can differ from
        the training encoding whenever the two frames do not contain exactly
        the same categories in a column.

    Returns
    -------
    pandas.DataFrame
        Transformed features in ``num_cols + cat_cols`` order, carrying the
        same index as ``df``. Missing categorical values are not imputed on
        the legacy path (the most-frequent imputer is commented out).
    """
    # Explicit lists so Index/tuple inputs concatenate instead of broadcasting.
    feature_cols = list(num_cols) + list(cat_cols)

    if preprocessor is None:
        numeric_transformer = Pipeline(steps = [
            ('imputer', KNNImputer(n_neighbors=3))
        ])

        categorical_transformer = Pipeline(steps = [
            # ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1))
        ])

        fresh_preprocessor = ColumnTransformer(
            transformers = [
                ('num', numeric_transformer, list(num_cols)),
                ('cat', categorical_transformer, list(cat_cols))
            ]
        )
        transformed = fresh_preprocessor.fit_transform(df[feature_cols])
    else:
        # Reuse the training-time fit: transform only, never re-fit.
        transformed = preprocessor.transform(df[feature_cols])

    df_transformed = pd.DataFrame(transformed, columns = feature_cols, index = df.index)

    print("Missing values after imputation:")
    print(df_transformed.isna().sum())

    df_final = df_transformed
    return df_final

In [13]:
# NOTE(review): called this way, handle_missing_testset fits its own
# imputer/encoder on `test`, so the ordinal codes only match the training
# encoding if both frames contain the same set of categories in each column.
test = handle_missing_testset(test, num_cols, cat_cols)

Missing values after imputation:
cap-diameter                  0
stem-height                   0
stem-width                    0
cap-shape                    78
cap-surface              446951
cap-color                    63
does-bruise-or-bleed         65
gill-attachment          349877
gill-spacing             839642
gill-color                   98
stem-root               1838068
stem-surface            1321534
stem-color                   66
veil-type               1971579
veil-color              1826171
has-ring                     71
ring-type                 86224
spore-print-color       1899666
habitat                      70
season                        0
dtype: int64


In [14]:
from sklearn.preprocessing import LabelEncoder
# Fit the label encoder on the target, then overwrite the column with its
# integer codes; `le` is kept so predictions can be mapped back to labels.
le = LabelEncoder().fit(train['class'])
train['class'] = le.transform(train['class'])

In [15]:
# Separate the feature matrix from the encoded target.
X = train.drop(columns = 'class')
y = train['class']

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data with a fixed seed. Despite the names,
# X_test / y_test are this validation split, not the competition `test` frame.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [17]:
from sklearn.metrics import matthews_corrcoef

def mcc_metric(y_pred, dmatrix):
    """Matthews correlation coefficient as a custom evaluation metric.

    Parameters
    ----------
    y_pred : array-like
        Model scores; values above 0.5 are counted as class 1, others as 0.
    dmatrix : object exposing ``get_label()``
        Evaluation data whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple
        ``('mcc', value)``: the metric name and its score.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    return 'mcc', matthews_corrcoef(dmatrix.get_label(), hard_labels)

In [18]:
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier
import gc

# Gradient-boosted tree classifier with hard-coded hyperparameters; how these
# values were chosen is not shown in this notebook. random_state fixes the seed.
model = XGBClassifier(

    colsample_bytree = 0.6,
    max_depth = 14,
    min_child_weight = 7,
    random_state = 42,
    n_estimators = 200,
)

In [19]:
# Train on the 80% split and log logloss plus the custom MCC metric on the
# held-out split after every boosting round (see the output below).
# NOTE(review): newer XGBoost releases expect `eval_metric` on the estimator
# constructor rather than on fit() — confirm against the installed version.
XGB = model.fit(
    X_train, 
    y_train, 
    eval_set = [(X_test, y_test)],
    eval_metric = mcc_metric)



[0]	validation_0-logloss:0.46222	validation_0-mcc:0.91813
[1]	validation_0-logloss:0.32837	validation_0-mcc:0.96150
[2]	validation_0-logloss:0.24617	validation_0-mcc:0.96949
[3]	validation_0-logloss:0.19085	validation_0-mcc:0.97191
[4]	validation_0-logloss:0.14647	validation_0-mcc:0.97723
[5]	validation_0-logloss:0.12313	validation_0-mcc:0.97799
[6]	validation_0-logloss:0.10108	validation_0-mcc:0.97966
[7]	validation_0-logloss:0.08521	validation_0-mcc:0.98064
[8]	validation_0-logloss:0.07556	validation_0-mcc:0.98096
[9]	validation_0-logloss:0.06499	validation_0-mcc:0.98132
[10]	validation_0-logloss:0.06043	validation_0-mcc:0.98128
[11]	validation_0-logloss:0.05418	validation_0-mcc:0.98181
[12]	validation_0-logloss:0.05136	validation_0-mcc:0.98184
[13]	validation_0-logloss:0.04779	validation_0-mcc:0.98224
[14]	validation_0-logloss:0.04550	validation_0-mcc:0.98246
[15]	validation_0-logloss:0.04355	validation_0-mcc:0.98260
[16]	validation_0-logloss:0.04200	validation_0-mcc:0.98287
[17]	va

In [20]:
# Class predictions for the held-out validation split.
y_pred = XGB.predict(X_test)

In [21]:
# Matthews correlation coefficient between validation labels and predictions.
score = matthews_corrcoef(y_test, y_pred)
print('MCC', score)

MCC 0.9840690757540304


In [22]:
# Despite the `_prob` suffix this holds the output of predict(), which the next
# cell passes to the label encoder's inverse_transform as encoded class labels.
test_pred_prob = XGB.predict(test)

In [23]:
# Map the integer predictions back to the original 'class' labels.
test_pred_class = le.inverse_transform(test_pred_prob)

In [24]:
# Start from the sample submission and overwrite its 'class' column.
# Alignment is purely positional ('id' was dropped from `test` earlier):
# assumes sample_submission.csv lists rows in the same order as test.csv — TODO confirm.
submission = pd.read_csv('sample_submission.csv')

submission['class'] = test_pred_class

In [25]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index = False)