In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
from sklearn.preprocessing import OneHotEncoder

In [5]:
cat_cols = [col for col in train.select_dtypes('object').columns]
num_cols = [col for col in train.select_dtypes('number').columns]

In [6]:
from sklearn.compose import ColumnTransformer

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

def encode_data(df, num_cols, cat_cols):

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors = 3))
    ])

    categorical_transformer = Pipeline(steps = [
        # ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ]
    )

    df_transformed = pd.DataFrame(preprocessor.fit_transform(df[num_cols + cat_cols]), columns = num_cols + cat_cols)

    # df_final = df[['class']].join(df_transformed)

    df_final = df_transformed

    return df_final

In [8]:
train_preprocessed = encode_data(train, num_cols, cat_cols)

In [9]:
import itertools
from sklearn.preprocessing import LabelEncoder

def find_train_combinations(train, cat_cols, num_cols):
    
    all_columns = cat_cols + num_cols

    # ? returning features from train_combinations with correlations greater than the mean of the original
    
    ord_enc = LabelEncoder()
    train['class'] = ord_enc.fit_transform(train['class'])

   #  corr_matrix = train.corr()
    
   #  threshold = abs(corr_matrix['class']).sort_values(ascending=False).mean()
   #  print(f" Mean Correlation of Original Data {threshold}")


    filtered_cols = [col for col in all_columns if col != 'class']
    print(filtered_cols)
    print(train.columns)
    combinations = itertools.combinations(filtered_cols, 2)
    print(combinations)

    train_combinations = train['class'].to_frame()

    for col1, col2 in combinations:
       combination = train[col1] * train[col2]
       train_combinations = train_combinations.join(combination.rename(f'{col1} x {col2}'))
    
    # ? returning features from train_combinations with correlations greater than the mean of the original

   #  corr_combinations = train_combinations.corr()
   #  abs_values = abs(corr_combinations['class'])
   #  new_cols = abs_values.loc[abs_values > threshold].index.tolist()

    new_cols = train_combinations.columns.tolist()

    if 'class' in new_cols:
        new_cols.remove('class')
   #  new_cols = ['habitat x cap-diameter', 'habitat x stem-width']
    
    
    train['class'] = ord_enc.inverse_transform(train['class'])
    return train.join(train_combinations[new_cols])

In [11]:
train_preprocessed_corr = find_train_combinations(train_preprocessed, cat_cols, num_cols)

['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season', 'id', 'cap-diameter', 'stem-height', 'stem-width']
Index(['id', 'cap-diameter', 'stem-height', 'stem-width', 'class', 'cap-shape',
       'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment',
       'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')
<itertools.combinations object at 0x0000027422529760>


In [12]:
corr_matrix = train_preprocessed_corr.corr()

In [14]:
abs(corr_matrix['class']).sort_values(ascending= False)

class                               1.000000
stem-root x veil-type               0.981362
stem-surface x veil-type            0.948013
stem-root x veil-color              0.644496
spore-print-color x cap-diameter    0.563447
                                      ...   
veil-type x id                      0.001661
has-ring x season                   0.000525
id                                  0.000136
gill-attachment x has-ring          0.000056
cap-color x stem-surface            0.000047
Name: class, Length: 232, dtype: float64

In [15]:
def handle_missing_data(df_transformed):
    df_transformed = df_transformed.fillna(-10)

    print("Missing values after imputation:")
    print(df_transformed.isnull().sum())
    
    return df_transformed

In [16]:
train_preprocessed_imputed = handle_missing_data(train_preprocessed_corr)

Missing values after imputation:
id                            0
cap-diameter                  0
stem-height                   0
stem-width                    0
class                         0
                             ..
id x stem-height              0
id x stem-width               0
cap-diameter x stem-height    0
cap-diameter x stem-width     0
stem-height x stem-width      0
Length: 232, dtype: int64


In [17]:
corr_imputed = train_preprocessed_imputed.corr()

In [20]:
abs(corr_imputed['class']).sort_values(ascending=False)

class                                 1.000000
habitat x stem-width                  0.192176
habitat x cap-diameter                0.186009
stem-color x stem-width               0.176561
stem-color x cap-diameter             0.175999
                                        ...   
gill-spacing x ring-type              0.001188
cap-shape x cap-surface               0.000559
has-ring x season                     0.000526
id                                    0.000136
cap-surface x does-bruise-or-bleed    0.000014
Name: class, Length: 232, dtype: float64