In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder

In [4]:
cat_cols = [col for col in train.select_dtypes('object').columns]
num_cols = [col for col in train.select_dtypes('number').columns]

In [5]:
from sklearn.compose import ColumnTransformer

In [6]:
def replace_non_alpha_with_nan(df):
    cols_to_filter = ['cap-shape', 'cap-surface', 'cap-color', 
                      'does-bruise-or-bleed', 'gill-attachment', 
                      'gill-spacing', 'gill-color', 'stem-surface', 
                      'stem-color', 'has-ring', 'ring-type', 'habitat', 'stem-root', 'veil-type', 'veil-color', 'spore-print-color']

    alphabet_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 
                     'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

    col_values = {}
    for col in cols_to_filter:
        value_counts = train[col].value_counts()
        col_values[col] = value_counts[value_counts > 10].index.values.tolist()


    def filter_alpha(value, value_list_no_outliers):
        if isinstance(value, str):
            return value if len(value) == 1 and value in value_list_no_outliers and value in alphabet_list else np.nan # if value is a single character
        
        return np.nan
    
    for col in cols_to_filter:
        df[col] = df[col].apply(lambda x : filter_alpha(x, col_values[col]))

    return df

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

def handle_missing_data(df, num_cols, cat_cols):

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors = 3))
    ])

    categorical_transformer = Pipeline(steps = [
        # ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ]
    )

    df_transformed = pd.DataFrame(preprocessor.fit_transform(df[num_cols + cat_cols]), columns = num_cols + cat_cols)

    df_transformed = df_transformed.fillna(-20)

    print("Missing values after imputation:")
    print(df_transformed.isnull().sum())

    # df_final = df[['class']].join(df_transformed)

    df_final = df_transformed

    return df_final

In [8]:
train = replace_non_alpha_with_nan(train)
train_preprocessed = handle_missing_data(train, num_cols, cat_cols)

Missing values after imputation:
id                      0
cap-diameter            0
stem-height             0
stem-width              0
class                   0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-root               0
stem-surface            0
stem-color              0
veil-type               0
veil-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64


In [9]:
from sklearn.preprocessing import LabelEncoder

ord_enc = LabelEncoder()
train['class'] = ord_enc.fit_transform(train['class'])

In [10]:
train

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,0,8.80,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,1,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,0,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,0,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,0,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,0,9.29,f,,n,t,,,w,...,b,,w,u,w,t,g,,d,u
3116941,3116941,0,10.88,s,,w,t,d,c,p,...,,,w,,,f,f,,d,u
3116942,3116942,1,7.82,x,e,e,f,a,,w,...,,,y,,w,t,z,,d,a
3116943,3116943,0,9.45,p,i,n,t,e,,p,...,,y,w,,,t,p,,d,u


In [11]:
train['class'] = ord_enc.inverse_transform(train['class'])

In [12]:
train

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.80,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,e,9.29,f,,n,t,,,w,...,b,,w,u,w,t,g,,d,u
3116941,3116941,e,10.88,s,,w,t,d,c,p,...,,,w,,,f,f,,d,u
3116942,3116942,p,7.82,x,e,e,f,a,,w,...,,,y,,w,t,z,,d,a
3116943,3116943,e,9.45,p,i,n,t,e,,p,...,,y,w,,,t,p,,d,u


In [13]:
corr_matrix = train_preprocessed.corr()

In [14]:
threshold = abs(corr_matrix['class']).sort_values(ascending=False).mean()

In [15]:
import itertools

def find_train_combinations(train, cat_cols, num_cols):
    
    all_columns = cat_cols + num_cols

    filtered_cols = [col for col in all_columns if col != 'class']
    combinations = itertools.combinations(filtered_cols, 2)

    train_combinations = train['class'].to_frame()

    for col1, col2 in combinations:
       combination = train[col1] * train[col2]
       train_combinations = train_combinations.join(combination.rename(f'{col1} x {col2}'))

    return train_combinations

train_combinations = find_train_combinations(train_preprocessed, cat_cols, num_cols)

In [16]:
corr_combinations = train_combinations.corr()

In [17]:
abs_values = abs(corr_combinations['class'])
new_cols = abs_values.loc[abs_values > threshold].index.tolist()

In [18]:
new_cols.remove('class')

In [19]:
train = train.join(train_combinations[new_cols])

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,spore-print-color x stem-width,habitat x cap-diameter,habitat x stem-width,season x cap-diameter,season x stem-width,id x cap-diameter,id x stem-width,cap-diameter x stem-height,cap-diameter x stem-width,stem-height x stem-width
0,0,e,8.80,f,s,u,f,a,c,w,...,-307.8,26.40,46.17,0.00,0.00,0.00,0.00,39.6880,135.4320,69.4089
1,1,p,4.51,x,h,o,f,a,c,n,...,-129.6,13.53,19.44,13.53,19.44,4.51,6.48,21.6029,29.2248,31.0392
2,2,e,6.94,f,s,b,f,x,c,w,...,-198.6,62.46,89.37,20.82,29.79,13.88,19.86,47.5390,68.9142,68.0205
3,3,e,3.88,f,y,g,f,s,,g,...,-130.6,11.64,19.59,7.76,13.06,11.64,19.59,16.1408,25.3364,27.1648
4,4,e,5.85,x,l,w,f,d,,w,...,-167.2,35.10,50.16,0.00,0.00,23.40,33.44,19.7145,48.9060,28.1732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,e,9.29,f,,n,t,,,w,...,-376.2,27.87,56.43,18.58,37.62,28956372.60,58629641.40,112.7806,174.7449,228.3534
3116941,3116941,e,10.88,s,,w,t,d,c,p,...,-539.4,32.64,80.91,21.76,53.94,33912318.08,84063898.77,72.3520,293.4336,179.3505
3116942,3116942,p,7.82,x,e,e,f,a,,w,...,-221.2,23.46,33.18,0.00,0.00,24374486.44,34473378.52,74.3682,86.4892,105.1806
3116943,3116943,e,9.45,p,i,n,t,e,,p,...,-355.4,28.35,53.31,18.90,35.54,29455111.35,55388077.11,86.2785,167.9265,162.2401


In [22]:
train['class'] = ord_enc.fit_transform(train['class'])

In [26]:
train_preprocessed['class'] = train['class']

In [28]:
train_preprocessed = train_preprocessed.join(train_combinations[new_cols])

In [30]:
corr_matrix_after = train_preprocessed.corr()

In [46]:
abs(corr_matrix_after['class']).sort_values(ascending=False).to_csv('correlations.csv')

In [99]:
# from sklearn.preprocessing import OrdinalEncoder

# preprocessing = ColumnTransformer([
#     ('cat', OrdinalEncoder(), cat_cols)
# ])

# train_prep = preprocessing.fit_transform(train)

In [100]:
# train_preprocessed = pd.DataFrame(train_prep, columns = preprocessing.get_feature_names_out())

In [101]:
corr_matrix = train_preprocessed.corr()

In [102]:
train_preprocessed

Unnamed: 0,id,cap-diameter,stem-height,stem-width,class,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0.0,8.80,4.51,15.39,0.0,53.0,72.0,72.0,8.0,44.0,...,,,55.0,,,5.0,18.0,,25.0,0.0
1,1.0,4.51,4.79,6.48,1.0,71.0,56.0,64.0,8.0,44.0,...,,58.0,47.0,,,18.0,39.0,,25.0,3.0
2,2.0,6.94,6.85,9.93,0.0,53.0,72.0,49.0,8.0,75.0,...,,51.0,46.0,,,5.0,18.0,,36.0,3.0
3,3.0,3.88,4.16,6.53,0.0,53.0,81.0,57.0,8.0,70.0,...,,,55.0,,,5.0,18.0,,25.0,2.0
4,4.0,5.85,3.37,8.36,0.0,71.0,65.0,74.0,8.0,47.0,...,,,55.0,,,5.0,18.0,,29.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940.0,9.29,12.14,18.81,0.0,53.0,,63.0,20.0,,...,15.0,,55.0,19.0,21.0,18.0,19.0,,25.0,2.0
3116941,3116941.0,10.88,6.65,26.97,0.0,67.0,,74.0,20.0,47.0,...,,,55.0,,,5.0,18.0,,25.0,2.0
3116942,3116942.0,7.82,9.51,11.06,1.0,71.0,53.0,55.0,8.0,44.0,...,,,57.0,,21.0,18.0,39.0,,25.0,0.0
3116943,3116943.0,9.45,9.13,17.77,0.0,64.0,59.0,63.0,20.0,52.0,...,,58.0,55.0,,,18.0,27.0,,25.0,2.0


In [103]:
corr_matrix['class'].sort_values()

veil-color             -0.378448
spore-print-color      -0.240618
stem-width             -0.169807
cap-diameter           -0.162627
stem-surface           -0.129510
gill-spacing           -0.103937
cap-shape              -0.090220
gill-attachment        -0.082450
stem-color             -0.070814
season                 -0.063554
gill-color             -0.060206
stem-height            -0.049740
cap-surface            -0.038520
does-bruise-or-bleed   -0.037555
habitat                -0.027232
id                     -0.000136
veil-type               0.003288
cap-color               0.043374
has-ring                0.049628
ring-type               0.126961
stem-root               0.347033
class                   1.000000
Name: class, dtype: float64

In [104]:
train_combinations = train_preprocessed['cat__class'].to_frame()

KeyError: 'cat__class'

In [None]:
cap_shape_x_habitat = train_preprocessed['cat__cap-shape'] * train_preprocessed['cat__habitat']


train_combinations.join(cap_shape_x_habitat.rename('cap-shape x habitat'))

Unnamed: 0,cat__class,cap-shape x habitat
0,0.0,1325.0
1,1.0,1775.0
2,0.0,1908.0
3,0.0,1325.0
4,0.0,2059.0
...,...,...
3116940,0.0,1325.0
3116941,0.0,1675.0
3116942,1.0,1775.0
3116943,0.0,1600.0


In [None]:
for i, col in enumerate(train_preprocessed.columns):
    if col != 'cat__class':
        for j, col2 in enumerate(train_preprocessed.columns[i+1:], start=i+1):
            if col2 != 'cat__class':
                combination = train_preprocessed[col] * train_preprocessed[col2]
                train_combinations = train_combinations.join(combination.rename(f'{col} x {col2}'))

In [None]:
train_combinations.shape

(3116945, 137)

In [None]:
corr_matrix = train_combinations.corr()

In [None]:
corr_matrix['cat__class'].sort_values()

cat__gill-attachment x cat__veil-color       -0.545771
cat__cap-surface x cat__veil-color           -0.487548
cat__does-bruise-or-bleed x cat__veil-type   -0.475117
cat__cap-shape x cat__veil-color             -0.446869
cat__gill-spacing x cat__veil-color          -0.441272
                                                ...   
cat__stem-root x cat__stem-surface            0.551504
cat__stem-root x cat__veil-color              0.644496
cat__stem-surface x cat__veil-type            0.948013
cat__stem-root x cat__veil-type               0.981362
cat__class                                    1.000000
Name: cat__class, Length: 137, dtype: float64

In [None]:
corr_matrix_original = train_preprocessed.corr()

In [None]:
corr_matrix_original['cat__class'].sort_values()

cat__veil-color             -0.378448
cat__spore-print-color      -0.240618
cat__stem-surface           -0.129510
cat__gill-spacing           -0.103937
cat__cap-shape              -0.090220
cat__gill-attachment        -0.082450
cat__stem-color             -0.070814
cat__season                 -0.063554
cat__gill-color             -0.060206
cat__cap-surface            -0.038520
cat__does-bruise-or-bleed   -0.037555
cat__habitat                -0.027232
cat__veil-type               0.003288
cat__cap-color               0.043374
cat__has-ring                0.049628
cat__ring-type               0.126961
cat__stem-root               0.347033
cat__class                   1.000000
Name: cat__class, dtype: float64