In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the training and test splits (train first, then test) from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Remove the 'id' column from both frames.
train, test = (frame.drop(columns='id') for frame in (train, test))

In [4]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [5]:
def replace_non_alpha_with_nan(df, reference=None, min_count=10):
    """Replace implausible category codes with NaN in the categorical columns.

    A value is kept only when it is a single lowercase ASCII letter AND it
    occurs more than ``min_count`` times in the same column of ``reference``.
    Everything else (multi-character strings, digits, rare codes, non-string
    values, missing values) becomes NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is NOT modified; a cleaned copy is returned.
    reference : pandas.DataFrame, optional
        Frame whose per-column value counts decide which codes are frequent
        enough to keep. Defaults to the notebook-level ``train`` frame, which
        is what this function always used before the parameter existed.
    min_count : int, default 10
        A code must appear strictly more than this many times in
        ``reference`` to be kept.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the filtered categorical columns.
    """
    cols_to_filter = ['cap-shape', 'cap-surface', 'cap-color',
                      'does-bruise-or-bleed', 'gill-attachment',
                      'gill-spacing', 'gill-color', 'stem-surface',
                      'stem-color', 'has-ring', 'ring-type', 'habitat',
                      'stem-root', 'veil-type', 'veil-color', 'spore-print-color']

    lowercase_letters = set('abcdefghijklmnopqrstuvwxyz')

    if reference is None:
        # Backward-compatible default: fall back to the global training frame.
        reference = train

    # Work on a copy so re-running the calling cell never depends on (or
    # silently alters) the caller's original frame.
    cleaned = df.copy()

    for col in cols_to_filter:
        counts = reference[col].value_counts()
        # Allowed codes: frequent in the reference AND a single lowercase letter.
        allowed = [
            value for value in counts[counts > min_count].index
            if isinstance(value, str) and len(value) == 1 and value in lowercase_letters
        ]
        # Vectorised filter: anything outside `allowed` (including non-strings
        # and existing NaN) is replaced with NaN.
        cleaned[col] = cleaned[col].where(cleaned[col].isin(allowed))

    return cleaned

In [6]:
# Clean the categorical codes of the training frame first, then the test frame.
train = replace_non_alpha_with_nan(df=train)
test = replace_non_alpha_with_nan(df=test)

In [7]:
# Feature names grouped by dtype; the 'class' target is left out of the categorical list.
cat_cols = train.select_dtypes('object').columns.drop('class', errors='ignore').tolist()
num_cols = train.select_dtypes('number').columns.tolist()
print(f'Categorical columns:\n {cat_cols}\n')
print(f'Numeric columns:\n {num_cols}')

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

def _shift_codes_by_one(encoded):
    """Add 1 to every ordinal code so the unknown-category code -1 becomes 0."""
    return encoded + 1


def encode_train_data(df, num_cols, cat_cols, return_preprocessor=False):
    """Fit the preprocessing pipeline on ``df`` and return the encoded frame.

    Numeric columns are imputed with a 3-nearest-neighbour imputer.
    Categorical columns are ordinal-encoded (unseen categories map to -1) and
    then shifted by +1. Categorical missing values are not imputed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'class' column plus every column named in
        ``num_cols`` and ``cat_cols``.
    num_cols : sequence of str
        Numeric feature column names.
    cat_cols : sequence of str
        Categorical feature column names.
    return_preprocessor : bool, default False
        When True, also return the fitted ``ColumnTransformer`` so the same
        encoding can later be applied to other data with ``.transform``.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, ColumnTransformer)
        The 'class' column followed by the transformed feature columns,
        indexed like ``df``; plus the fitted preprocessor when requested.
    """
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=3)),
    ])

    categorical_transformer = Pipeline(steps=[
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        # A named function (rather than a lambda) keeps the fitted pipeline picklable.
        ('adjust', FunctionTransformer(_shift_codes_by_one)),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols),
        ]
    )

    feature_cols = list(num_cols) + list(cat_cols)

    # Reuse df's index: the join below aligns on index, so a fresh RangeIndex
    # would misalign rows whenever df has been filtered or re-indexed.
    df_transformed = pd.DataFrame(
        preprocessor.fit_transform(df[feature_cols]),
        columns=feature_cols,
        index=df.index,
    )

    df_final = df[['class']].join(df_transformed)

    if return_preprocessor:
        return df_final, preprocessor
    return df_final

In [9]:
# Replace the training frame with its encoded version ('class' plus transformed features).
train = encode_train_data(df=train, num_cols=num_cols, cat_cols=cat_cols)

In [10]:
def handle_missing_data(df_transformed, fill_value=-10):
    """Fill every remaining missing value with a sentinel and report the result.

    Parameters
    ----------
    df_transformed : pandas.DataFrame
        Frame that may still contain NaN values. It is not modified.
    fill_value : scalar, default -10
        Sentinel written in place of each missing value.

    Returns
    -------
    pandas.DataFrame
        New frame with all missing values replaced by ``fill_value``.
    """
    df_transformed = df_transformed.fillna(fill_value)

    # Print per-column null counts so the cell output confirms nothing is left.
    print("Missing values after imputation:")
    print(df_transformed.isnull().sum())
    return df_transformed

In [11]:
# Fill whatever is still missing in the encoded training frame.
train = handle_missing_data(df_transformed=train)

Missing values after imputation:
class                   0
cap-diameter            0
stem-height             0
stem-width              0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-root               0
stem-surface            0
stem-color              0
veil-type               0
veil-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64


In [12]:
from sklearn.preprocessing import LabelEncoder
# Fit the label encoder on the target, then overwrite the column with its integer codes.
le = LabelEncoder().fit(train['class'])
train['class'] = le.transform(train['class'])

In [13]:
# Separate the feature matrix from the encoded target.
X = train.drop(columns='class')
y = train['class']

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
from sklearn.ensemble import RandomForestClassifier

# 500-tree random forest, seeded, using every available core (n_jobs=-1).
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
)
rnd_clf.fit(X_train, y_train)

In [19]:
# Will map feature name -> importance score.
features = dict()

In [20]:
# Print each importance next to its feature name and record it in `features`.
for name, score in zip(X_train.columns, rnd_clf.feature_importances_):
    print(score, name)
    features[name] = score

0.07421968821824947 cap-diameter
0.0665977407285968 stem-height
0.12528471179604794 stem-width
0.04552303460794976 cap-shape
0.08002795288801462 cap-surface
0.04651689756460876 cap-color
0.02745682127554058 does-bruise-or-bleed
0.08668959098635648 gill-attachment
0.07454080392592163 gill-spacing
0.06597129534039452 gill-color
0.04026662889316048 stem-root
0.057742587303075445 stem-surface
0.07183809255229735 stem-color
0.008223712626854377 veil-type
0.013679599649420887 veil-color
0.020537066132967248 has-ring
0.043208644547018955 ring-type
0.012779982630003909 spore-print-color
0.02688179407735533 habitat
0.012013354256165228 season


In [29]:
# Parallel lists of feature names and their importances, in insertion order.
keys = [*features]
values = [*features.values()]

In [33]:
# Positions that order the importances from smallest to largest.
sorted_value_index = np.asarray(values).argsort()

In [31]:
# Rebuild the mapping in ascending order of importance.
sorted_features = dict((keys[idx], values[idx]) for idx in sorted_value_index)

In [32]:
# Display the importances, least important feature first.
sorted_features

{'veil-type': np.float64(0.008223712626854377),
 'season': np.float64(0.012013354256165228),
 'spore-print-color': np.float64(0.012779982630003909),
 'veil-color': np.float64(0.013679599649420887),
 'has-ring': np.float64(0.020537066132967248),
 'habitat': np.float64(0.02688179407735533),
 'does-bruise-or-bleed': np.float64(0.02745682127554058),
 'stem-root': np.float64(0.04026662889316048),
 'ring-type': np.float64(0.043208644547018955),
 'cap-shape': np.float64(0.04552303460794976),
 'cap-color': np.float64(0.04651689756460876),
 'stem-surface': np.float64(0.057742587303075445),
 'gill-color': np.float64(0.06597129534039452),
 'stem-height': np.float64(0.0665977407285968),
 'stem-color': np.float64(0.07183809255229735),
 'cap-diameter': np.float64(0.07421968821824947),
 'gill-spacing': np.float64(0.07454080392592163),
 'cap-surface': np.float64(0.08002795288801462),
 'gill-attachment': np.float64(0.08668959098635648),
 'stem-width': np.float64(0.12528471179604794)}