In [156]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import kurtosis, skew, trim_mean
from sklearn.preprocessing import QuantileTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import matthews_corrcoef, confusion_matrix, ConfusionMatrixDisplay

In [157]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [114]:
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

In [115]:
pd.set_option('display.max_columns', None)

In [116]:
train.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,e,8.8,f,s,u,f,a,c,w,4.51,15.39,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,6.48,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,9.93,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,6.53,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,8.36,,,w,,,f,f,,g,a


In [117]:
test.head()

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.64,x,,n,t,,,w,11.13,17.12,b,,w,u,w,t,g,,d,a
1,6.9,o,t,o,f,,c,y,1.27,10.75,,,n,,,f,f,,d,a
2,2.0,b,g,n,f,,c,n,6.18,3.14,,,n,,,f,f,,d,s
3,3.47,x,t,n,f,s,c,n,4.98,8.51,,,w,,n,t,z,,d,u
4,6.17,x,h,y,f,p,,y,6.73,13.7,,,y,,y,t,,,d,u


In [118]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cap-diameter,3116941.0,6.309848,4.657931,0.03,3.32,5.75,8.24,80.67
stem-height,3116945.0,6.348333,2.699755,0.0,4.67,5.88,7.41,88.72
stem-width,3116945.0,11.153785,8.095477,0.0,4.97,9.65,15.63,102.9


In [119]:
train.describe(include = 'object').T

Unnamed: 0,count,unique,top,freq
class,3116945,2,p,1705396
cap-shape,3116905,74,x,1436026
cap-surface,2445922,83,t,460777
cap-color,3116933,78,n,1359542
does-bruise-or-bleed,3116937,26,f,2569743
gill-attachment,2593009,78,a,646034
gill-spacing,1858510,48,c,1331054
gill-color,3116888,63,w,931538
stem-root,359922,38,b,165801
stem-surface,1136084,60,s,327610


In [120]:
test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cap-diameter,2077957.0,6.306192,4.685462,0.0,3.31,5.74,8.23,607.0
stem-height,2077963.0,6.346509,2.698978,0.0,4.67,5.88,7.41,57.29
stem-width,2077964.0,11.148374,8.100181,0.0,4.97,9.64,15.62,102.91


In [121]:
test.describe(include = 'object').T

Unnamed: 0,count,unique,top,freq
cap-shape,2077933,62,x,957949
cap-surface,1631060,59,t,306852
cap-color,2077951,57,n,904307
does-bruise-or-bleed,2077954,22,f,1713662
gill-attachment,1728143,66,a,430960
gill-spacing,1238369,35,c,886976
gill-color,2077915,56,w,620773
stem-root,239952,31,b,110581
stem-surface,756476,54,s,218496
stem-color,2077943,55,w,797365


In [122]:
train.isna().sum()

class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [123]:
test.isna().sum()

cap-diameter                  7
cap-shape                    31
cap-surface              446904
cap-color                    13
does-bruise-or-bleed         10
gill-attachment          349821
gill-spacing             839595
gill-color                   49
stem-height                   1
stem-width                    0
stem-root               1838012
stem-surface            1321488
stem-color                   21
veil-type               1971545
veil-color              1826124
has-ring                     19
ring-type                 86195
spore-print-color       1899617
habitat                      25
season                        0
dtype: int64

In [124]:
missing_train = train.isna().mean() * 100
missing_test = test.isna().mean() * 100

In [125]:
print(missing_train[missing_train > 85])
print()
print(missing_test[missing_test > 85])

stem-root            88.452732
veil-type            94.884350
veil-color           87.936970
spore-print-color    91.425482
dtype: float64

stem-root            88.452543
veil-type            94.878689
veil-color           87.880445
spore-print-color    91.417224
dtype: float64


In [126]:
cols_to_drop_train = missing_train[missing_train > 85].index

cols_to_drop_test = missing_test[missing_test > 85].index

train = train.drop(cols_to_drop_train, axis = 1)

test = test.drop(cols_to_drop_test, axis = 1)



In [127]:
print(f'Train set Shape: {train.shape}')
print(f"Test Set Shape: {test.shape}")

Train set Shape: (3116945, 17)
Test Set Shape: (2077964, 16)


In [128]:
print(f'Number of Duplicated Values in Train Set: {train.duplicated().sum()}')
print(f'Number of Duplicated Values in test Set: {test.duplicated().sum()}')

Number of Duplicated Values in Train Set: 157
Number of Duplicated Values in test Set: 85


In [129]:

# ! Removing Duplicates causes data loss - no duplicates in ids
# train = train.drop_duplicates()
print(f"Train set shape: {train.shape}")
# test = test.drop_duplicates()
print(f'test set shape: {test.shape}')

Train set shape: (3116945, 17)
test set shape: (2077964, 16)


In [130]:
def replace_non_alpha_with_nan(df):
    cols_to_filter = ['cap-shape', 'cap-surface', 'cap-color', 
                      'does-bruise-or-bleed', 'gill-attachment', 
                      'gill-spacing', 'gill-color', 'stem-surface', 
                      'stem-color', 'has-ring', 'ring-type', 'habitat']

    alphabet_list = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 
                     'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    
    def filter_alpha(value):
        if isinstance(value, str):
            return value if len(value) == 1 and value in alphabet_list else np.nan # if value is a single character
        
        return np.nan
    
    for col in cols_to_filter:
        df[col] = df[col].apply(filter_alpha)

    return df

In [131]:
train = replace_non_alpha_with_nan(train)
test = replace_non_alpha_with_nan(test)

In [132]:
cat_cols = [col for col in train.select_dtypes('object').columns if col != 'class']
num_cols = [col for col in train.select_dtypes('number').columns]
print(f'Categorical columns:\n {cat_cols}\n')
print(f'Numeric columns:\n {num_cols}')

Categorical columns:
 ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-surface', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']

Numeric columns:
 ['cap-diameter', 'stem-height', 'stem-width']


In [133]:
mean_vs_trimmed_mean = pd.DataFrame({'mean':train[num_cols].mean(),
                                     'trimmed_mean' : train[num_cols].apply(lambda x: trim_mean(x, proportiontocut=0.1))})

kurtosis_skewness = pd.DataFrame({'kurtosis': train[num_cols].apply(kurtosis),
                                  'skewness': train[num_cols].apply(skew)})

print("Mean vs Trimmed Mean:")
print(mean_vs_trimmed_mean)

print("\nKurtosis and Skewness:")
print(kurtosis_skewness)

Mean vs Trimmed Mean:
                   mean  trimmed_mean
cap-diameter   6.309848      5.809199
stem-height    6.348333      6.025546
stem-width    11.153785     10.169430

Kurtosis and Skewness:
              kurtosis  skewness
cap-diameter       NaN       NaN
stem-height   7.761535  1.926681
stem-width    2.448970  1.235426


In [134]:
# Calculate mean and trimmed mean for each numeric columns
mean_vs_trimmed_mean = pd.DataFrame({'mean': test[num_cols].mean(),
                                     'trimmed_mean': test[num_cols].apply(lambda x: trim_mean(x, proportiontocut=0.1))})

# Calculate kurtosis and skewness for each numeric columns
kurtosis_skewness = pd.DataFrame({'kurtosis': test[num_cols].apply(kurtosis),
                                  'skewness': test[num_cols].apply(skew)})

print("Mean vs Trimmed Mean:")
print(mean_vs_trimmed_mean)

print("\nKurtosis and Skewness:")
print(kurtosis_skewness)

Mean vs Trimmed Mean:
                   mean  trimmed_mean
cap-diameter   6.306192      5.802797
stem-height    6.346509      6.023292
stem-width    11.148374     10.162144

Kurtosis and Skewness:
              kurtosis  skewness
cap-diameter       NaN       NaN
stem-height        NaN       NaN
stem-width    2.567584   1.24934


Handling Null Values

In [135]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

def handle_missing_data(df, num_cols, cat_cols):

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors = 3))
    ])

    categorical_transformer = Pipeline(steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ]
    )

    df_transformed = pd.DataFrame(preprocessor.fit_transform(df[num_cols + cat_cols]), columns = num_cols + cat_cols)

    print("Missing values after imputation:")
    print(df_transformed.isnull().sum())

    df_final = df[['class']].join(df_transformed)

    return df_final

In [136]:
train = handle_missing_data(train, num_cols, cat_cols)

Missing values after imputation:
cap-diameter            0
stem-height             0
stem-width              0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-surface            0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64


In [137]:
def handle_missing_testset(df, num_cols, cat_cols):

    numeric_transformer = Pipeline(steps = [
        ('imputer', KNNImputer(n_neighbors=3))
    ])

    categorical_transformer = Pipeline(steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1))
    ])

    preprocessor = ColumnTransformer(
        transformers = [
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols)
        ]
    )

    df_transformed = pd.DataFrame(preprocessor.fit_transform(df[num_cols + cat_cols]), columns = num_cols + cat_cols)

    print("Missing values after imputation:")
    print(df_transformed.isna().sum())

    df_final = df_transformed
    return df_final

In [138]:
test = handle_missing_testset(test, num_cols, cat_cols)

Missing values after imputation:
cap-diameter            0
stem-height             0
stem-width              0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-surface            0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64


In [139]:
print(f'train set shape: {train.shape}')
train.head()

train set shape: (3116945, 17)


Unnamed: 0,class,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,e,8.8,4.51,15.39,5.0,16.0,18.0,5.0,0.0,2.0,19.0,16.0,19.0,4.0,5.0,3.0,0.0
1,p,4.51,4.79,6.48,20.0,7.0,13.0,5.0,0.0,2.0,12.0,21.0,13.0,16.0,22.0,3.0,3.0
2,e,6.94,6.85,9.93,5.0,16.0,1.0,5.0,20.0,2.0,19.0,16.0,12.0,4.0,5.0,10.0,3.0
3,e,3.88,4.16,6.53,5.0,21.0,6.0,5.0,16.0,2.0,6.0,16.0,19.0,4.0,5.0,3.0,2.0
4,e,5.85,3.37,8.36,20.0,10.0,19.0,5.0,3.0,2.0,19.0,16.0,19.0,4.0,5.0,6.0,0.0


In [140]:
print(f'train set shape: {test.shape}')
test.head()

train set shape: (2077964, 16)


Unnamed: 0,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,8.64,11.13,17.12,20.0,17.0,12.0,16.0,0.0,2.0,19.0,16.0,19.0,16.0,6.0,3.0,0.0
1,6.9,1.27,10.75,13.0,17.0,13.0,4.0,0.0,2.0,21.0,16.0,12.0,5.0,5.0,3.0,0.0
2,2.0,6.18,3.14,1.0,6.0,12.0,4.0,0.0,2.0,12.0,16.0,12.0,5.0,5.0,3.0,1.0
3,3.47,4.98,8.51,20.0,17.0,12.0,4.0,16.0,2.0,12.0,16.0,19.0,16.0,22.0,3.0,2.0
4,6.17,6.73,13.7,20.0,7.0,21.0,4.0,14.0,2.0,21.0,16.0,21.0,16.0,5.0,3.0,2.0


In [141]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['class'] = le.fit_transform(train['class'])

In [142]:
y = train['class']
X = train.drop('class', axis = 1)

In [143]:
from sklearn.model_selection import train_test_split

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [145]:
from sklearn.metrics import matthews_corrcoef

def mcc_metric(y_pred, dmatrix):
    y_true = dmatrix.get_label()
    y_pred = (y_pred > 0.5).astype(int) 
    mcc = matthews_corrcoef(y_true, y_pred)
    return 'mcc', mcc

In [146]:
from sklearn.metrics import matthews_corrcoef
from xgboost import XGBClassifier
import gc

model = XGBClassifier(

    colsample_bytree = 0.6,
    max_depth = 14,
    min_child_weight = 7,
    random_state = 42,
    n_estimators = 200,
)

In [147]:
XGB = model.fit(
    X_train, 
    y_train, 
    eval_set = [(X_test, y_test)],
    eval_metric = mcc_metric)



[0]	validation_0-logloss:0.45920	validation_0-mcc:0.93869
[1]	validation_0-logloss:0.33348	validation_0-mcc:0.95752
[2]	validation_0-logloss:0.25550	validation_0-mcc:0.96200
[3]	validation_0-logloss:0.20263	validation_0-mcc:0.96364
[4]	validation_0-logloss:0.15975	validation_0-mcc:0.97007
[5]	validation_0-logloss:0.12680	validation_0-mcc:0.97365
[6]	validation_0-logloss:0.10706	validation_0-mcc:0.97468
[7]	validation_0-logloss:0.09036	validation_0-mcc:0.97542
[8]	validation_0-logloss:0.08019	validation_0-mcc:0.97649
[9]	validation_0-logloss:0.07078	validation_0-mcc:0.97750
[10]	validation_0-logloss:0.06356	validation_0-mcc:0.97771
[11]	validation_0-logloss:0.05984	validation_0-mcc:0.97809
[12]	validation_0-logloss:0.05551	validation_0-mcc:0.97859
[13]	validation_0-logloss:0.05193	validation_0-mcc:0.97890
[14]	validation_0-logloss:0.04977	validation_0-mcc:0.97926
[15]	validation_0-logloss:0.04726	validation_0-mcc:0.97974
[16]	validation_0-logloss:0.04592	validation_0-mcc:0.97995
[17]	va

In [148]:
y_pred = XGB.predict(X_test)

In [149]:
score = matthews_corrcoef(y_test, y_pred)
print('MCC', score)

MCC 0.9822149793041021


In [150]:
test_pred_prob = XGB.predict(test)

In [151]:


test_pred_class = le.inverse_transform(test_pred_prob)

In [152]:
submission = pd.read_csv('sample_submission.csv')

submission['class'] = test_pred_class

In [153]:
submission.to_csv('submission.csv', index = False)

In [154]:
submission
# ? Handle Skenewws then simple imputer ?

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e


In [155]:
train

Unnamed: 0,class,cap-diameter,stem-height,stem-width,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,0,8.80,4.51,15.39,5.0,16.0,18.0,5.0,0.0,2.0,19.0,16.0,19.0,4.0,5.0,3.0,0.0
1,1,4.51,4.79,6.48,20.0,7.0,13.0,5.0,0.0,2.0,12.0,21.0,13.0,16.0,22.0,3.0,3.0
2,0,6.94,6.85,9.93,5.0,16.0,1.0,5.0,20.0,2.0,19.0,16.0,12.0,4.0,5.0,10.0,3.0
3,0,3.88,4.16,6.53,5.0,21.0,6.0,5.0,16.0,2.0,6.0,16.0,19.0,4.0,5.0,3.0,2.0
4,0,5.85,3.37,8.36,20.0,10.0,19.0,5.0,3.0,2.0,19.0,16.0,19.0,4.0,5.0,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,0,9.29,12.14,18.81,5.0,17.0,12.0,16.0,0.0,2.0,19.0,16.0,19.0,16.0,6.0,3.0,2.0
3116941,0,10.88,6.65,26.97,16.0,17.0,19.0,16.0,3.0,2.0,14.0,16.0,19.0,4.0,5.0,3.0,2.0
3116942,1,7.82,9.51,11.06,20.0,4.0,4.0,5.0,0.0,2.0,19.0,16.0,21.0,16.0,22.0,3.0,0.0
3116943,0,9.45,9.13,17.77,14.0,8.0,12.0,16.0,4.0,2.0,14.0,21.0,19.0,16.0,14.0,3.0,2.0
