In [225]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [226]:
path="mushrooms"
train = pd.read_csv(os.path.join(path, "train.csv"))
test = pd.read_csv(os.path.join(path, "test.csv"))
ss = pd.read_csv(os.path.join(path, "sample_submission.csv"))

In [227]:
train.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                object 
dtypes: float64(3), int64(1), object(18)
memory

In [228]:
train

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.80,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,e,9.29,f,,n,t,,,w,...,b,,w,u,w,t,g,,d,u
3116941,3116941,e,10.88,s,,w,t,d,c,p,...,,,w,,,f,f,,d,u
3116942,3116942,p,7.82,x,e,e,f,a,,w,...,,,y,,w,t,z,,d,a
3116943,3116943,e,9.45,p,i,n,t,e,,p,...,,y,w,,,t,p,,d,u


In [229]:
train.isna().sum() / len(train) * 100

id                       0.000000
class                    0.000000
cap-diameter             0.000128
cap-shape                0.001283
cap-surface             21.528227
cap-color                0.000385
does-bruise-or-bleed     0.000257
gill-attachment         16.809280
gill-spacing            40.373988
gill-color               0.001829
stem-height              0.000000
stem-width               0.000000
stem-root               88.452732
stem-surface            63.551362
stem-color               0.001219
veil-type               94.884350
veil-color              87.936970
has-ring                 0.000770
ring-type                4.134818
spore-print-color       91.425482
habitat                  0.001444
season                   0.000000
dtype: float64

In [230]:
train["class"].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

In [231]:
train.drop(["id", "stem-root", "stem-surface", "veil-type", "veil-color", "spore-print-color"], axis=1, inplace=True)

In [232]:
y = (train["class"] == "p").astype("int8").values
x = train.drop("class", axis=1)

In [233]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [234]:
info = {}

In [235]:
x_train

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
1249285,6.11,b,y,w,f,,,k,12.72,16.36,w,f,f,d,a
1952332,10.42,s,t,w,t,d,c,o,6.73,17.74,w,f,f,d,a
2912806,1.86,f,i,l,f,a,,w,3.63,2.46,w,f,f,d,u
136822,8.65,s,d,w,f,s,c,w,7.82,28.50,w,f,f,d,u
552456,4.86,x,y,g,f,s,,y,5.57,11.25,w,f,f,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,p,,n,f,p,,w,9.34,27.54,n,f,f,d,a
963395,7.39,x,s,b,f,d,d,b,5.99,12.80,w,f,f,d,w
2215104,1.94,b,,g,f,a,,g,5.41,2.12,g,f,f,d,a
1484405,3.58,x,s,w,f,d,c,w,3.47,8.04,w,f,f,d,u


In [236]:
m = x_train["cap-diameter"].mean()
info["cap-diameter_mean"] = m

x_train["cap-diameter"].fillna(m, inplace=True)

x_train["cap-diameter"] = x_train["cap-diameter"].astype("float32")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["cap-diameter"].fillna(m, inplace=True)


In [237]:
train["cap-shape"].value_counts()[:15]

cap-shape
x    1436026
f     676238
s     365146
b     318646
o     108835
p     106967
c     104520
d         65
e         60
n         41
t         36
w         36
g         34
y         33
r         32
Name: count, dtype: int64

In [238]:
info["cap-shape_val"] = "x"
x_train["cap-shape"].fillna("x", inplace=True)
info["cap_shape_keys"] = list(x_train["cap-shape"].value_counts()[:4].index)
for k in info["cap_shape_keys"]:
    x_train[f"cap_shape_{k}"] = (x_train["cap-shape"] == k).astype("int8")
x_train[f"cap_shape_other"] = (~x_train["cap-shape"].isin(info["cap_shape_keys"])).astype("int8")
x_train.drop("cap-shape", axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["cap-shape"].fillna("x", inplace=True)


In [239]:
x_train

Unnamed: 0,cap-diameter,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,cap_shape_x,cap_shape_f,cap_shape_s,cap_shape_b,cap_shape_other
1249285,6.11,y,w,f,,,k,12.72,16.36,w,f,f,d,a,0,0,0,1,0
1952332,10.42,t,w,t,d,c,o,6.73,17.74,w,f,f,d,a,0,0,1,0,0
2912806,1.86,i,l,f,a,,w,3.63,2.46,w,f,f,d,u,0,1,0,0,0
136822,8.65,d,w,f,s,c,w,7.82,28.50,w,f,f,d,u,0,0,1,0,0
552456,4.86,y,g,f,s,,y,5.57,11.25,w,f,f,d,u,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,,n,f,p,,w,9.34,27.54,n,f,f,d,a,0,0,0,0,1
963395,7.39,s,b,f,d,d,b,5.99,12.80,w,f,f,d,w,1,0,0,0,0
2215104,1.94,,g,f,a,,g,5.41,2.12,g,f,f,d,a,0,0,0,1,0
1484405,3.58,s,w,f,d,c,w,3.47,8.04,w,f,f,d,u,1,0,0,0,0


In [240]:
x_train["cap-surface"].isna().sum()

np.int64(502664)

In [241]:
cap_surface_dict = x_train["cap-surface"].value_counts(normalize=True)[:11]
cap_surface_dict = cap_surface_dict / cap_surface_dict.sum()
info["cap_surface_dict"] = cap_surface_dict
cap_surface_dict

cap-surface
t    0.188261
s    0.157622
y    0.134050
h    0.116445
g    0.107736
d    0.084486
k    0.052713
e    0.048919
i    0.046416
w    0.045012
l    0.018341
Name: proportion, dtype: float64

In [242]:
def cp(x):
    if isinstance(x, float) and np.isnan(x):
        return np.random.choice(cap_surface_dict.index, p=cap_surface_dict.values)
    elif np.isin(x, cap_surface_dict.index):
        return x
    else:
        return "other"

In [243]:
x_train["cap-surface-clean"]=x_train["cap-surface"].apply(cp)

In [244]:
x_train["cap-color"].isna().sum()

np.int64(9)

In [245]:
x_train["cap-color"].value_counts()[:15]

cap-color
n    1019646
y     290275
w     284478
g     157839
e     147888
o     134231
p      69112
r      58686
u      54864
b      46040
k      44901
l      29467
d         41
f         40
s         33
Name: count, dtype: int64

In [246]:
cap_color_dict = x_train["cap-color"].value_counts(normalize=True)[:6]
cap_color_dict = cap_color_dict / cap_color_dict.sum()
info["cap_color_dict"] = cap_color_dict
def cc(x):
    if isinstance(x, float) and np.isnan(x):
        return np.random.choice(cap_color_dict.index, p=cap_color_dict.values)
    elif np.isin(x, cap_color_dict.index):
        return x
    else:
        return "other"

In [247]:
x_train["cap-color-clean"]=x_train["cap-color"].apply(cp)

In [248]:
x_train

Unnamed: 0,cap-diameter,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,...,ring-type,habitat,season,cap_shape_x,cap_shape_f,cap_shape_s,cap_shape_b,cap_shape_other,cap-surface-clean,cap-color-clean
1249285,6.11,y,w,f,,,k,12.72,16.36,w,...,f,d,a,0,0,0,1,0,y,w
1952332,10.42,t,w,t,d,c,o,6.73,17.74,w,...,f,d,a,0,0,1,0,0,t,w
2912806,1.86,i,l,f,a,,w,3.63,2.46,w,...,f,d,u,0,1,0,0,0,i,l
136822,8.65,d,w,f,s,c,w,7.82,28.50,w,...,f,d,u,0,0,1,0,0,d,w
552456,4.86,y,g,f,s,,y,5.57,11.25,w,...,f,d,u,1,0,0,0,0,y,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,,n,f,p,,w,9.34,27.54,n,...,f,d,a,0,0,0,0,1,w,other
963395,7.39,s,b,f,d,d,b,5.99,12.80,w,...,f,d,w,1,0,0,0,0,s,other
2215104,1.94,,g,f,a,,g,5.41,2.12,g,...,f,d,a,0,0,0,1,0,i,g
1484405,3.58,s,w,f,d,c,w,3.47,8.04,w,...,f,d,u,1,0,0,0,0,s,w


In [249]:
x_train.drop(["cap-surface","cap-color"],axis=1,inplace=True)

In [250]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,cap_shape_x,cap_shape_f,cap_shape_s,cap_shape_b,cap_shape_other,cap-surface-clean,cap-color-clean
1249285,6.11,f,,,k,12.72,16.36,w,f,f,d,a,0,0,0,1,0,y,w
1952332,10.42,t,d,c,o,6.73,17.74,w,f,f,d,a,0,0,1,0,0,t,w
2912806,1.86,f,a,,w,3.63,2.46,w,f,f,d,u,0,1,0,0,0,i,l
136822,8.65,f,s,c,w,7.82,28.50,w,f,f,d,u,0,0,1,0,0,d,w
552456,4.86,f,s,,y,5.57,11.25,w,f,f,d,u,1,0,0,0,0,y,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,f,p,,w,9.34,27.54,n,f,f,d,a,0,0,0,0,1,w,other
963395,7.39,f,d,d,b,5.99,12.80,w,f,f,d,w,1,0,0,0,0,s,other
2215104,1.94,f,a,,g,5.41,2.12,g,f,f,d,a,0,0,0,1,0,i,g
1484405,3.58,f,d,c,w,3.47,8.04,w,f,f,d,u,1,0,0,0,0,s,w


In [251]:
x_train["does-bruise-or-bleed"].isna().sum()

np.int64(7)

In [252]:
x_train.columns

Index(['cap-diameter', 'does-bruise-or-bleed', 'gill-attachment',
       'gill-spacing', 'gill-color', 'stem-height', 'stem-width', 'stem-color',
       'has-ring', 'ring-type', 'habitat', 'season', 'cap_shape_x',
       'cap_shape_f', 'cap_shape_s', 'cap_shape_b', 'cap_shape_other',
       'cap-surface-clean', 'cap-color-clean'],
      dtype='object')

In [253]:
most_freq = x_train["does-bruise-or-bleed"].mode()[0]
x_train["does-bruise-or-bleed"].fillna(most_freq, inplace=True)
info["does-bruise-or-bleed_most_freq"] = most_freq
x_train["does-bruise-or-bleed"] = (x_train["does-bruise-or-bleed"] == "t").astype("int8")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["does-bruise-or-bleed"].fillna(most_freq, inplace=True)


In [254]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,cap_shape_x,cap_shape_f,cap_shape_s,cap_shape_b,cap_shape_other,cap-surface-clean,cap-color-clean
1249285,6.11,0,,,k,12.72,16.36,w,f,f,d,a,0,0,0,1,0,y,w
1952332,10.42,1,d,c,o,6.73,17.74,w,f,f,d,a,0,0,1,0,0,t,w
2912806,1.86,0,a,,w,3.63,2.46,w,f,f,d,u,0,1,0,0,0,i,l
136822,8.65,0,s,c,w,7.82,28.50,w,f,f,d,u,0,0,1,0,0,d,w
552456,4.86,0,s,,y,5.57,11.25,w,f,f,d,u,1,0,0,0,0,y,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,9.34,27.54,n,f,f,d,a,0,0,0,0,1,w,other
963395,7.39,0,d,d,b,5.99,12.80,w,f,f,d,w,1,0,0,0,0,s,other
2215104,1.94,0,a,,g,5.41,2.12,g,f,f,d,a,0,0,0,1,0,i,g
1484405,3.58,0,d,c,w,3.47,8.04,w,f,f,d,u,1,0,0,0,0,s,w


In [255]:
x_train["gill-attachment"].value_counts()[:30]

gill-attachment
a            484150
d            441860
x            270491
e            226407
s            221743
p            209511
f             90209
c                56
u                38
w                28
k                27
t                26
y                24
i                20
o                16
b                16
m                16
g                15
h                14
n                12
l                11
r                 8
season            5
z                 5
1.32              1
does None         1
13.15             1
13.94             1
3.91              1
p p               1
Name: count, dtype: int64

In [256]:
gill_att_dict = x_train["gill-attachment"].value_counts(normalize=True)[:6]
gill_att_dict = gill_att_dict/ gill_att_dict.sum()
info["gill-attachment-dict"] = gill_att_dict
def ga(x):
    if isinstance(x, float) and np.isnan(x):
        return np.random.choice(gill_att_dict.index, p=gill_att_dict.values)
    elif np.isin(x, gill_att_dict.index):
        return x
    else:
        return "other"

In [257]:
x_train["gill-attachment-clean"]=x_train["gill-attachment"].apply(ga)

In [258]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,cap_shape_x,cap_shape_f,cap_shape_s,cap_shape_b,cap_shape_other,cap-surface-clean,cap-color-clean,gill-attachment-clean
1249285,6.11,0,,,k,12.72,16.36,w,f,f,d,a,0,0,0,1,0,y,w,x
1952332,10.42,1,d,c,o,6.73,17.74,w,f,f,d,a,0,0,1,0,0,t,w,d
2912806,1.86,0,a,,w,3.63,2.46,w,f,f,d,u,0,1,0,0,0,i,l,a
136822,8.65,0,s,c,w,7.82,28.50,w,f,f,d,u,0,0,1,0,0,d,w,s
552456,4.86,0,s,,y,5.57,11.25,w,f,f,d,u,1,0,0,0,0,y,g,s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,9.34,27.54,n,f,f,d,a,0,0,0,0,1,w,other,p
963395,7.39,0,d,d,b,5.99,12.80,w,f,f,d,w,1,0,0,0,0,s,other,d
2215104,1.94,0,a,,g,5.41,2.12,g,f,f,d,a,0,0,0,1,0,i,g,a
1484405,3.58,0,d,c,w,3.47,8.04,w,f,f,d,u,1,0,0,0,0,s,w,d


In [259]:
x_train["gill-spacing"].isna().sum()


np.int64(943723)

In [260]:
x_train["gill-spacing"].value_counts()[:20]

gill-spacing
c       997939
d       306171
f        89771
e           14
s           13
a           13
b           11
p            6
x            5
t            4
l            3
k            3
h            3
0            2
y            2
1.88         1
i            1
4.09         1
6.67         1
3.81         1
Name: count, dtype: int64

In [261]:
gill_sp_dict = x_train["gill-spacing"].value_counts(normalize=True)[:6]
gill_sp_dict = gill_sp_dict/ gill_sp_dict.sum()
info["gill-spacing-dict"] = gill_sp_dict
def gs(x):
    if isinstance(x, float) and np.isnan(x):
        return np.random.choice(gill_sp_dict.index, p=gill_sp_dict.values)
    elif np.isin(x, gill_sp_dict.index):
        return x
    else:
        return "other"

In [262]:
x_train["gill-spacing-clean"]=x_train["gill-spacing"].apply(gs)

In [263]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,...,season,cap_shape_x,cap_shape_f,cap_shape_s,cap_shape_b,cap_shape_other,cap-surface-clean,cap-color-clean,gill-attachment-clean,gill-spacing-clean
1249285,6.11,0,,,k,12.72,16.36,w,f,f,...,a,0,0,0,1,0,y,w,x,c
1952332,10.42,1,d,c,o,6.73,17.74,w,f,f,...,a,0,0,1,0,0,t,w,d,c
2912806,1.86,0,a,,w,3.63,2.46,w,f,f,...,u,0,1,0,0,0,i,l,a,d
136822,8.65,0,s,c,w,7.82,28.50,w,f,f,...,u,0,0,1,0,0,d,w,s,c
552456,4.86,0,s,,y,5.57,11.25,w,f,f,...,u,1,0,0,0,0,y,g,s,c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,9.34,27.54,n,f,f,...,a,0,0,0,0,1,w,other,p,c
963395,7.39,0,d,d,b,5.99,12.80,w,f,f,...,w,1,0,0,0,0,s,other,d,d
2215104,1.94,0,a,,g,5.41,2.12,g,f,f,...,a,0,0,0,1,0,i,g,a,c
1484405,3.58,0,d,c,w,3.47,8.04,w,f,f,...,u,1,0,0,0,0,s,w,d,c


In [264]:
x_train["gill-color"].isna().sum()

np.int64(35)

In [265]:
gill_c_dict = x_train["gill-color"].value_counts(normalize=True)[:6]
gill_c_dict = gill_c_dict/ gill_c_dict.sum()
info["gill-color-dict"] = gill_c_dict
def gc(x):
    if isinstance(x, float) and np.isnan(x):
        return np.random.choice(gill_c_dict.index, p=gill_c_dict.values)
    elif np.isin(x, gill_c_dict.index):
        return x
    else:
        return "other"

In [266]:
x_train["gill-color-clean"]=x_train["gill-color"].apply(gc)

In [267]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,...,cap_shape_x,cap_shape_f,cap_shape_s,cap_shape_b,cap_shape_other,cap-surface-clean,cap-color-clean,gill-attachment-clean,gill-spacing-clean,gill-color-clean
1249285,6.11,0,,,k,12.72,16.36,w,f,f,...,0,0,0,1,0,y,w,x,c,other
1952332,10.42,1,d,c,o,6.73,17.74,w,f,f,...,0,0,1,0,0,t,w,d,c,o
2912806,1.86,0,a,,w,3.63,2.46,w,f,f,...,0,1,0,0,0,i,l,a,d,w
136822,8.65,0,s,c,w,7.82,28.50,w,f,f,...,0,0,1,0,0,d,w,s,c,w
552456,4.86,0,s,,y,5.57,11.25,w,f,f,...,1,0,0,0,0,y,g,s,c,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,9.34,27.54,n,f,f,...,0,0,0,0,1,w,other,p,c,w
963395,7.39,0,d,d,b,5.99,12.80,w,f,f,...,1,0,0,0,0,s,other,d,d,other
2215104,1.94,0,a,,g,5.41,2.12,g,f,f,...,0,0,0,1,0,i,g,a,c,g
1484405,3.58,0,d,c,w,3.47,8.04,w,f,f,...,1,0,0,0,0,s,w,d,c,w


In [290]:
train.drop(["gill-attachment", "gill-spacing", "gill-color"], axis=1, inplace=True)

KeyError: "['gill-attachment', 'gill-spacing'] not found in axis"

In [269]:
x_train["stem-height"].isna().sum()

np.int64(0)

In [270]:
x_train["stem-height"]=x_train["stem-height"]/x_train["stem-height"].sum()
info["stem-height_dict"]=x_train["stem-height"].values

In [271]:
x_train["stem-width"].isna().sum()

np.int64(0)

In [272]:
x_train["stem-width"]=x_train["stem-width"]/x_train["stem-width"].sum()
info["stem-width_dict"]=x_train["stem-width"].values

In [273]:
x_train["stem-color"].isna().sum()

np.int64(28)

In [274]:
x_train["stem-color"].value_counts()[:15]

stem-color
w    896774
n    753274
y    280796
g     98636
o     83667
e     77687
u     50110
p     41116
k     25286
r     16724
l      7454
b      5522
f       433
s        38
t        32
Name: count, dtype: int64

In [275]:
info["stem-color-dict"] = "w"
x_train["stem-color"].fillna("w", inplace=True)
info["stem-color-dict"] = list(x_train["stem-color"].value_counts()[:3].index)
for k in info["stem-color-dict"]:
    x_train[f"stem-color-{k}"] = (x_train["stem-color"] == k).astype("int8")
x_train[f"stem-color"] = (~x_train["stem-color"].isin(info["stem-color-dict"])).astype("int8")
x_train.drop("stem-color", axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["stem-color"].fillna("w", inplace=True)


In [276]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,has-ring,ring-type,habitat,...,cap_shape_b,cap_shape_other,cap-surface-clean,cap-color-clean,gill-attachment-clean,gill-spacing-clean,gill-color-clean,stem-color-w,stem-color-n,stem-color-y
1249285,6.11,0,,,k,8.571480e-07,6.271031e-07,f,f,d,...,1,0,y,w,x,c,other,1,0,0
1952332,10.42,1,d,c,o,4.535067e-07,6.800005e-07,f,f,d,...,0,0,t,w,d,c,o,1,0,0
2912806,1.86,0,a,,w,2.446106e-07,9.429545e-08,f,f,d,...,0,0,i,l,a,d,w,1,0,0
136822,8.65,0,s,c,w,5.269573e-07,1.092447e-06,f,f,d,...,0,0,d,w,s,c,w,1,0,0
552456,4.86,0,s,,y,3.753392e-07,4.312292e-07,f,f,d,...,0,0,y,g,s,c,y,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,6.293838e-07,1.055649e-06,f,f,d,...,0,1,w,other,p,c,w,0,1,0
963395,7.39,0,d,d,b,4.036412e-07,4.906430e-07,f,f,d,...,0,0,s,other,d,d,other,1,0,0
2215104,1.94,0,a,,g,3.645574e-07,8.126275e-08,f,f,d,...,1,0,i,g,a,c,g,0,0,0
1484405,3.58,0,d,c,w,2.338289e-07,3.081851e-07,f,f,d,...,0,0,s,w,d,c,w,1,0,0


In [277]:
has_ring_mf = x_train["has-ring"].mode()[0]
x_train["has-ring"].fillna(has_ring_mf, inplace=True)
info["has-ring_most_freq"] = has_ring_mf
x_train["has-ring_flag"] = (x_train["has-ring"] == "t").astype("int8")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["has-ring"].fillna(has_ring_mf, inplace=True)


In [278]:
x_train["ring-type"].fillna("n", inplace=True)
info["ring-type_most_freq"] = "n"
info["ring-type_keys"] = list(x_train["ring-type"].value_counts()[:3].index)
for k in info["ring-type_keys"]:
    x_train[f"ring-type_{k}"] = (x_train["ring-type"] == k).astype("int8")
x_train["ring-type_other"] = (~x_train["ring-type"].isin(info["ring-type_keys"])).astype("int8")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["ring-type"].fillna("n", inplace=True)


In [279]:
# Train
most_freq = x_train["habitat"].mode()[0]  
x_train["habitat"].fillna(most_freq, inplace=True)
info["habitat_most_freq"] = most_freq
info["habitat_keys"] = list(x_train["habitat"].value_counts()[:4].index)
for k in info["habitat_keys"]:
    x_train[f"habitat_{k}"] = (x_train["habitat"] == k).astype("int8")
x_train["habitat_other"] = (~x_train["habitat"].isin(info["habitat_keys"])).astype("int8")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["habitat"].fillna(most_freq, inplace=True)


In [280]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,has-ring,ring-type,habitat,...,has-ring_flag,ring-type_f,ring-type_n,ring-type_e,ring-type_other,habitat_d,habitat_g,habitat_l,habitat_m,habitat_other
1249285,6.11,0,,,k,8.571480e-07,6.271031e-07,f,f,d,...,0,1,0,0,0,1,0,0,0,0
1952332,10.42,1,d,c,o,4.535067e-07,6.800005e-07,f,f,d,...,0,1,0,0,0,1,0,0,0,0
2912806,1.86,0,a,,w,2.446106e-07,9.429545e-08,f,f,d,...,0,1,0,0,0,1,0,0,0,0
136822,8.65,0,s,c,w,5.269573e-07,1.092447e-06,f,f,d,...,0,1,0,0,0,1,0,0,0,0
552456,4.86,0,s,,y,3.753392e-07,4.312292e-07,f,f,d,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,6.293838e-07,1.055649e-06,f,f,d,...,0,1,0,0,0,1,0,0,0,0
963395,7.39,0,d,d,b,4.036412e-07,4.906430e-07,f,f,d,...,0,1,0,0,0,1,0,0,0,0
2215104,1.94,0,a,,g,3.645574e-07,8.126275e-08,f,f,d,...,0,1,0,0,0,1,0,0,0,0
1484405,3.58,0,d,c,w,2.338289e-07,3.081851e-07,f,f,d,...,0,1,0,0,0,1,0,0,0,0


In [281]:
train.drop(["has-ring", "ring-type", "habitat"], axis=1, inplace=True)

In [282]:
most_freq = x_train["gill-attachment"].mode()[0]
x_train["gill-attachment"].fillna(most_freq, inplace=True)
info["gill-attachment_most_freq"] = most_freq
info["gill-attachment_keys"] = list(x_train["gill-attachment"].value_counts()[:3].index)

for k in info["gill-attachment_keys"]:
    x_train[f"gill-attachment_{k}"] = (x_train["gill-attachment"] == k).astype("int8")

x_train["gill-attachment_other"] = (~x_train["gill-attachment"].isin(info["gill-attachment_keys"])).astype("int8")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train["gill-attachment"].fillna(most_freq, inplace=True)


In [283]:
info["gill-spacing_keys"] = list(x_train["gill-spacing"].value_counts()[:2].index)

for k in info["gill-spacing_keys"]:
    x_train[f"gill-spacing_{k}"] = (x_train["gill-spacing"] == k).astype("int8")

x_train["gill-spacing_other"] = (~x_train["gill-spacing"].isin(info["gill-spacing_keys"])).astype("int8")


In [284]:
info["gill-color_keys"] = list(x_train["gill-color"].value_counts()[:4].index)

for k in info["gill-color_keys"]:
    x_train[f"gill-color_{k}"] = (x_train["gill-color"] == k).astype("int8")

x_train["gill-color_other"] = (~x_train["gill-color"].isin(info["gill-color_keys"])).astype("int8")


In [285]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,has-ring,ring-type,habitat,...,gill-attachment_x,gill-attachment_other,gill-spacing_c,gill-spacing_d,gill-spacing_other,gill-color_w,gill-color_n,gill-color_y,gill-color_p,gill-color_other
1249285,6.11,0,a,,k,8.571480e-07,6.271031e-07,f,f,d,...,0,0,0,0,1,0,0,0,0,1
1952332,10.42,1,d,c,o,4.535067e-07,6.800005e-07,f,f,d,...,0,0,1,0,0,0,0,0,0,1
2912806,1.86,0,a,,w,2.446106e-07,9.429545e-08,f,f,d,...,0,0,0,0,1,1,0,0,0,0
136822,8.65,0,s,c,w,5.269573e-07,1.092447e-06,f,f,d,...,0,1,1,0,0,1,0,0,0,0
552456,4.86,0,s,,y,3.753392e-07,4.312292e-07,f,f,d,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,6.293838e-07,1.055649e-06,f,f,d,...,0,1,0,0,1,1,0,0,0,0
963395,7.39,0,d,d,b,4.036412e-07,4.906430e-07,f,f,d,...,0,0,0,1,0,0,0,0,0,1
2215104,1.94,0,a,,g,3.645574e-07,8.126275e-08,f,f,d,...,0,0,0,0,1,0,0,0,0,1
1484405,3.58,0,d,c,w,2.338289e-07,3.081851e-07,f,f,d,...,0,0,1,0,0,1,0,0,0,0


In [288]:
x_train.columns


Index(['cap-diameter', 'does-bruise-or-bleed', 'gill-attachment',
       'gill-spacing', 'gill-color', 'stem-height', 'stem-width', 'has-ring',
       'ring-type', 'habitat', 'season', 'cap_shape_x', 'cap_shape_f',
       'cap_shape_s', 'cap_shape_b', 'cap_shape_other', 'cap-surface-clean',
       'cap-color-clean', 'gill-attachment-clean', 'gill-spacing-clean',
       'gill-color-clean', 'stem-color-w', 'stem-color-n', 'stem-color-y',
       'has-ring_flag', 'ring-type_f', 'ring-type_n', 'ring-type_e',
       'ring-type_other', 'habitat_d', 'habitat_g', 'habitat_l', 'habitat_m',
       'habitat_other', 'gill-attachment_a', 'gill-attachment_d',
       'gill-attachment_x', 'gill-attachment_other', 'gill-spacing_c',
       'gill-spacing_d', 'gill-spacing_other', 'gill-color_w', 'gill-color_n',
       'gill-color_y', 'gill-color_p', 'gill-color_other'],
      dtype='object')

In [292]:
x_train["season"].isna().sum()

np.int64(0)

In [293]:
info["season_keys"] = list(x_train["season"].value_counts()[:4].index)
for k in info["season_keys"]:
    x_train[f"season_{k}"] = (x_train["season"] == k).astype("int8")
x_train["season_other"] = (~x_train["season"].isin(info["season_keys"])).astype("int8")
x_train.drop("season", axis=1, inplace=True)


In [294]:
x_train

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,has-ring,ring-type,habitat,...,gill-color_w,gill-color_n,gill-color_y,gill-color_p,gill-color_other,season_a,season_u,season_w,season_s,season_other
1249285,6.11,0,a,,k,8.571480e-07,6.271031e-07,f,f,d,...,0,0,0,0,1,1,0,0,0,0
1952332,10.42,1,d,c,o,4.535067e-07,6.800005e-07,f,f,d,...,0,0,0,0,1,1,0,0,0,0
2912806,1.86,0,a,,w,2.446106e-07,9.429545e-08,f,f,d,...,1,0,0,0,0,0,1,0,0,0
136822,8.65,0,s,c,w,5.269573e-07,1.092447e-06,f,f,d,...,1,0,0,0,0,0,1,0,0,0
552456,4.86,0,s,,y,3.753392e-07,4.312292e-07,f,f,d,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,8.96,0,p,,w,6.293838e-07,1.055649e-06,f,f,d,...,1,0,0,0,0,1,0,0,0,0
963395,7.39,0,d,d,b,4.036412e-07,4.906430e-07,f,f,d,...,0,0,0,0,1,0,0,1,0,0
2215104,1.94,0,a,,g,3.645574e-07,8.126275e-08,f,f,d,...,0,0,0,0,1,1,0,0,0,0
1484405,3.58,0,d,c,w,2.338289e-07,3.081851e-07,f,f,d,...,1,0,0,0,0,0,1,0,0,0


In [295]:
x_test.columns

Index(['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-color', 'has-ring', 'ring-type',
       'habitat', 'season'],
      dtype='object')

In [299]:
x_test["cap-diameter"] /= info["cap-diameter_mean"]



In [300]:
x_test

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
430964,0.107685,b,h,n,f,,,k,9.95,6.09,w,t,,g,w
1907082,0.103418,f,,n,f,x,c,g,5.70,3.98,w,f,f,d,u
2626732,0.155126,f,i,n,f,e,c,n,4.02,12.81,w,t,r,w,u
38950,0.111199,s,,n,t,d,c,b,6.03,6.37,n,f,f,d,u
96583,0.304731,x,k,n,t,p,,o,7.60,29.43,y,f,f,d,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304798,0.138309,x,s,b,f,x,c,w,6.31,8.18,n,f,f,l,a
1796005,0.258544,s,,w,t,d,c,w,5.93,21.42,w,f,f,d,u
104462,0.044680,b,g,w,f,,c,n,6.81,3.24,n,f,f,m,u
52439,0.055976,b,g,w,f,,c,n,6.22,3.16,n,f,f,g,a


In [301]:
for k in info["cap_shape_keys"]:
    x_test[f"cap_shape_{k}"] = (x_test["cap-shape"] == k).astype("int8")
x_test["cap_shape_other"] = (
    ~x_test["cap-shape"].isin(info["cap_shape_keys"])
).astype("int8")
x_test.drop("cap-shape", axis=1, inplace=True)


In [305]:
def cp_test(x, cap_surface_dict):
    if pd.isna(x):
        return cap_surface_dict.index[0] 
    elif x in cap_surface_dict.index:
        return x
    else:
        return "other"
        

In [315]:
x_test["cap-surface-clean"] = x_test["cap-surface"].apply(lambda x: cp_test(x, info["cap_surface_dict"]))
for k in info["cap_surface_dict"]:
    x_test[f"cap_surface_{k}"] = (x_test["cap-surface"] == k).astype("int8")
x_test["cap_surface_other"] = (~x_test["cap-surface"].isin(info["cap_surface_dict"])).astype("int8")
x_test.drop("cap-surface", axis=1, inplace=True)



In [316]:
x_test

Unnamed: 0,cap-diameter,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,...,cap_surface_0.13404993499892345,cap_surface_0.11644540863465035,cap_surface_0.10773553691979385,cap_surface_0.08448559210938714,cap_surface_0.052712738848295394,cap_surface_0.048918965324582946,cap_surface_0.04641594693077726,cap_surface_0.04501235974348422,cap_surface_0.018340932690499485,cap_surface_other
430964,0.107685,n,f,,,k,9.95,6.09,w,t,...,0,0,0,0,0,0,0,0,0,1
1907082,0.103418,n,f,x,c,g,5.70,3.98,w,f,...,0,0,0,0,0,0,0,0,0,1
2626732,0.155126,n,f,e,c,n,4.02,12.81,w,t,...,0,0,0,0,0,0,0,0,0,1
38950,0.111199,n,t,d,c,b,6.03,6.37,n,f,...,0,0,0,0,0,0,0,0,0,1
96583,0.304731,n,t,p,,o,7.60,29.43,y,f,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304798,0.138309,b,f,x,c,w,6.31,8.18,n,f,...,0,0,0,0,0,0,0,0,0,1
1796005,0.258544,w,t,d,c,w,5.93,21.42,w,f,...,0,0,0,0,0,0,0,0,0,1
104462,0.044680,w,f,,c,n,6.81,3.24,n,f,...,0,0,0,0,0,0,0,0,0,1
52439,0.055976,w,f,,c,n,6.22,3.16,n,f,...,0,0,0,0,0,0,0,0,0,1


In [318]:
def cc_test(x, cap_color_dict):
    if pd.isna(x):
        return cap_color_dict.index[0] 
    elif x in cap_color_dict.index:
        return x
    else:
        return "other"
        

In [319]:
x_test["cap-color-clean"] = x_test["cap-color"].apply(lambda x: cc_test(x, info["cap_color_dict"]))
for k in info["cap_color_dict"]:
    x_test[f"cap_color_{k}"] = (x_test["cap-color"] == k).astype("int8")
x_test["cap_color_other"] = (~x_test["cap-color"].isin(info["cap_color_dict"])).astype("int8")
x_test.drop("cap-color", axis=1, inplace=True)


In [320]:
x_test.columns

Index(['cap-diameter', 'does-bruise-or-bleed', 'gill-attachment',
       'gill-spacing', 'gill-color', 'stem-height', 'stem-width', 'stem-color',
       'has-ring', 'ring-type', 'habitat', 'season', 'cap_shape_x',
       'cap_shape_f', 'cap_shape_s', 'cap_shape_b', 'cap_shape_other',
       'cap-surface-clean', 'cap_surface_0.18826056029020188',
       'cap_surface_0.15762202350940405', 'cap_surface_0.13404993499892345',
       'cap_surface_0.11644540863465035', 'cap_surface_0.10773553691979385',
       'cap_surface_0.08448559210938714', 'cap_surface_0.052712738848295394',
       'cap_surface_0.048918965324582946', 'cap_surface_0.04641594693077726',
       'cap_surface_0.04501235974348422', 'cap_surface_0.018340932690499485',
       'cap_surface_other', 'cap-color-clean', 'cap_color_0.5012129139575797',
       'cap_color_0.14268636232480336', 'cap_color_0.1398368133026799',
       'cap_color_0.0775866772646099', 'cap_color_0.07269520541379905',
       'cap_color_0.06598202773652805', '

In [322]:
x_test["does-bruise-or-bleed"].fillna(most_freq, inplace=True)
x_test["does-bruise-or-bleed"] = (x_test["does-bruise-or-bleed"] == "t").astype("int8")
x_test

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_test["does-bruise-or-bleed"].fillna(most_freq, inplace=True)


Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,...,cap_surface_0.018340932690499485,cap_surface_other,cap-color-clean,cap_color_0.5012129139575797,cap_color_0.14268636232480336,cap_color_0.1398368133026799,cap_color_0.0775866772646099,cap_color_0.07269520541379905,cap_color_0.06598202773652805,cap_color_other
430964,0.107685,0,,,k,9.95,6.09,w,t,,...,0,1,n,0,0,0,0,0,0,1
1907082,0.103418,0,x,c,g,5.70,3.98,w,f,f,...,0,1,n,0,0,0,0,0,0,1
2626732,0.155126,0,e,c,n,4.02,12.81,w,t,r,...,0,1,n,0,0,0,0,0,0,1
38950,0.111199,0,d,c,b,6.03,6.37,n,f,f,...,0,1,n,0,0,0,0,0,0,1
96583,0.304731,0,p,,o,7.60,29.43,y,f,f,...,0,1,n,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304798,0.138309,0,x,c,w,6.31,8.18,n,f,f,...,0,1,other,0,0,0,0,0,0,1
1796005,0.258544,0,d,c,w,5.93,21.42,w,f,f,...,0,1,w,0,0,0,0,0,0,1
104462,0.044680,0,,c,n,6.81,3.24,n,f,f,...,0,1,w,0,0,0,0,0,0,1
52439,0.055976,0,,c,n,6.22,3.16,n,f,f,...,0,1,w,0,0,0,0,0,0,1


In [331]:
def ga_test(x, gill_att_dict):
    if pd.isna(x):
        return gill_att_dict.index[0]  
    elif x in gill_att_dict.index:
        return x
    else:
        return "other"



In [339]:
x_test["gill-attachment"] = x_test["gill-attachment"].apply(lambda x: ga_test(x, gill_att_dict))
for k in info["gill-attachment-dict"]:
    x_test[f"gill_attachment_{k}"] = (x_test["gill-attachment"] == k).astype("int8")
x_test["gill-attachment_other"] = (~x_test["gill-attachment"].isin(info["gill-attachment-dict"])).astype("int8")
x_test.drop("gill-attachment", axis=1, inplace=True)


In [340]:
x_test

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,...,cap_color_0.06598202773652805,cap_color_other,gill-attachment-clean,gill_attachment_0.2611152639305519,gill_attachment_0.23830711663813625,gill_attachment_0.14588315368344298,gill_attachment_0.12210745339404001,gill_attachment_0.11959203133275302,gill_attachment_0.11299498102107584,gill-attachment_other
430964,0.107685,0,,k,9.95,6.09,w,t,,g,...,0,1,e,0,0,0,0,0,0,1
1907082,0.103418,0,c,g,5.70,3.98,w,f,f,d,...,0,1,x,0,0,0,0,0,0,1
2626732,0.155126,0,c,n,4.02,12.81,w,t,r,w,...,0,1,e,0,0,0,0,0,0,1
38950,0.111199,0,c,b,6.03,6.37,n,f,f,d,...,0,1,d,0,0,0,0,0,0,1
96583,0.304731,0,,o,7.60,29.43,y,f,f,d,...,0,1,p,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304798,0.138309,0,c,w,6.31,8.18,n,f,f,l,...,0,1,x,0,0,0,0,0,0,1
1796005,0.258544,0,c,w,5.93,21.42,w,f,f,d,...,0,1,d,0,0,0,0,0,0,1
104462,0.044680,0,c,n,6.81,3.24,n,f,f,m,...,0,1,a,0,0,0,0,0,0,1
52439,0.055976,0,c,n,6.22,3.16,n,f,f,g,...,0,1,s,0,0,0,0,0,0,1


In [341]:
info["gill-spacing-dict"] = gill_sp_dict        
def gs_test(x, gill_sp_dict):
    if pd.isna(x):
        return gill_sp_dict.index[0] 
    elif x in gill_sp_dict.index:
        return x
    else:
        return "other"
x_test["gill-spacing"] = x_test["gill-spacing"].apply(lambda x: gs_test(x, gill_sp_dict))
for k in info["gill-spacing-dict"]:
    x_test[f"gill_spacing_{k}"] = (x_test["gill-spacing"] == k).astype("int8")
x_test["gill-spacing_other"] = (~x_test["gill-spacing"].isin(info["gill-spacing-dict"])).astype("int8")
x_test.drop("gill-spacing", axis=1, inplace=True)                
        

In [342]:
x_test

Unnamed: 0,cap-diameter,does-bruise-or-bleed,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,...,gill_attachment_0.12210745339404001,gill_attachment_0.11959203133275302,gill_attachment_0.11299498102107584,gill-attachment_other,gill_spacing_0.7159222079300046,gill_spacing_0.21964731143300087,gill_spacing_0.06440178460615774,gill_spacing_1.0043610792864157e-05,gill_spacing_9.32621002194529e-06,gill-spacing_other
430964,0.107685,0,k,9.95,6.09,w,t,,g,w,...,0,0,0,1,0,0,0,0,0,1
1907082,0.103418,0,g,5.70,3.98,w,f,f,d,u,...,0,0,0,1,0,0,0,0,0,1
2626732,0.155126,0,n,4.02,12.81,w,t,r,w,u,...,0,0,0,1,0,0,0,0,0,1
38950,0.111199,0,b,6.03,6.37,n,f,f,d,u,...,0,0,0,1,0,0,0,0,0,1
96583,0.304731,0,o,7.60,29.43,y,f,f,d,u,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304798,0.138309,0,w,6.31,8.18,n,f,f,l,a,...,0,0,0,1,0,0,0,0,0,1
1796005,0.258544,0,w,5.93,21.42,w,f,f,d,u,...,0,0,0,1,0,0,0,0,0,1
104462,0.044680,0,n,6.81,3.24,n,f,f,m,u,...,0,0,0,1,0,0,0,0,0,1
52439,0.055976,0,n,6.22,3.16,n,f,f,g,a,...,0,0,0,1,0,0,0,0,0,1


In [344]:
info["gill-color-dict"] = gill_c_dict        
def gc_test(x, gill_c_dict):
    if pd.isna(x):
        return gill_c_dict.index[0] 
    elif x in gill_c_dict.index:
        return x
    else:
        return "other"
x_test["gill-color"] = x_test["gill-color"].apply(lambda x: gc_test(x, gill_c_dict))
for k in info["gill-color-dict"]:
    x_test[f"gill_color_{k}"] = (x_test["gill-color"] == k).astype("int8")
x_test["gill-color_other"] = (~x_test["gill-color"].isin(info["gill-color-dict"])).astype("int8")
x_test.drop("gill-color", axis=1, inplace=True)                
        

In [345]:
x_test

Unnamed: 0,cap-diameter,does-bruise-or-bleed,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,cap_shape_x,...,gill_spacing_1.0043610792864157e-05,gill_spacing_9.32621002194529e-06,gill-spacing_other,gill_color_0.35074476706114943,gill_color_0.20436448300345475,gill_color_0.17696026654557043,gill_color_0.1293609078204932,gill_color_0.07956174529142132,gill_color_0.059007830277910886,gill-color_other
430964,0.107685,0,9.95,6.09,w,t,,g,w,0,...,0,0,1,0,0,0,0,0,0,1
1907082,0.103418,0,5.70,3.98,w,f,f,d,u,0,...,0,0,1,0,0,0,0,0,0,1
2626732,0.155126,0,4.02,12.81,w,t,r,w,u,0,...,0,0,1,0,0,0,0,0,0,1
38950,0.111199,0,6.03,6.37,n,f,f,d,u,0,...,0,0,1,0,0,0,0,0,0,1
96583,0.304731,0,7.60,29.43,y,f,f,d,u,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304798,0.138309,0,6.31,8.18,n,f,f,l,a,1,...,0,0,1,0,0,0,0,0,0,1
1796005,0.258544,0,5.93,21.42,w,f,f,d,u,0,...,0,0,1,0,0,0,0,0,0,1
104462,0.044680,0,6.81,3.24,n,f,f,m,u,0,...,0,0,1,0,0,0,0,0,0,1
52439,0.055976,0,6.22,3.16,n,f,f,g,a,0,...,0,0,1,0,0,0,0,0,0,1
