In [36]:
!pip install numpy
!pip install pandas
!pip install seaborn
!pip install matplotlib
!pip install lightgbm



In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from lightgbm import LGBMClassifier

In [38]:
# Read the train and test CSVs, using the first column of each file as the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [39]:
# Preview the raw training frame (the notebook renders a truncated head/tail view).
train_df

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,e,8.80,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,e,9.29,f,,n,t,,,w,12.14,...,b,,w,u,w,t,g,,d,u
3116941,e,10.88,s,,w,t,d,c,p,6.65,...,,,w,,,f,f,,d,u
3116942,p,7.82,x,e,e,f,a,,w,9.51,...,,,y,,w,t,z,,d,a
3116943,e,9.45,p,i,n,t,e,,p,9.13,...,,y,w,,,t,p,,d,u


In [40]:
# Summarize column dtypes and memory usage of the training frame as loaded.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 object 
 1   cap-diameter          float64
 2   cap-shape             object 
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), object(18)
memory usage: 523.2+ MB


In [41]:
# Show all column names: the `class` target plus the 20 feature columns.
train_df.columns

Index(['class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')

In [42]:
# Names of the non-numeric feature columns (everything except the three float
# measurements and the `class` target), in the order they appear in the frame.
cat_feats = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'does-bruise-or-bleed',
    'gill-attachment',
    'gill-spacing',
    'gill-color',
    'stem-root',
    'stem-surface',
    'stem-color',
    'veil-type',
    'veil-color',
    'has-ring',
    'ring-type',
    'spore-print-color',
    'habitat',
    'season',
]

In [43]:
# For both frames, replace missing values in every categorical feature with the
# explicit label 'missing', then cast the column to pandas' `category` dtype.
for frame in (train_df, test_df):
    for feat in cat_feats:
        frame[feat] = frame[feat].fillna('missing').astype('category')

In [44]:
# Impute missing cap diameters with the TRAINING mean in both frames.
# The test set must receive the same transformation the model was fitted on;
# filling it with its own mean would apply a different imputation value at
# prediction time than the one used during training.
cap_diameter_mean = train_df['cap-diameter'].mean()
train_df['cap-diameter'] = train_df['cap-diameter'].fillna(cap_diameter_mean)
test_df['cap-diameter'] = test_df['cap-diameter'].fillna(cap_diameter_mean)

In [45]:
# Re-check dtypes and memory usage after the categorical cast and imputation.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype   
---  ------                -----   
 0   class                 object  
 1   cap-diameter          float64 
 2   cap-shape             category
 3   cap-surface           category
 4   cap-color             category
 5   does-bruise-or-bleed  category
 6   gill-attachment       category
 7   gill-spacing          category
 8   gill-color            category
 9   stem-height           float64 
 10  stem-width            float64 
 11  stem-root             category
 12  stem-surface          category
 13  stem-color            category
 14  veil-type             category
 15  veil-color            category
 16  has-ring              category
 17  ring-type             category
 18  spore-print-color     category
 19  habitat               category
 20  season                category
dtypes: category(17), float64(3), object(1)
memory usage: 169.5+

In [46]:
# Feature matrix: every column of the training frame except the `class` target.
x = train_df.drop(columns='class')
x

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,8.80,f,s,u,f,a,c,w,4.51,15.39,missing,missing,w,missing,missing,f,f,missing,d,a
1,4.51,x,h,o,f,a,c,n,4.79,6.48,missing,y,o,missing,missing,t,z,missing,d,w
2,6.94,f,s,b,f,x,c,w,6.85,9.93,missing,s,n,missing,missing,f,f,missing,l,w
3,3.88,f,y,g,f,s,missing,g,4.16,6.53,missing,missing,w,missing,missing,f,f,missing,d,u
4,5.85,x,l,w,f,d,missing,w,3.37,8.36,missing,missing,w,missing,missing,f,f,missing,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,9.29,f,missing,n,t,missing,missing,w,12.14,18.81,b,missing,w,u,w,t,g,missing,d,u
3116941,10.88,s,missing,w,t,d,c,p,6.65,26.97,missing,missing,w,missing,missing,f,f,missing,d,u
3116942,7.82,x,e,e,f,a,missing,w,9.51,11.06,missing,missing,y,missing,w,t,z,missing,d,a
3116943,9.45,p,i,n,t,e,missing,p,9.13,17.77,missing,y,w,missing,missing,t,p,missing,d,u


In [47]:
# Target vector: the `class` column of the training frame.
y = train_df['class']
y

id
0          e
1          p
2          e
3          e
4          e
          ..
3116940    e
3116941    e
3116942    p
3116943    e
3116944    p
Name: class, Length: 3116945, dtype: object

In [48]:
# Hold out 20% of the rows for validation; shuffling is seeded for repeatability.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=52,
    shuffle=True,
)

# Print the (features, labels) shapes of each partition as a sanity check.
for features, labels in ((x_train, y_train), (x_val, y_val)):
    print(features.shape, labels.shape)

(2493556, 20) (2493556,)
(623389, 20) (623389,)


In [50]:
# Hyperparameters for the LightGBM binary classifier, kept in one mapping so
# they can be inspected or logged separately from the estimator.
lgbm_params = {
    'objective': 'binary',
    'n_estimators': 1188,
    'learning_rate': 0.029054776460531954,
    'max_depth': 11,
    'num_leaves': 459,
    'min_child_samples': 180,
    'colsample_bytree': 0.6248142892362624,
    # NOTE(review): LightGBM only applies row subsampling when subsample_freq > 0
    # (its default is 0), so this value is likely a no-op as configured - confirm.
    'subsample': 0.6193856517174553,
    'subsample_for_bin': 36805,
    'reg_alpha': 2.701133349161215,
    'reg_lambda': 2.3413285519209658e-08,
    'random_state': 52,
    'verbosity': -1,
}

# Fit on the training partition, then predict labels for the validation rows.
model = LGBMClassifier(**lgbm_params)
model.fit(x_train, y_train)
pred = model.predict(x_val)

In [51]:
# Score the validation predictions with the Matthews correlation coefficient.
mcc = matthews_corrcoef(y_val, pred)
print(f'Validation MCC: {mcc}')

Validation MCC: 0.984631004315698


In [55]:
# Predict class labels for the test frame with the fitted model.
test_preds = model.predict(test_df)
# Load the sample submission file; its `id` column is reused when the
# submission frame is assembled.
submission = pd.read_csv('data/sample_submission.csv')

In [58]:
# Assemble the submission: ids taken from the sample file, paired with the
# model's test predictions.
# NOTE(review): this assumes the sample file lists ids in the same order as the
# rows of the test frame - confirm.
submission = pd.DataFrame({
    'id': submission['id'],
    'class': test_preds
})
# Write to a dedicated output file. Saving over 'data/sample_submission.csv'
# would overwrite the provided input template with model output.
submission.to_csv('data/submission.csv', index=False)

In [57]:
# Display the assembled submission frame (id and predicted class).
submission

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e
