In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<percent-encoded download URL>" entries; the
# download loop below splits each entry on ':' and unquotes the URL part.
# NOTE(review): the URL carries a time-limited signature (X-Goog-Date /
# X-Goog-Expires query parameters), so it has presumably expired — re-export
# the notebook from Kaggle to obtain a fresh link.
DATA_SOURCE_MAPPING = 'playground-series-s4e8:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F76727%2F9045607%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240911%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240911T095542Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D86aa438d2307b5f32733ef403acff5441e2ecb1cb3822236d729e596e205115dcc4d6746ac9050d82e25bc6ca72cace72489db749df65e185b39ff0bd8c6f0206fcfa089f201b5ed67ddf42e3ef08871e11ed54efdc892d95b9ea5691d30f39a6dc0cb3f765a68882bafa85693649e28e3aef287a1e997eec078a06392fbc378fd735fa30469ba516e4a3710ee66b0b9987bdb2eecbf31fe67c0351605dec35e0a1e42a4020dee9c25aed6b1da2fc3c21e8fb1ce97ce1a3c5000d39468e276ff120a439f686c96f815435e305615a6e3c2623f47bd398d1937da06c4e7ed2e429fc76e7fd9344e9a488a5e6aef0decbf4b69cc7eef88b8905f9216ea3f5fd27c'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every entry of DATA_SOURCE_MAPPING (comma-separated
# "<directory>:<percent-encoded URL>" pairs) and unpack each archive into
# KAGGLE_INPUT_PATH/<directory>. A failed source is reported and skipped so
# the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; the URL part is percent-encoded.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be missing; int(None) would raise a TypeError
            # that neither handler below catches, so fall back to a plain
            # byte counter when the total size is unknown.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which stops the iteration.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_bytes:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch reopens the file by name, so buffered bytes must
            # be on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would overwrite the imported module at notebook scope.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading playground-series-s4e8, 86301661 bytes compressed
Downloaded and uncompressed: playground-series-s4e8
Data source import complete.


# Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Reading Data

In [None]:
# Load the competition's train and test CSV files from the Kaggle input
# directory populated by the download cell at the top of the notebook.
train_set = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv')
test_set = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')

# Take a look

In [None]:
# Preview the first rows of the training data.
train_set.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [None]:
# Summarise the test set: row count, column dtypes and memory usage.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077964 entries, 0 to 2077963
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   cap-diameter          float64
 2   cap-shape             object 
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), int64(1), object(17)
memory usage: 332.9+ MB


In [None]:
# `df` is a second name for the same DataFrame object as `train_set` (no copy
# is made), so later in-place edits to `train_set` are visible through `df`.
df = train_set

In [None]:
# Print the frequency table of every column, each followed by a dashed rule.
for column in df.columns:
    print(df[column].value_counts(), '-' * 20, sep='\n')

id
0          1
2077967    1
2077958    1
2077959    1
2077960    1
          ..
1038984    1
1038985    1
1038986    1
1038987    1
3116944    1
Name: count, Length: 3116945, dtype: int64
--------------------
class
p    1705396
e    1411549
Name: count, dtype: int64
--------------------
cap-diameter
1.49     8164
3.18     7942
3.14     7361
1.51     7072
4.04     6828
         ... 
58.19       1
31.59       1
24.93       1
58.76       1
54.07       1
Name: count, Length: 3913, dtype: int64
--------------------
cap-shape
x        1436026
f         676238
s         365146
b         318646
o         108835
          ...   
12.62          1
5.15           1
19.04          1
49.21          1
19.06          1
Name: count, Length: 74, dtype: int64
--------------------
cap-surface
t        460777
s        384970
y        327826
h        284460
g        263729
          ...  
1.42          1
has h         1
10.83         1
8.96          1
0.87          1
Name: count, Length: 83, dtype: int64
-

In [None]:
# Number of missing values per column in the training data.
print(df.isnull().sum())

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64


In [None]:
# Number of missing values per column in the test data.
test_set.isnull().sum()

Unnamed: 0,0
id,0
cap-diameter,7
cap-shape,31
cap-surface,446904
cap-color,13
does-bruise-or-bleed,10
gill-attachment,349821
gill-spacing,839595
gill-color,49
stem-height,1


# Classifying parameters

In [None]:
# Continuous measurements; these are imputed with the median further down.
numeric_params = [
    'cap-diameter',
    'stem-width',
    'stem-height',
]

# Categorical (string-valued) columns; these are imputed with the most
# frequent value and then label-encoded further down.
object_params = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'does-bruise-or-bleed',
    'gill-attachment',
    'gill-spacing',
    'gill-color',
    'stem-root',
    'stem-surface',
    'stem-color',
    'veil-type',
    'veil-color',
    'has-ring',
    'ring-type',
    'spore-print-color',
    'habitat',
    'season',
]

# Imputation

In [None]:
# Fill missing numeric values with the column median. The imputer is fitted on
# the training data only and the same fitted medians are applied to the test
# set, so both splits receive identical fill values (fitting a separate
# imputer on the test set would preprocess the two splits differently).
numeric_imputer = SimpleImputer(strategy='median')
train_set[numeric_params] = numeric_imputer.fit_transform(train_set[numeric_params])
test_set[numeric_params] = numeric_imputer.transform(test_set[numeric_params])

In [None]:
# Sanity check: True if any 'cap-diameter' value is still missing
# (`df` is the same object as `train_set`).
df['cap-diameter'].isnull().any()

False

In [None]:
# Fill missing categorical values with the most frequent category. The modes
# are learned from the training data only and reused for the test set, so a
# missing value is replaced by the same category in both splits.
object_imputer = SimpleImputer(strategy='most_frequent')
train_set[object_params] = object_imputer.fit_transform(train_set[object_params])
test_set[object_params] = object_imputer.transform(test_set[object_params])

In [None]:
# Per-column flag: True where the training data still contains missing values.
df.isnull().any()

Unnamed: 0,0
id,False
class,False
cap-diameter,False
cap-shape,False
cap-surface,False
cap-color,False
does-bruise-or-bleed,False
gill-attachment,False
gill-spacing,False
gill-color,False


In [None]:
# Replace every categorical column by integer codes.
# One encoder per column is fitted on the values of BOTH splits and then used
# to transform each split. Fitting independent encoders on train and test
# assigns codes from each split's own sorted category set, so the same
# category could get different integers in the two splits and the model's
# test predictions would be meaningless.
# NOTE: the resulting train codes can differ from those of a train-only
# encoder, so previously recorded outputs below may change slightly.
for column in object_params:
    train_values = train_set[column].astype(str)
    test_values = test_set[column].astype(str)
    encoder = LabelEncoder().fit(pd.concat([train_values, test_values], ignore_index=True))
    train_set[column] = encoder.transform(train_values)
    test_set[column] = encoder.transform(test_values)

In [None]:
# Display the training data after imputation and encoding
# (`df` is the same object as `train_set`).
df

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.80,53,72,72,8,44,28,59,...,15,51,55,19,21,5,18,17,25,0
1,1,p,4.51,71,56,64,8,44,28,46,...,15,58,47,19,21,18,39,17,25,3
2,2,e,6.94,53,72,49,8,75,28,59,...,15,51,46,19,21,5,18,17,36,3
3,3,e,3.88,53,81,57,8,70,28,37,...,15,51,55,19,21,5,18,17,25,2
4,4,e,5.85,71,65,74,8,47,28,59,...,15,51,55,19,21,5,18,17,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,3116940,e,9.29,53,76,63,20,44,28,59,...,15,51,55,19,21,18,19,17,25,2
3116941,3116941,e,10.88,67,76,74,20,47,28,48,...,15,51,55,19,21,5,18,17,25,2
3116942,3116942,p,7.82,71,53,55,8,44,28,59,...,15,51,57,19,21,18,39,17,25,0
3116943,3116943,e,9.45,64,59,63,20,52,28,48,...,15,58,55,19,21,18,27,17,25,2


In [None]:
# Confirm the column dtypes of the training data after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             int64  
 4   cap-surface           int64  
 5   cap-color             int64  
 6   does-bruise-or-bleed  int64  
 7   gill-attachment       int64  
 8   gill-spacing          int64  
 9   gill-color            int64  
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             int64  
 13  stem-surface          int64  
 14  stem-color            int64  
 15  veil-type             int64  
 16  veil-color            int64  
 17  has-ring              int64  
 18  ring-type             int64  
 19  spore-print-color     int64  
 20  habitat               int64  
 21  season                int64  
dtypes: float64(3), int64(18), object(1)
memory

# Plotting the data

In [None]:
# Bar chart of the number of training rows per target class.
px.bar(df['class'].value_counts())

# Creating the model

In [None]:
# Target: the 'class' column. Features: every column except the target and
# the row identifier.
y = train_set['class']
X = train_set.drop(columns=['class', 'id'])

In [None]:
# Display the feature matrix.
X

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,8.80,53,72,72,8,44,28,59,4.51,15.39,15,51,55,19,21,5,18,17,25,0
1,4.51,71,56,64,8,44,28,46,4.79,6.48,15,58,47,19,21,18,39,17,25,3
2,6.94,53,72,49,8,75,28,59,6.85,9.93,15,51,46,19,21,5,18,17,36,3
3,3.88,53,81,57,8,70,28,37,4.16,6.53,15,51,55,19,21,5,18,17,25,2
4,5.85,71,65,74,8,47,28,59,3.37,8.36,15,51,55,19,21,5,18,17,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,9.29,53,76,63,20,44,28,59,12.14,18.81,15,51,55,19,21,18,19,17,25,2
3116941,10.88,67,76,74,20,47,28,48,6.65,26.97,15,51,55,19,21,5,18,17,25,2
3116942,7.82,71,53,55,8,44,28,59,9.51,11.06,15,51,57,19,21,18,39,17,25,0
3116943,9.45,64,59,63,20,52,28,48,9.13,17.77,15,58,55,19,21,18,27,17,25,2


In [None]:
# Display the target column.
y

Unnamed: 0,class
0,e
1,p
2,e
3,e
4,e
...,...
3116940,e
3116941,e
3116942,p
3116943,e


In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [None]:
# Random forest with 200 trees and a fixed seed; the hyper-parameters are
# gathered in one mapping so they are easy to read and tweak.
forest_params = dict(
    random_state=12,
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
)
model = RandomForestClassifier(**forest_params)

In [None]:
# Train the forest on the training portion of the split.
model.fit(X_train, y_train)

# Evaluating the model

In [None]:
# Mean accuracy of the fitted model on the held-out portion of the split.
model.score(X_test, y_test)

0.9917387056877808

In [None]:
# Importance score of each feature as computed by the fitted forest; the last
# line displays the array.
feature_importance = model.feature_importances_
feature_importance

array([7.75253234e-02, 5.20909727e-02, 8.60094254e-02, 5.23200768e-02,
       3.10722214e-02, 7.70787591e-02, 5.64602870e-02, 7.40372508e-02,
       6.96917194e-02, 1.29327736e-01, 2.83096134e-02, 5.88471487e-02,
       8.16244523e-02, 3.21186168e-06, 9.54465033e-03, 2.37465575e-02,
       4.14591539e-02, 1.09040358e-02, 2.84892280e-02, 1.14581760e-02])

In [None]:
# Bar chart of the importance scores, labelled with the training column names.
px.bar(x=X_train.columns, y=feature_importance)

## DONE :)