<a href="https://www.kaggle.com/code/archismancoder/binarypredpoisonousmushrooms-tfdf?scriptVersionId=191252653" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test splits.
df_train, df_test = (
    pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv'),
    pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv'),
)

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory use.
df_train.info()

In [None]:
# Count the missing values in each training column.
df_train.isnull().sum()

In [None]:
# Count the training rows that exactly repeat an earlier row.
df_train.duplicated().sum()

## MEMORY OPTIMIZATION

In [None]:
def reduce_memory_usage(df):
    """Shrink ``df`` in place by downcasting its columns, and return it.

    * Text columns (object or string dtype) become ``category``.
    * Signed-integer columns become the narrowest of int8/16/32/64 whose
      range contains the column's min and max (limits inclusive).
    * Float columns become float16 or float32 when their min and max fit,
      otherwise float64. NOTE: float16/float32 keep far fewer significant
      digits than float64, so values are rounded by this step.
    * Every other dtype (bool, unsigned int, datetime, category, nullable
      extension types, ...) is left untouched.

    The memory footprint before and after is printed in MB.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)
    small_float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed ints and floats are downcast. Anything else
        # either has no meaningful min/max (an unordered categorical raises)
        # or would be corrupted by a float cast (bool, unsigned, datetime).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in small_float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN (comparisons
                # with NaN are False): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


In [None]:
# df_train = reduce_memory_usage(df_train)
# df_test = reduce_memory_usage(df_test)

# VISUALIZATION

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
import matplotlib.pyplot as plt

# Share of missing entries in each column, as a percentage.
missing_values = 100 * df_train.isnull().mean()

# One bar per column.
missing_values.plot.bar(figsize=(8, 4))

plt.xlabel('Features')
plt.ylabel('Percentage')
plt.title('Percentage of Missing Values by Feature')
plt.xticks(rotation=45)
plt.show()

In [None]:
import seaborn as sns

# Cell-level map of missing values: one row per sample, one column per feature.
plt.figure(figsize=(18, 12))
plt.title("Visualizing Missing Values")
sns.heatmap(
    df_train.isnull(),
    cmap=sns.color_palette("Spectral_r", n_colors=13),
    cbar=False,
    yticklabels=False,
);

In [None]:
# Column groups used by the plotting and preprocessing cells.
# Columns are classified by what they hold rather than by the downcast dtypes
# ('category' / 'float16') that only appear once reduce_memory_usage() has been
# applied; with that call commented out, checking for those dtypes matched
# nothing and both lists came out empty.
categorical_cols = [
    col for col in df_train.columns
    if col != 'class'
    and (
        pd.api.types.is_object_dtype(df_train[col].dtype)
        or isinstance(df_train[col].dtype, (pd.CategoricalDtype, pd.StringDtype))
    )
]
continuous_cols = [
    col for col in df_train.columns
    if pd.api.types.is_float_dtype(df_train[col].dtype)
]
FEATURES = [col for col in df_train.columns if col not in ['id', 'class']]
TARGET = 'class'

In [None]:
# sns.set_style('darkgrid')

def categorical_distributions_plot(df, cols, rows_num=6, cols_num=3):
    """Plot the ten most frequent values of each column in ``cols``.

    One horizontal bar chart is drawn per column on a shared figure, with the
    count written next to every bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    cols : iterable of str
        Names of the columns to plot, one subplot each.
    rows_num, cols_num : int
        Minimum subplot grid. Extra rows are added automatically when
        ``cols`` has more entries than ``rows_num * cols_num``.
    """
    cols = list(cols)
    # add_subplot() raises ValueError for an index beyond rows * cols, so make
    # sure the grid has a slot for every requested column.
    rows_num = max(rows_num, -(-len(cols) // max(cols_num, 1)))
    fig = plt.figure(figsize=(16, 32))

    for idx, col in enumerate(cols):
        ax = fig.add_subplot(rows_num, cols_num, idx + 1)
        # Two-column table (value, count) of the 10 most frequent values.
        temp = df[col].value_counts().nlargest(10).reset_index()
        temp.columns = [col, 'count']
        sns.barplot(y=col, x='count', data=temp, palette='Set2', ax=ax, order=temp[col])

        ax.set_xlabel('Count', fontsize=12)
        ax.set_ylabel(col, fontsize=12)
        ax.set_title(f'Distribution of {col}', fontsize=14)
        ax.xaxis.set_tick_params(rotation=45)
        ax.yaxis.set_tick_params(rotation=0)

        # Annotate each bar with its count.
        for container in ax.containers:
            ax.bar_label(container, fontsize=10)

    fig.tight_layout()
    plt.show()


In [None]:
# Draw the value distribution of every categorical training feature.
categorical_distributions_plot(df_train, categorical_cols)

In [None]:
def categorical_vs_target_plot(df, cols, target, rows_num=6, cols_num=3):
    """Plot, per column, how the target splits within its ten most frequent values.

    Each subplot is a stacked bar chart: one bar per category value, segmented
    by the proportion of each ``target`` value among the rows of that category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    cols : iterable of str
        Names of the categorical columns to plot, one subplot each.
    target : str
        Name of the target column.
    rows_num, cols_num : int
        Minimum subplot grid. Extra rows are added automatically when
        ``cols`` has more entries than ``rows_num * cols_num``.
    """
    cols = list(cols)
    # add_subplot() raises ValueError for an index beyond rows * cols, so make
    # sure the grid has a slot for every requested column.
    rows_num = max(rows_num, -(-len(cols) // max(cols_num, 1)))
    fig = plt.figure(figsize=(16, 14))

    for idx, col in enumerate(cols):
        ax = fig.add_subplot(rows_num, cols_num, idx + 1)
        # Rows: category values; columns: target values; cells: within-category share.
        target_distribution = df.groupby(col)[target].value_counts(normalize=True).unstack().fillna(0)
        top_categories = df[col].value_counts().nlargest(10).index
        target_distribution = target_distribution.loc[top_categories]
        target_distribution = target_distribution.sort_values(by=target_distribution.columns.tolist(), ascending=False)
        target_distribution.plot(kind='bar', stacked=True, colormap='Set2', edgecolor='black', alpha=0.7, ax=ax)

        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel('Proportion', fontsize=12)
        ax.set_title(f'{col} and {target}', fontsize=14)
        ax.xaxis.set_tick_params(rotation=45)
        ax.yaxis.set_tick_params(rotation=0)
        ax.legend(title=target, fontsize=10)

    fig.tight_layout()
    plt.show()

In [None]:
# Show how the 'class' label splits within each categorical training feature.
categorical_vs_target_plot(df_train, categorical_cols, 'class')

# PREPROCESSING

In [None]:
# The row identifier carries no signal; remove it from both frames in place.
for frame in (df_train, df_test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
# Encode the label as an integer: 'p' -> 1, 'e' -> 0.
df_train[TARGET] = df_train[TARGET].map({
    'p': 1,
    'e': 0,
})

In [None]:
# Turn every categorical feature into plain strings, spelling missing entries
# as the explicit category 'NaN'. The missing-value mask is taken before the
# cast: astype(str) on an object column turns missing values into the text
# 'nan', after which fillna() has nothing left to fill.
for col in categorical_cols:
    if col in FEATURES:
        for frame in (df_train, df_test):
            present = frame[col].notna()
            frame[col] = frame[col].astype(str).where(present, 'NaN')

In [None]:
# Preview the first rows of the feature columns.
df_train[FEATURES].head()

In [None]:
# Display the training frame after preprocessing.
df_train

# MODEL

## TFDF MODEL

### TRAIN & VALIDATION SPLITTING

In [None]:
import tensorflow_decision_forests as tfdf
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [None]:
def test_train_split(dataset, test_ratio=0.30, seed=None):
    """Randomly split ``dataset`` row-wise into a training and a held-out part.

    Every row is assigned to the held-out part independently with probability
    ``test_ratio``, so the resulting sizes only approximate the ratio.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to split.
    test_ratio : float
        Probability, in [0, 1], that a row goes to the held-out part.
    seed : int or None
        Seed for a reproducible split. ``None`` (the default) draws from
        NumPy's global random state.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_rows, held_out_rows)``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside [0, 1].
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")
    rng = np.random if seed is None else np.random.RandomState(seed)
    test_indices = rng.rand(len(dataset)) < test_ratio
    return dataset[~test_indices], dataset[test_indices]


# The `test_` prefix matches pytest's default collection pattern; opt out so
# the helper is never mistaken for a test case when imported into a test module.
test_train_split.__test__ = False

# Hold out part of the labelled data for validation and report both sizes.
train_ds_pd, validation_ds_pd = test_train_split(df_train)
print(f"{len(train_ds_pd)} examples in training, {len(validation_ds_pd)} examples in testing.")

In [None]:
# Summarise the validation frame: column dtypes, non-null counts and memory use.
validation_ds_pd.info()

In [None]:
# Wrap both pandas frames as TensorFlow datasets with TARGET as the label.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_ds_pd,
    label=TARGET,
    task=tfdf.keras.Task.CLASSIFICATION,
)
validation_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    validation_ds_pd,
    label=TARGET,
    task=tfdf.keras.Task.CLASSIFICATION,
)

In [None]:
# Display the training dataset object.
train_ds

In [None]:
# List the model classes available in TensorFlow Decision Forests.
tfdf.keras.get_all_models()

In [None]:
# Random forest classifier with the library's default hyper-parameters.
rf = tfdf.keras.RandomForestModel(
    task=tfdf.keras.Task.CLASSIFICATION,
)
rf.compile()

In [None]:
# Train the random forest on the training dataset.
rf.fit(x=train_ds)

In [None]:
# Render the first tree of the forest (tree_idx=0), limited to depth 5.
tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=5)

In [None]:
# Inspect the trained model and show the evaluation it recorded during training.
# NOTE(review): for a random forest this is presumably the out-of-bag
# evaluation — confirm against the TF-DF inspector documentation.
inspector = rf.make_inspector()
inspector.evaluation()

In [None]:
# Score the model on the held-out validation dataset and print each metric
# to four decimal places.
evaluation = rf.evaluate(
    x=validation_ds,
    return_dict=True,
)

for name, value in evaluation.items():
    print(f"{name}: {value:.4f}")

In [None]:
# Display the raw metric dictionary returned by rf.evaluate().
evaluation

# TESTING

In [None]:
# Build the scoring features from df_test, which has gone through the same
# preprocessing cells as df_train, instead of re-reading the raw CSV: the
# model must see test features encoded exactly like its training features.
# Only the ids (dropped from df_test during preprocessing) are read from disk.
ids = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv', usecols=['id'])['id']
df_test_tfdf = df_test.drop(columns=['id'], errors='ignore')

In [None]:
# Convert the test features to a TensorFlow dataset (no label column) and
# run the trained forest on it.
test_ds_tf = tfdf.keras.pd_dataframe_to_tf_dataset(df_test_tfdf, task=tfdf.keras.Task.CLASSIFICATION)
preds = rf.predict(test_ds_tf)

# Raw model outputs paired with their row ids, for a quick look.
output = pd.DataFrame({'id': ids, 'class': preds.squeeze()})

output.head()

In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 labels.
y_pred = (preds > 0.5).astype(int)

In [None]:
# Pair each row id with its predicted 0/1 label.
output2 = pd.DataFrame({'id': ids, 'class': y_pred.squeeze()})

In [None]:
# Translate the 0/1 labels back to the competition's letters (1 -> 'p',
# anything else -> 'e'). output_test_tfdf is another name for the same frame.
output_test_tfdf = output2
output2["class"] = np.where(output2["class"].eq(1), 'p', 'e')

In [None]:
# Preview the first ten rows of the submission frame.
output_test_tfdf.head(10)

In [None]:
# Write the submission file to the working directory, without the index column.
output_test_tfdf.to_csv('submission_tfdf.csv', index=False)