# Import necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# Load and Prepare the Mushroom Dataset

In [None]:
# Remote location of the raw mushroom CSV.
dataset_link = "https://raw.githubusercontent.com/massudavide/Mushroom-Dataset/refs/heads/master/mushroom_data_all.csv"
try:
    df = pd.read_csv(dataset_link)
except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
    # Reading from a URL fails with urllib.error.URLError/HTTPError (both are
    # OSError subclasses), not FileNotFoundError. Report the real source and
    # re-raise: every later cell needs `df`, so continuing would only turn this
    # into a confusing NameError further down.
    print(f"Error: could not load the dataset from {dataset_link}: {exc}")
    raise
else:
    print("Mushroom dataset loaded successfully!")

Mushroom dataset loaded successfully!


# Exploratory Data Analysis (EDA) & Preprocessing

In [None]:
# Tally the null entries of every column and report them under a heading
null_counts = df.isna().sum()
print("\nMissing values in each column:", null_counts, sep="\n")


Missing values in each column:
class_edible                0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


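There are no null (NaN) values in the dataset. Missing data encoded as a placeholder value would not show up in this check, so the raw category values of each column are inspected next.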

In [None]:
# List the distinct category codes observed in each column/feature
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, ":", distinct_values)

class_edible : ['p' 'e']
cap-shape : ['x' 'b' 's' 'f' 'k' 'c']
cap-surface : ['s' 'y' 'f' 'g']
cap-color : ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
bruises : ['t' 'f']
odor : ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
gill-attachment : ['f' 'a']
gill-spacing : ['c' 'w']
gill-size : ['n' 'b']
gill-color : ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk-shape : ['e' 't']
stalk-root : ['e' 'c' 'b' 'r' '?']
stalk-surface-above-ring : ['s' 'f' 'k' 'y']
stalk-surface-below-ring : ['s' 'f' 'y' 'k']
stalk-color-above-ring : ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk-color-below-ring : ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil-type : ['p']
veil-color : ['w' 'n' 'o' 'y']
ring-number : ['o' 't' 'n']
ring-type : ['p' 'e' 'l' 'f' 'n']
spore-print-color : ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population : ['s' 'n' 'a' 'v' 'y' 'c']
habitat : ['u' 'g' 'm' 'd' 'p' 'w' 'l']


Observations:
1.   `stalk-root` contains the value `?`, which appears to be a placeholder for missing data.
2.    `veil-type` has only one value; therefore, it provides no predictive information. It is reasonable to drop this column.

In [None]:
# Count the number of '?' placeholders in 'stalk-root'.
# A boolean comparison is used instead of value_counts()['?'] so that the cell
# reports 0 (rather than raising KeyError) when no '?' is present, e.g. when it
# is re-run after the placeholder has already been replaced.
stalk_root_unknown_count = (df['stalk-root'] == '?').sum()
stalk_root_unknown_pct = stalk_root_unknown_count / len(df['stalk-root']) * 100
print(f"\nNumber of '?' in 'stalk-root': {stalk_root_unknown_count}")
print(f"Percentage of '?' in 'stalk-root': {stalk_root_unknown_pct:.2f}%")


Number of '?' in 'stalk-root': 2480
Percentage of '?' in 'stalk-root': 30.53%


Since only 30.53% of the `stalk-root` values are missing, I will replace `?` with the most frequent value (mode) of this feature.

Note: If more than 50% of the `stalk-root` values had been missing, we could have dropped this column as well.

In [None]:
# Replace the '?' placeholder in 'stalk-root' with the most frequent known value.
# The mode is computed over the non-placeholder entries only: if '?' itself were
# the most common value, using the plain column mode would "replace" '?' with
# '?' and silently leave the missing data in place.
known_stalk_roots = df.loc[df['stalk-root'] != '?', 'stalk-root']
stalk_root_mode = known_stalk_roots.mode()[0]
df['stalk-root'] = df['stalk-root'].replace('?', stalk_root_mode)

In [None]:
# Confirm which categories remain in 'stalk-root' once '?' has been replaced
remaining_categories = df['stalk-root'].unique()
print("\nUnique values in 'stalk-root' column after replacing '?':", remaining_categories, sep="\n")


Unique values in 'stalk-root' column after replacing '?':
['e' 'c' 'b' 'r']


In [None]:
# Drop 'veil-type' as it has only one value and provides no predictive information.
# A non-mutating drop with errors='ignore' keeps this cell re-runnable: the
# original in-place drop raised KeyError on a second run because the column
# was already gone.
df = df.drop(columns='veil-type', errors='ignore')
print("'veil-type' column has been dropped.")

'veil-type' column has been dropped.


# Model Training

In [None]:
# Create "DNA" sentences with unique "genes"
# Each feature-value pair becomes a unique token (e.g., 'odor_p')
def create_dna_sentence(row):
    """Encode one row of categorical features as a space-separated "DNA" sentence.

    Every (column, value) pair becomes one "gene" token of the form
    ``<column>_<value>`` (for example ``odor_p``).

    The full column name is used as the token prefix. The earlier first-letter
    abbreviation was not unique: 'cap-shape' and 'cap-surface' both collapsed
    to 'cs', and 'gill-spacing' and 'gill-size' both collapsed to 'gs', so two
    different features could emit the very same token.

    Args:
        row: A pandas Series (one DataFrame row) whose index holds the column
            names and whose values hold the category codes.

    Returns:
        str: The gene tokens joined by single spaces, in the row's column order.
    """
    # Create a list of "genes" like ['cap-shape_x', 'cap-surface_s', 'odor_p', ...]
    genes = [f"{col}_{val}" for col, val in row.items()]

    # Join them into a space-separated string
    return " ".join(genes)

# Build one "DNA" sentence per mushroom from the FEATURE columns only.
# The target column must not be encoded into the sentence, otherwise the model
# can read the label straight from its own input (target leakage). The derived
# columns are excluded as well so that re-running this cell gives the same result.
TARGET_COL = 'class_edible'
feature_cols = [col for col in df.columns
                if col not in (TARGET_COL, 'class_encoded', 'dna_sentence')]
df['dna_sentence'] = df[feature_cols].apply(create_dna_sentence, axis=1)


# Prepare the target variable (0 for edible, 1 for poisonous).
# LabelEncoder sorts the class labels, so 'e' -> 0 and 'p' -> 1.
le = LabelEncoder()
df['class_encoded'] = le.fit_transform(df[TARGET_COL])


# Tokenize the "DNA" Sentences
dna_sentences = df['dna_sentence'].values
labels = df['class_encoded'].values

# Create a word-level tokenizer that splits on spaces ONLY.
# The Keras default `filters` include '_' and '-', which would break every
# gene token into separate prefix and value words; disabling the filters (and
# lower-casing) keeps each gene as one vocabulary entry.
tokenizer = Tokenizer(filters='', lower=False)
tokenizer.fit_on_texts(dna_sentences)

# Convert sentences to sequences of integers
sequences = tokenizer.texts_to_sequences(dna_sentences)

# Pad sequences to ensure they all have the same length.
# Use the longest sequence rather than assuming the first one is representative.
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')


# Split Data for Training and Testing (stratified to keep the class ratio in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.3, random_state=42, stratify=labels)


# Build and Train the Embedding Model
# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling are reproducible.
tf.keras.utils.set_random_seed(42)

# The vocabulary size is now the number of unique "genes" (+1 for the padding index 0)
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 16

model = Sequential([
    # An explicit Input layer replaces the deprecated `input_length` argument
    # and lets the model be built before training, so summary() shows shapes.
    tf.keras.Input(shape=(max_length,)),
    # This layer learns a vector for each unique "gene" (e.g., 'odor_p')
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train the model.
# Validation data is carved out of the training set so that the test set stays
# untouched until the final evaluation below.
history = model.fit(X_train, y_train,
                    epochs=15,
                    validation_split=0.2,
                    verbose=2)

# Evaluate the Model on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nAdvanced DNA Embedding Model Accuracy: {accuracy*100:.2f}%")




None
Epoch 1/15
178/178 - 2s - 9ms/step - accuracy: 0.9460 - loss: 0.5206 - val_accuracy: 0.9897 - val_loss: 0.2444
Epoch 2/15
178/178 - 1s - 5ms/step - accuracy: 0.9981 - loss: 0.1047 - val_accuracy: 1.0000 - val_loss: 0.0428
Epoch 3/15
178/178 - 1s - 4ms/step - accuracy: 0.9998 - loss: 0.0234 - val_accuracy: 1.0000 - val_loss: 0.0152
Epoch 4/15
178/178 - 1s - 4ms/step - accuracy: 1.0000 - loss: 0.0094 - val_accuracy: 1.0000 - val_loss: 0.0073
Epoch 5/15
178/178 - 0s - 3ms/step - accuracy: 1.0000 - loss: 0.0049 - val_accuracy: 1.0000 - val_loss: 0.0043
Epoch 6/15
178/178 - 1s - 3ms/step - accuracy: 1.0000 - loss: 0.0030 - val_accuracy: 1.0000 - val_loss: 0.0028
Epoch 7/15
178/178 - 0s - 3ms/step - accuracy: 1.0000 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 0.0019
Epoch 8/15
178/178 - 0s - 3ms/step - accuracy: 1.0000 - loss: 0.0014 - val_accuracy: 1.0000 - val_loss: 0.0014
Epoch 9/15
178/178 - 1s - 3ms/step - accuracy: 1.0000 - loss: 0.0010 - val_accuracy: 1.0000 - val_loss: 0.0