# AI in Biology Final Project 
### Anna Mattessich and Keerthi Mula

## Load Libraries 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import Callback

## Import Data 
Data selected from https://archive.ics.uci.edu/ (UC Irvine Machine Learning Repository)

Data about mushrooms and mushroom identification. 

Mushroom. (1987). UCI Machine Learning Repository. https://doi.org/10.24432/C5959T.

In [28]:
# Data saves in folder titled Mushroom Data, but taking data directly from website
# URL of the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'

# Define column names based on the description provided
column_names = ['poisonous', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
                'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
                'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
                'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring',
                'veil-type', 'veil-color', 'ring-number', 'ring-type',
                'spore-print-color', 'population', 'habitat']

# Load the dataset into a DataFrame
df = pd.read_csv(url, header=None, names=column_names)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Variable Meaning
cap-shape:                bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface:              fibrous=f,grooves=g,scaly=y,smooth=s

cap-color:                brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y

bruises?:                 bruises=t,no=f

odor:                     almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s

gill-attachment:          attached=a,descending=d,free=f,notched=n

gill-spacing:             close=c,crowded=w,distant=d

gill-size:                broad=b,narrow=n

gill-color:               black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y

stalk-shape:              enlarging=e,tapering=t

stalk-root:               bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y

stalk-color-below-ring:   brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y

veil-type:                partial=p,universal=u

veil-color:               brown=n,orange=o,white=w,yellow=y

ring-number:              none=n,one=o,two=t

ring-type:                cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z

spore-print-color:        black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y

population:               abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y

habitat:                  grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d

## Introduction
We are trying to answer the question: Can we determine if a mushroom is poisonous based on its habitat, color, and shape?

To answer this question we are going to observe the variables of habitat, cap-color, gill-color, stalk-color-above-ring, stalk-color-below-ring, veil-color, spore-print-color, cap-shape, rod-shape. We will select and simplify the data from the data set. Then we are going to make a logistic regression model that can look at these and determine if a mushroom is poisonous or not. 

## Data pre-processing
What's needed to load the data, clean the data, normalize, etc.

In [29]:
## Determine which columns are present
print(df.columns)

Index(['poisonous', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [30]:
# Clean data
# Selecting only the desired columns
simplified_df = df[['poisonous', 'cap-shape', 'cap-color', 'gill-color', 
                    'stalk-color-above-ring', 'stalk-color-below-ring', 
                    'veil-color', 'population', 'habitat']]

# Display the first few rows of the simplified DataFrame
simplified_df.head()

Unnamed: 0,poisonous,cap-shape,cap-color,gill-color,stalk-color-above-ring,stalk-color-below-ring,veil-color,population,habitat
0,p,x,n,k,w,w,w,s,u
1,e,x,y,k,w,w,w,n,g
2,e,b,w,n,w,w,w,n,m
3,p,x,w,n,w,w,w,s,u
4,e,x,g,k,w,w,w,a,g


In [31]:
# Inspect unique values in each categorical column
for column in categorical_columns:
    print(column, simplified_df[column].unique())

cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil-color ['w' 'n' 'o' 'y']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'w' 'l']


In [33]:
import numpy as np

# Convert 'x' to NaN
simplified_df_nan = simplified_df.replace('x', np.nan)

# Drop rows containing NaN
simplified_df_numeric = simplified_df_nan.dropna()

# Reset index
simplified_df_numeric.reset_index(drop=True, inplace=True)

# Display the first few rows of the new DataFrame
simplified_df_numeric.head()


Unnamed: 0,poisonous,cap-shape,cap-color,gill-color,stalk-color-above-ring,stalk-color-below-ring,veil-color,population,habitat
0,e,b,w,n,w,w,w,n,m
1,e,b,w,g,w,w,w,n,m
2,e,b,w,n,w,w,w,s,m
3,e,b,y,g,w,w,w,s,m
4,e,b,y,w,w,w,w,s,g


In [40]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the columns that need one-hot encoding
categorical_columns = ['cap-shape', 'cap-color', 'gill-color', 
                       'stalk-color-above-ring', 'stalk-color-below-ring', 
                       'veil-color', 'population', 'habitat']

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)  # One-hot encode categorical columns
    ])

# Define the pipeline with preprocessing
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
normalized_data = pipeline.fit_transform(simplified_df)

# Convert the normalized data array back to a DataFrame
dense_df = pd.DataFrame(normalized_data.toarray())
dense_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Model setup
Setup one or more models

In [41]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Define the columns that need one-hot encoding
categorical_columns = ['cap-color', 'gill-color', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-color', 'habitat']

# Define the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)  # One-hot encode categorical columns
    ])

# Define the pipeline with preprocessing
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression(max_iter=1000))])

# 1. Split the data
X = simplified_df[['cap-color', 'gill-color', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-color', 'habitat']]
y = simplified_df['poisonous']

# 2. Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train and evaluate the model using the pipeline
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)



Accuracy: 0.9255384615384615

Classification Report:
               precision    recall  f1-score   support

           e       0.90      0.96      0.93       843
           p       0.96      0.88      0.92       782

    accuracy                           0.93      1625
   macro avg       0.93      0.92      0.93      1625
weighted avg       0.93      0.93      0.93      1625



## Hyperparameter tuning
Do some playing with the model hyperparameters (learning rate, optimizer, batch size, epochs, whatever makes sense)

## Results
How did the model do

## Discussion
Summarize what worked, what didn't etc.