# Task 1 Machine learning on mushroom dataset




In [1]:
import sklearn
import pandas as pd 
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from dython.nominal import associations
import matplotlib.pyplot as plt
from dython import nominal

from sklearn.model_selection import cross_val_score



from sklearn.model_selection import train_test_split
# Raise pandas' display limits (up to 500 columns and 200 rows are shown).
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)

ModuleNotFoundError: No module named 'dython'

Here we import all of the necessary libraries: sklearn for predictive data analysis (classification, regression, clustering and more), and pandas for data analysis and data manipulation. We use numpy for numerical computing; it provides tools for working with arrays, matrices and other numerical data. We import KNeighborsClassifier, LogisticRegression and MLPClassifier, as those are the machine learning models we will use in this task. We use dython for working with nominal data, and matplotlib and seaborn for plotting.

In [None]:
# The file has no header row, so pandas assigns integer column labels.
df = pd.read_csv('agaricus-lepiota.csv', sep=',', header=None)
df.head()

We read our data and turn it into a data frame, denoted by "df": a two-dimensional labeled data structure.

In [None]:
# Count fully duplicated rows, which could skew what the models learn.
count = len(df)
duplicates = df.duplicated().sum()
print(f'{duplicates} duplicate rows in {count} rows')

Here we check for any duplicates in our dataframe ("df"); this is an important step in cleaning and preprocessing before applying machine learning models, because duplicates can inflate performance by encouraging overfitting. However, we see that there are 0 duplicate rows in our 8124 rows, so we don't need to get rid of any.

In [None]:
# Descriptive names for the 23 columns, in file order: the target first,
# followed by the 22 mushroom attributes.
column_labels = [
    # target
    'class',
    # cap
    'cap shape',
    'cap surface',
    'cap color',
    # bruising and smell
    'bruised',
    'odor',
    # gills
    'gill attachment',
    'gill spacing',
    'gill size',
    'gill color',
    # stalk
    'stalk shape',
    'stalk root',
    'stalk surface above ring',
    'stalk surface below ring',
    'stalk color above ring',
    'stalk color below ring',
    # veil and ring
    'veil type',
    'veil color',
    'ring number',
    'ring type',
    # spores and environment
    'spore print color',
    'population',
    'habitat',
]

# Replace the integer column labels with the descriptive names.
df.columns = column_labels

This step replaces the numbers in the top row with something more descriptive; here we use the attribute names from the "agaricus-lepiota" text file. We can now see how the first numeric label has turned into "class", and the other numbers have turned into other descriptive names.

In [None]:
# Show the first rows again to check the descriptive column labels.
df.head()

In [None]:
# Encode the target numerically: 'e' (edible) -> 0, 'p' (poisonous) -> 1.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected column: that is chained
# assignment, which does not reliably modify `df` under pandas copy-on-write.
# astype pins an integer dtype and fails loudly if an unexpected label remains.
df['class'] = df['class'].replace({'e': 0, 'p': 1}).astype('int64')

# Distribution of the 'stalk root' values (includes the '?' placeholder).
df['stalk root'].value_counts()

This code replaces the values in 'class': 'e' (edible) becomes 0 and 'p' (poisonous) becomes 1. We change them from categorical to numerical.  
We see that a lot of the mushrooms have "stalk root" type "?": a whopping 30.5% of the mushrooms have "?" as their stalk root feature.

In [None]:
# Count how often each 'veil type' value occurs.
df['veil type'].value_counts()

We count the veil type as well, and find that all the mushrooms have the same veil type. We therefore decide to drop it, as it won't impact performance.

In [None]:
# Remove 'stalk root' and 'veil type' from the frame itself.
# `df.columns.drop(...)` only returns a new Index and leaves `df` untouched,
# so the drop has to go through DataFrame.drop and be assigned back.
df = df.drop(columns=['stalk root', 'veil type'])

In [None]:
# Number of columns currently in the frame.
print(df.shape[1])

In [None]:
# Theil's U between 'class' and every column of the frame ('class' itself included).
class_column = df["class"]
theil_row = [nominal.theils_u(class_column, df[name]) for name in df.columns]
theil_data = [theil_row]  # single-row 2-D input for the heatmap

plt.figure(figsize=(20, 1))
sns.heatmap(theil_data, annot=True, fmt='.2f', xticklabels=df.columns)

This code is calculating the Theil's U statistic between the target variable (the 'class' column) and each feature column in the DataFrame.

The Theil's U statistic is a measure of the strength of association between two categorical variables. It ranges from 0 to 1, where 0 indicates no association and 1 indicates a perfect association (the feature fully determines the target). The Theil's U statistic is useful for identifying which features are most informative for predicting the target variable.
The code is using a list comprehension to calculate the Theil's U statistic for each column in the DataFrame using the theils_u() function from the dython.nominal module.

The code then creates a heatmap visualization of the Theil's U statistic using the sns.heatmap() function from the Seaborn library. The heatmap is annotated with the Theil's U values for each feature and target pair.

From this heatmap we can deduce that odor has a very strong association with the presence of poison in the mushroom. Now we can take a closer look at odor to see how it relates to the presence of poison.

In [None]:
# Bar chart of mushroom counts per odor, split by class.
sns.set(rc={'figure.figsize': (15, 8)})
ax = sns.countplot(x='odor', hue='class', data=df)

# Write each bar's count just above it; a NaN height is treated as 0.
for bar in ax.patches:
    raw_height = bar.get_height()
    bar_height = 0 if np.isnan(raw_height) else raw_height
    label_position = (bar.get_x() + 0.05, bar_height + 10)
    ax.annotate(str(int(bar_height)), label_position)

plt.show()

In this graph we see that the presence of "odor f", meaning foul odor, guarantees the presence of poison; there are a multitude of other smells that also guarantee poison. "Odor n", meaning no odor, is weighted towards non-poisonous; however, there are 120 instances of mushrooms with no odor that are also poisonous.

In [None]:
# One-hot encode every feature column in a single call instead of a manual
# concat/drop loop. 'class' is left as is, and each encoded column becomes
# '<column>_<value>' indicator columns appended after it in column order,
# which is the same frame the loop produced.
feature_columns = list(df.columns.drop("class"))
df = pd.get_dummies(df, columns=feature_columns)

This code is performing one-hot encoding on each categorical feature column in the DataFrame.

For each column, it creates a new DataFrame dummies using the pd.get_dummies() function, which converts the column into a set of binary columns (one for each unique value in the original column), where a 1 indicates that the corresponding value is present and a 0 indicates that it is not.

This process converts the categorical features into a numerical format while avoiding an arbitrary ordering of the categorical values that might not be meaningful for the model. As there is no natural ordering for things like 'cap shape' or colors, one-hot encoding is well suited for this task.

In [None]:
# Show the first rows of the one-hot encoded frame.
df.head()

Here we see the effects of one-hot encoding: every categorical feature has been split into one binary column per category.

In [None]:
# Separate the label column from the feature columns.
data = df.drop(columns=["class"])
target = df["class"]

# Shapes of the label vector and the feature matrix, one per line.
print(target.shape, data.shape, sep="\n")
data.head()


In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
# The lowercase `x_test` name is kept as is because the prediction cell uses it.
X_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)

knn = KNeighborsClassifier(n_neighbors=90)

# cross_val_score fits clones of the estimator, so `knn` itself stays unfitted.
knn_scores = cross_val_score(knn, data, target, cv=5)  # 5-fold cross-validation
knn_mean_score = np.mean(knn_scores)

# Fit on the training split so that `pred` and `score` actually exist;
# the original cell used both names without ever defining them.
knn.fit(X_train, y_train)
pred = knn.predict(x_test)
score = knn.score(x_test, y_test)  # accuracy on the held-out split

mse = mean_squared_error(y_test, pred)
rmse = sqrt(mse)
print(f"KNeighbor: {score}")

In [None]:
# Logistic regression with default settings, scored by 5-fold cross-validation.
log_model = LogisticRegression()
log_scores = cross_val_score(estimator=log_model, X=data, y=target, cv=5)
log_mean_score = log_scores.mean()


In [None]:
# Multi-layer perceptron with two hidden layers of 128 units each,
# scored by 5-fold cross-validation.
hidden_layers = (128, 128)
nn_model = MLPClassifier(hidden_layer_sizes=hidden_layers)
nn_scores = cross_val_score(estimator=nn_model, X=data, y=target, cv=5)
nn_mean_score = nn_scores.mean()


In [None]:
# cross_val_score only fits internal clones, so `nn_model` must be fitted on
# the training split here before it can predict.
nn_model.fit(X_train, y_train)
y_pred = nn_model.predict(x_test)
print(y_pred)

In [None]:
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones. The original passed them swapped, which
# produced the transposed matrix.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Mean cross-validation score of each model, one per line.
print(
    f"KNeighbor (CV): {knn_mean_score}",
    f"Logistic Regression (CV): {log_mean_score}",
    f"Neural Network (CV): {nn_mean_score}",
    sep="\n",
)


In [None]:
# Correlation of every column with 'class', as a one-column frame,
# displayed from the highest value to the lowest.
corr = df.corr()["class"].to_frame()
corr.sort_values(by="class", ascending=False)