In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw training and test tables, then report the size of each
# (test first, then train).
df_train, df_test = (pd.read_csv(path) for path in ("data/train.csv", "data/test.csv"))

print(df_test.shape, df_train.shape, sep="\n")

In [None]:
# Data inspection: preview the first rows of the training set.
df_train.head()

In [None]:
# Number of missing entries in each column of the training set.
df_train.isna().sum()


In [None]:
# Training set: for every object-dtype column, print how often each value occurs.
categorical_cols = df_train.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print(f"\n unique values in {col}", df_train[col].value_counts(), sep="\n")

In [None]:

# Test set: for every object-dtype column, print how often each value occurs.
categorical_columns = df_test.select_dtypes(include=['object']).columns
for col in categorical_columns:
    print(f"\nUnique values in {col}:", df_test[col].value_counts(), sep="\n")

In [None]:
# All of the categorical columns have numeric values mixed in with the category codes,
# probably because of noise added to the data, so those values will be removed.
# For each column we define the set of values we expect to see; these expectations were
# found by looking at the intersection of the values in the training and test sets.
df_train.head()



In [None]:

# Expected category codes for each categorical column, keyed by column name.
# Each entry is written as a space-separated string and split into a list of
# single-letter codes; the order of columns and codes is significant only for
# display purposes.
category_mappings = {
    feature: codes.split()
    for feature, codes in (
        ("season", "a u w s"),
        ("cap-shape", "x f s b o"),
        ("cap-surface", "t s y h g"),
        ("cap-color", "n y w g e"),
        ("does-bruise-or-bleed", "f t"),
        ("gill-attachment", "a d x e s"),
        ("gill-spacing", "c d f"),
        ("gill-color", "w n y p g"),
        ("stem-root", "b s r c f"),
        ("stem-surface", "s y i t g k h f"),
        ("stem-color", "w n y g o e u p k r l b"),
        ("veil-type", "u w"),
        ("veil-color", "w y n u k e"),
        ("has-ring", "f t"),
        ("ring-type", "f e z l r p g m"),
        ("spore-print-color", "k p w n r u g"),
        ("habitat", "d g l m h w p u"),
    )
}

In [None]:
def clean_category(column, df, valid_category, threshold):
    """Normalise the noisy categorical ``column`` of ``df`` in place.

    Every non-missing value is converted to text and rewritten by the first
    rule that applies:

    1. text made only of digits (dots ignored, e.g. ``'12.5'``) -> ``'Other'``
    2. case-insensitive match with an entry of ``valid_category`` -> that
       entry, spelled as in ``valid_category`` (first match wins)
    3. share of all rows below ``threshold`` -> ``'Other'``
    4. anything else is kept unchanged

    Missing values stay missing. A plain ``astype(str)`` would turn NaN into
    the text ``'nan'``, which would then be treated as a real category or be
    folded into ``'Other'``.

    Parameters
    ----------
    column : str
        Name of the column to clean.
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    valid_category : iterable of str
        Accepted category codes.
    threshold : float
        Minimum share of rows (0-1) an unlisted value needs in order to be
        kept instead of being replaced by ``'Other'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` replaced by its cleaned version.
    """
    raw = df[column]
    is_missing = raw.isna()
    as_text = raw.astype(str)

    # Share of each distinct non-missing value, relative to ALL rows so that
    # the threshold keeps the same meaning however many values are missing.
    shares = as_text[~is_missing].value_counts() / len(raw)

    # Lower-cased code -> canonical spelling, built once instead of per row.
    canonical = {}
    for cat in valid_category:
        canonical.setdefault(cat.lower(), cat)

    def resolve(value, share):
        # Numeric noise is checked first, so it wins even over a listed code.
        if value.replace('.', '').isdigit():
            return 'Other'
        match = canonical.get(value.lower())
        if match is not None:
            return match
        return 'Other' if share < threshold else value

    # Decide once per distinct value, then translate the whole column through
    # a lookup table rather than calling Python code for every row.
    replacements = {value: resolve(value, share) for value, share in shares.items()}

    # where() re-blanks missing rows in case their text form ('nan') happens
    # to coincide with a genuine value present in the lookup table.
    df[column] = as_text.map(replacements).where(~is_missing)

    return df
    

In [None]:
# Clean every mapped column, first in the training frame and then in the test
# frame, using the same whitelist and the same rarity threshold for both.
for column, valid in category_mappings.items():
    df_train, df_test = [
        clean_category(column, frame, valid, 0.001)
        for frame in (df_train, df_test)
    ]

In [None]:
# Show the value frequencies of every cleaned column in the test set.
for column in category_mappings:
    print(f"\nUniques in {column} after cleaning:", df_test[column].value_counts(), sep="\n")

In [None]:
# For interpretability: optionally replace the single-letter codes with readable labels (currently disabled).
#df_train['class'] = df_train['class'].replace({'e': 'Edible', 'p': 'Poisonous'})
#df_train['season'] = df_train['season'].replace({'a':'autum','u':'summer', 'w': 'winter', 's':'spring'})

In [None]:
# Number of distinct values in each object-dtype column of the test set.
# Calling value_counts() on the column Index only counts the column names,
# which are unique, so it printed 1 for every column.
# NOTE(review): per-column cardinality is assumed to be the intent - confirm.
counts = df_test.select_dtypes(include=['object']).nunique()
print(counts)

In [None]:
#Visualization

In [None]:
# Bar chart of how many training rows fall into each value of 'class'.
counts = df_train['class'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=counts.index, y=counts.values, palette='viridis', ax=ax)
ax.set_title('Distribution of Edible and Poisonous Mushrooms')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart of how often each cap colour occurs in the training set.
cap_color_counts = df_train['cap-color'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=cap_color_counts.index,
    y=cap_color_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Cap Colors')
ax.set_xlabel('Cap Color')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Scatter plot of stem height against stem width, coloured by 'class'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_train,
    x='stem-height',
    y='stem-width',
    hue='class',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Stem Height vs. Stem Width')
ax.set_xlabel('Stem Height (cm)')
ax.set_ylabel('Stem Width (mm)')
ax.legend(title='Class')
plt.show()

In [None]:
# Too noisy to extrapolate anything. Do stem width and height tend to be proportional for poisonous mushrooms?

In [None]:
# Preview the first rows of the test set.
df_test.head()

In [None]:
# Write both frames to CSV without the index column.
output_train, output_test = 'data/cleanedtrain.csv', 'data/cleanedtest.csv'
for frame, destination in ((df_train, output_train), (df_test, output_test)):
    frame.to_csv(destination, index=False)
