# Mushroom Dataset Preprocessing
## IS 362 Assignment - Preprocessing Data for scikit-learn

### 1. Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlretrieve
import os
%matplotlib inline

### 2. Download and Load the Dataset

In [None]:
# Download the dataset if not already present.
# The fetch goes to a temporary ".part" file that is renamed only after
# urlretrieve succeeds, so an interrupted download is never mistaken for a
# complete local copy on the next run.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
data_path = "mushrooms.data"
if not os.path.exists(data_path):
    print("Downloading dataset...")
    partial_path = data_path + ".part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, data_path)
    finally:
        # Remove the leftover partial file if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
else:
    print("Dataset already exists locally")

# Load the data; header=None makes pandas number the columns 0..N-1.
df = pd.read_csv(data_path, header=None)

# The column mapping applied later names 23 columns (0-22); fail early if the
# local file does not have that layout (e.g. a corrupted or wrong file).
assert df.shape[1] == 23, f"Expected 23 columns in {data_path}, found {df.shape[1]}"

### 3. Data Dictionary and Column Selection

In [None]:
# Column names mapping: positional column index in the raw file -> descriptive name
column_names = {
    0: "edibility",
    1: "cap_shape",
    2: "cap_surface",
    3: "cap_color",
    4: "bruises",
    5: "odor",
    6: "gill_attachment",
    7: "gill_spacing",
    8: "gill_size",
    9: "gill_color",
    10: "stalk_shape",
    11: "stalk_root",
    12: "stalk_surface_above_ring",
    13: "stalk_surface_below_ring",
    14: "stalk_color_above_ring",
    15: "stalk_color_below_ring",
    16: "veil_type",
    17: "veil_color",
    18: "ring_number",
    19: "ring_type",
    20: "spore_print_color",
    21: "population",
    22: "habitat"
}

# Select columns: edibility (target), odor, and cap_color
selected_columns = ["edibility", "odor", "cap_color"]

# Rename and select in one chained expression instead of mutating in place.
# The explicit .copy() makes the result an independent DataFrame, so later
# column assignments do not write into a slice of the full frame
# (which triggers SettingWithCopyWarning on pandas without Copy-on-Write).
df = df.rename(columns=column_names)[selected_columns].copy()

df.head()

### 4. Convert Categorical Values to Numeric

In [None]:
# Value mappings: single-letter category code -> integer code, one dict per column
value_mappings = {
    "edibility": {"e": 0, "p": 1},
    "odor": {
        "a": 0,  # almond
        "l": 1,  # anise
        "c": 2,  # creosote
        "y": 3,  # fishy
        "f": 4,  # foul
        "m": 5,  # musty
        "n": 6,  # none
        "p": 7,  # pungent
        "s": 8   # spicy
    },
    "cap_color": {
        "n": 0,  # brown
        "b": 1,  # buff
        "c": 2,  # cinnamon
        "g": 3,  # gray
        "r": 4,  # green
        "p": 5,  # pink
        "u": 6,  # purple
        "e": 7,  # red
        "w": 8,  # white
        "y": 9   # yellow
    }
}

# Apply mappings
encoded_columns = {}
for column in selected_columns:
    codes = value_mappings[column]

    # Re-running this cell must not push already-encoded integers through the
    # letter-keyed mapping again: Series.map would turn every value into NaN.
    # Columns that already hold only valid integer codes are left untouched.
    if df[column].isin(list(codes.values())).all():
        continue

    encoded = df[column].map(codes)

    # Series.map yields NaN for any value that is missing from the mapping;
    # report those values instead of silently carrying NaN into the output.
    unmapped = df.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column '{column}' contains values with no numeric mapping: {unmapped}. "
            "Reload the raw data before re-running this cell."
        )

    encoded_columns[column] = encoded.astype("int64")

# assign() returns a new DataFrame, so nothing is written into a possible
# slice of the originally loaded frame.
df = df.assign(**encoded_columns)

df.head()

### 5. Exploratory Data Analysis

In [None]:
# Count plots for the target and both features, drawn side by side.
# Each entry: (column to count, panel title, tilt the x tick labels?)
panels = [
    ('edibility', 'Edibility Distribution (0=edible, 1=poisonous)', False),
    ('odor', 'Odor Distribution', True),
    ('cap_color', 'Cap Color Distribution', True),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))

for ax, (feature, heading, tilt_labels) in zip(axes, panels):
    sns.countplot(x=feature, data=df, ax=ax)
    ax.set_title(heading)
    if tilt_labels:
        # Rotate the existing x tick labels of this panel only.
        plt.setp(ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Strip plots of each encoded feature against the edibility target.
# Each entry: (feature column on the y axis, panel title, y axis label)
panels = [
    ('odor', 'Edibility vs Odor', 'Odor Code'),
    ('cap_color', 'Edibility vs Cap Color', 'Cap Color Code'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))

for ax, (feature, heading, y_label) in zip(axes, panels):
    # jitter spreads the overlapping points; alpha keeps dense clusters readable.
    sns.stripplot(x='edibility', y=feature, data=df, jitter=True, alpha=0.5, ax=ax)
    ax.set_title(heading)
    ax.set_xlabel('Edibility (0=edible, 1=poisonous)')
    ax.set_ylabel(y_label)

fig.tight_layout()
plt.show()

### 6. Statistical Analysis

In [None]:
# Cross-tabulations: raw counts of edibility against each feature,
# including row/column totals (margins).
count_tables = [
    ("Cross-tabulation of Edibility and Odor:", 'odor'),
    ("\nCross-tabulation of Edibility and Cap Color:", 'cap_color'),
]
for heading, feature in count_tables:
    print(heading)
    print(pd.crosstab(df['edibility'], df[feature], margins=True))

# Percentage analysis: for every feature value, the share of each edibility
# class (each row is normalized to sum to 1, rounded to two decimals).
share_tables = [
    ("\nPercentage by Odor:", 'odor'),
    ("\nPercentage by Cap Color:", 'cap_color'),
]
for heading, feature in share_tables:
    print(heading)
    shares = pd.crosstab(df[feature], df['edibility'], normalize='index')
    print(shares.round(2))

### 7. Save Processed Data

In [None]:
# Write the encoded frame to disk without the index column, then confirm.
output_path = 'processed_mushrooms.csv'
df.to_csv(output_path, index=False)
print(f"Saved processed data as '{output_path}'")

### 8. Conclusions

**Preliminary Conclusions:**

1. **Odor is a strong predictor** of edibility:
   - Almond (0) and anise (1) odors are always edible (100%)
   - Creosote (2), fishy (3), foul (4), musty (5), pungent (7), and spicy (8) odors are always poisonous (100%)
   - No odor (6) is almost always edible (about 97%), with a small poisonous minority
   
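2. **Cap color shows some patterns** but is less predictive:
   - Green (4) and purple (6) are always edible in this dataset (100%), but both are very rare (16 samples each)
   - Cinnamon (2) and white (8) are mostly edible (about 70%), while buff (1) is mostly poisonous (about 70%)
   - Brown (0), gray (3), pink (5), red (7), and yellow (9) show mixed results (roughly 37-56% edible)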
   
3. **Recommendation for Project 4:**
   - Odor should be a primary feature for prediction
   - Cap color might provide some additional predictive power
   - Consider exploring other features if higher accuracy is needed