# Duke Phung Project 4
## Import all modules needed for the analysis.  The dataset is then loaded into a DataFrame, and the values in the columns used are converted to numeric codes.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

try:
    # scikit-learn >= 0.18 provides train_test_split in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only have the (since removed)
    # cross_validation module.
    from sklearn.cross_validation import train_test_split

# Load only the three columns used in this analysis and, in the same
# expression, give the two feature columns names that work with attribute
# access (e.g. ``mushrooms.Odor_type``).
mushrooms = pd.read_csv(
    'mushroomData.csv',
    usecols=['Consumable', 'Odor', 'Spore-sprint-color'],
).rename(columns={'Odor': 'Odor_type', 'Spore-sprint-color': 'Spore_color'})



In [2]:
# Convert the single-letter category codes in each column to integers.
#
# Each result is assigned back to its column instead of calling
# ``replace(..., inplace=True)`` on ``mushrooms[...]`` / ``mushrooms.<col>``.
# That form mutates a temporary Series; with pandas Copy-on-Write the parent
# DataFrame is left untouched and the columns would stay as letters.
# Values that are not keys of a mapping are left as they are by ``replace``.

# consumable code: 0 = edible, 1 = poisonous
consumable = {'e': 0, 'p': 1}
mushrooms['Consumable'] = mushrooms['Consumable'].replace(consumable)

# odor code: 1 = pungent, 2 = almond, 3 = anise, 4 = none, 5 = foul, 6 = creosote, 7 = fishy, 8 = spicy, 9 = musty
odor = {'p': 1, 'a': 2, 'l': 3, 'n': 4, 'f': 5, 'c': 6, 'y': 7, 's': 8, 'm': 9}
mushrooms['Odor_type'] = mushrooms['Odor_type'].replace(odor)

# print_color: 1 = black, 2 = brown, 3 = purple, 4 = chocolate, 5 = white, 6 = green, 7 = orange, 8 = yellow, 9 = buff
print_color = {'k': 1, 'n': 2, 'u': 3, 'h': 4, 'w': 5, 'r': 6, 'o': 7, 'y': 8, 'b': 9}
mushrooms['Spore_color'] = mushrooms['Spore_color'].replace(print_color)

# Show the first rows so the encoding can be checked by eye.
mushrooms.head()

Unnamed: 0,Consumable,Odor_type,Spore_color
0,1,1,1
1,0,2,2
2,0,3,2
3,1,1,1
4,0,4,2


# Using the train/test split evaluation method, the feature odor type will be the first to predict the response.  Since the test_size is not specified, the default size of 25% will be used for the test size.

In [3]:
# Response: the Consumable column as a 1-D Series.
y = mushrooms['Consumable']
# Features: Odor_type only; selecting with a list keeps X a one-column DataFrame.
X = mushrooms[['Odor_type']]

In [4]:
# Step 1: divide X and y into training and testing portions.
# random_state is fixed so the same rows land in each portion on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=4,
)

In [5]:
# Print the dimensions of each piece of the split, in the order:
# training features, training response, test features, test response.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(6093, 1)
(6093,)
(2031, 1)
(2031,)


### Using the KNN model to measure how accurately the odor feature predicts the consumable response.

In [6]:
# Train a 15-neighbour KNN classifier on the training rows (``fit`` returns the
# estimator, so construction and fitting are chained), predict the held-out
# rows, and print the fraction predicted correctly.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred)
print(knn_accuracy)

0.9857213195470211


### Using Logistic Regression to measure accuracy with the same feature and response.  As seen from the output below, Logistic Regression had an accuracy of 95.3% while K Nearest Neighbors had an accuracy of 98.6%.

In [7]:
# Train a default-configured logistic regression on the training rows,
# predict the held-out rows, and print the fraction predicted correctly.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
logreg_accuracy = metrics.accuracy_score(y_test, y_pred)
print(logreg_accuracy)

0.9527326440177253


# Now we will use another feature spore color and measure the accuracy in predicting the response.

In [8]:
# Response is unchanged; the feature matrix is now the Spore_color column alone
# (list selector keeps it a one-column DataFrame).
y = mushrooms['Consumable']
X = mushrooms[['Spore_color']]

# Same split settings as before: fixed random_state for a repeatable partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=4,
)

In [9]:
# Print the dimensions of each piece of the split, in the order:
# training features, training response, test features, test response.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(6093, 1)
(6093,)
(2031, 1)
(2031,)


### First, determine how accurately the second feature predicts the response using the KNN model.

In [10]:
# Train a 15-neighbour KNN classifier on the spore-color training rows,
# predict the held-out rows, and print the fraction predicted correctly.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred)
print(knn_accuracy)

0.8537666174298375


### Now we use Logistic Regression to measure how accurately the feature predicts the response.  The KNN model is more accurate than Logistic Regression here (85.4% vs. 82.8%), but even with KNN the feature still only has about 85% accuracy in predicting the response.

In [11]:
# Train a default-configured logistic regression on the spore-color training
# rows, predict the held-out rows, and print the fraction predicted correctly.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
logreg_accuracy = metrics.accuracy_score(y_test, y_pred)
print(logreg_accuracy)

0.828163466272772


# Finally, we can use both the odor type and the spore color together to determine if either KNN and/or Logistic Regression models will have better accuracy.

In [12]:
# Use both encoded feature columns together; the response is unchanged.
combo_cols = ['Odor_type', 'Spore_color']
y = mushrooms['Consumable']
X = mushrooms[combo_cols]

# Same split settings as before: fixed random_state for a repeatable partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=4,
)

### Using the train/test split evaluation, we will test the accuracy of Logistic Regression.  The output below indicates an 85.8% accuracy.

In [13]:
# Train a default-configured logistic regression on the two-feature training
# rows, predict the held-out rows, and print the fraction predicted correctly.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
logreg_accuracy = metrics.accuracy_score(y_test, y_pred)
print(logreg_accuracy)

0.8581979320531757


### The same train/test data is now used in the KNN model.  The output indicates a 99.6% accuracy in predicting the response using both features.  

In [14]:
# Train a 15-neighbour KNN classifier on the two-feature training rows,
# predict the held-out rows, and print the fraction predicted correctly.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred)
print(knn_accuracy)

0.9955686853766618


### Final Conclusion:  Odor type and spore color are features used in predicting consumability.  Using the KNN model, odor type predicts whether a mushroom is poisonous more accurately than spore color, with accuracies of 98.6% and 85.4%, respectively (95.3% and 82.8% with Logistic Regression).  When used together in the KNN model, spore color and odor type have an accuracy of 99.6%; Logistic Regression on both features reached only 85.8%.  My conclusion is that spore color should be used in conjunction with odor type to determine the response.  Since more features are available within the dataset, further experimentation may be able to raise the accuracy even further.