In [97]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelSpreading
%matplotlib qt

In [98]:
# Load the labeled dataset and the dataset without species labels (in that order)
df_label, df_unlabeled = (
    pd.read_csv(csv_name)
    for csv_name in ('Synthetic_AL_5.csv', 'agaricus-lepiota_labeled_columns.csv')
)

In [99]:
# Missing values in the unlabeled dataset are stored as the string '?' instead of NaN.
# Count them per column. DataFrame.sum() is used rather than np.sum(DataFrame):
# the numpy call relies on the axis=None reduction, whose per-column behaviour
# pandas has deprecated in favour of a single scalar result.
(df_unlabeled == '?').sum()

Edible?                        0
Cap-shape                      0
Cap-surface                    0
Cap-color                      0
Bruises?                       0
Odor                           0
Gill-attacment                 0
Gill-spacing                   0
Gill-size                      0
Gill-color                     0
Stalk-shape                    0
Stalk-root                  2480
Stalk-surface-above-ring       0
Stalk-surface-below-ring       0
Stalk-color-above-ring         0
Stalk-color-below-ring         0
Veil-Type                      0
Veil-Color                     0
Ring-number                    0
Ring-Type                      0
Spore-print-color              0
Population                     0
Habitat                        0
dtype: int64

In [100]:
# Replace the '?' placeholder with np.nan.
# DataFrame.mask is used because assigning through .iloc with a boolean
# DataFrame indexer was deprecated and is rejected by pandas >= 2.0.
df_unlabeled = df_unlabeled.mask(df_unlabeled == '?', np.nan)

# Per-column count of missing values after the replacement
df_unlabeled.isna().sum()

Edible?                        0
Cap-shape                      0
Cap-surface                    0
Cap-color                      0
Bruises?                       0
Odor                           0
Gill-attacment                 0
Gill-spacing                   0
Gill-size                      0
Gill-color                     0
Stalk-shape                    0
Stalk-root                  2480
Stalk-surface-above-ring       0
Stalk-surface-below-ring       0
Stalk-color-above-ring         0
Stalk-color-below-ring         0
Veil-Type                      0
Veil-Color                     0
Ring-number                    0
Ring-Type                      0
Spore-print-color              0
Population                     0
Habitat                        0
dtype: int64

In [101]:
# Count the missing values per column of the labeled dataset.
# .isna().sum() always reduces column-wise, whereas np.sum(DataFrame) depends on
# the axis=None reduction that pandas has deprecated.
df_label.isna().sum()

Edible?                      0
Cap-shape                    0
Cap-surface                  5
Cap-color                    0
Bruises?                    25
Odor                        25
Gill-attacment               0
Gill-spacing                 0
Gill-size                    0
Gill-color                   0
Stalk-shape                  0
Stalk-root                  95
Stalk-surface-above-ring     0
Stalk-surface-below-ring     0
Stalk-color-above-ring       0
Stalk-color-below-ring       0
Veil-Type                    0
Veil-Color                   0
Ring-number                  0
Ring-Type                    0
Spore-print-color            0
Population                  10
Habitat                      0
Species                      0
dtype: int64

We see that there are a few missing values in our labeled dataset. We also see that a lot of the samples are missing the stalk-root feature. The absence of this feature might be helpful in identifying those samples which do have the feature.

Because of this, we will fill the empty data with a value that represents that the data is missing ('?')

In [102]:
# Mark every missing entry in both datasets with the '?' placeholder
df_label, df_unlabeled = (frame.fillna('?') for frame in (df_label, df_unlabeled))

For the Label Spreading algorithm we have to make one big dataset with our labeled and unlabeled datapoints. We will encode the known labels and then assign the label -1 to all unlabeled samples.

In [103]:
from sklearn.preprocessing import LabelEncoder

# Encode the species names of the labeled dataset as integer class ids.
# `le` is kept around so the ids can be mapped back to species names later.
le = LabelEncoder()
enc_label = le.fit_transform(df_label['Species'])

df_label['Species'] = enc_label

# Assign the dummy label -1 to every row of the unlabeled dataset
df_unlabeled['Species'] = -1

# Combine both datasets into a single frame with a fresh 0..n-1 index
df = pd.concat([df_label, df_unlabeled], ignore_index=True)

# Shuffle the rows. A fixed random_state keeps the row order (and therefore every
# downstream result) identical across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [104]:
# Preview the first rows of the combined, shuffled dataset
df.head()

Unnamed: 0,Edible?,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attacment,Gill-spacing,Gill-size,Gill-color,...,Stalk-color-above-ring,Stalk-color-below-ring,Veil-Type,Veil-Color,Ring-number,Ring-Type,Spore-print-color,Population,Habitat,Species
2390,e,x,y,n,t,n,f,c,b,u,...,w,p,p,w,o,p,n,y,d,-1
3124,p,x,f,g,f,f,f,c,b,h,...,p,p,p,w,o,l,h,y,p,-1
6306,p,x,s,n,f,f,f,c,n,b,...,p,w,p,w,o,e,w,v,l,-1
3524,e,f,y,n,t,n,f,c,b,u,...,g,w,p,w,o,p,k,v,d,-1
2038,e,x,f,g,f,n,f,w,b,h,...,w,w,p,w,o,e,n,s,g,-1


The next part is an important step. We know the edibility of all samples, which is good for someone who is out and about eating mushrooms in the wild. But if we are to predict the species of a mushroom just from its physical characteristics, we cannot possibly know the edibility of a mushroom without knowing its species. Therefore we chose to drop this column before proceeding any further (no training of any kind has been done so far).

In [105]:
# Remove the edibility column so it cannot be used as an input feature
df = df.drop(columns=['Edible?'])

In [106]:
# Separate the target column (y) from the feature columns (X)
y = df['Species']
X = df.drop(columns=['Species'])

In [107]:
# One-hot encode the input features before handing them to the algorithm
from sklearn.preprocessing import OneHotEncoder

# `enc` stays available so the same encoding can be re-applied later
enc = OneHotEncoder()
enc.fit(X)
X = enc.transform(X)

In [108]:
# Detect outliers with an Isolation Forest and drop them from X and y
from sklearn.ensemble import IsolationForest

# random_state pins the otherwise random forest so the same rows are removed on every run
outlier_det = IsolationForest(contamination=0.01, random_state=42)

# Despite its name, X_clean holds the per-row prediction (1 = inlier, -1 = outlier).
# The name is kept because the outlier plot further down reads it.
X_clean = outlier_det.fit(X).predict(X)

# Dense copy of the one-hot matrix, still containing the outlier rows
X_with_outlier = X.toarray()

# Keep only the rows predicted as inliers; the same mask is applied to X and y
# so they stay aligned
inlier_mask = X_clean == 1
X = X_with_outlier[inlier_mask, :]
y = y[inlier_mask]






In [109]:
# Reduce the features to 40 principal components, the value at the elbow of the PCA analysis
from sklearn.decomposition import PCA

# NOTE(review): the variable is named pca_16 although it keeps 40 components;
# the name is left unchanged in case other cells refer to it.
# random_state makes the result reproducible when PCA picks its randomized SVD solver.
pca_16 = PCA(n_components=40, random_state=42)

X = pca_16.fit_transform(X)


In [110]:
# Fit the label spreading model (RBF kernel) on the labeled and unlabeled samples together
from sklearn.semi_supervised import LabelSpreading

label_prop_model = LabelSpreading(kernel='rbf', gamma=23, alpha=0.2).fit(X, y)

# Integer class ids predicted for every sample; later cells use them to colour plots
y_color = label_prop_model.predict(X)

# The same predictions mapped back to species names
y_pred = le.inverse_transform(y_color)

In [111]:
# Project the data onto 3 principal components, for visualization purposes only.
# random_state makes the projection reproducible when PCA picks its randomized SVD solver.
pca = PCA(n_components=3, random_state=42)

X_3pca = pca.fit_transform(X)

For reference, we will plot the unlabeled dataset and the original labeled points

In [112]:
# Plot both original datasets together in the 3-component PCA space
import matplotlib.pyplot as plt

# 1 for samples that came with a species label, 0 for the unlabeled ones (label -1)
s = (y != -1).astype(int)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')

# One coordinate array per principal component
pc1, pc2, pc3 = X_3pca[:, 0], X_3pca[:, 1], X_3pca[:, 2]
ax.scatter(pc1, pc2, pc3, alpha=0.2, c=s, cmap='coolwarm')

ax.set_title('True labels')
plt.show()

In [113]:
# Plot the location of the detected outliers within the full (unfiltered) dataset
from sklearn.decomposition import PCA

# Re-encode every row of df, outliers included, with the already fitted encoder
X_with_outliers = df.drop('Species', axis=1)
X_with_outliers = enc.transform(X_with_outliers)

# Use a dedicated PCA instance here: re-fitting the shared `pca` object would
# silently replace the projection that produced X_3pca.
pca_outlier = PCA(n_components=3, random_state=42)
X_3pca_outlier = pca_outlier.fit_transform(X_with_outliers.toarray())

fig = plt.figure()
ax = fig.add_subplot(projection='3d')

# 1 for rows the Isolation Forest predicted as outliers (-1), 0 for inliers
s = (X_clean == -1).astype(int)

ax.scatter(xs=X_3pca_outlier[:, 0], ys=X_3pca_outlier[:, 1], zs=X_3pca_outlier[:, 2],
           alpha=0.2, c=s, cmap='coolwarm')

plt.title('Outliers')
plt.show()




In [136]:
# Highlight one predicted species in the 3-component PCA space

# Encoded id of the species to highlight
sp = 22

# 1 where the model predicted species `sp`, 0 everywhere else
s = (y_color == sp).astype(int)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')

ax.scatter(X_3pca[:, 0], X_3pca[:, 1], X_3pca[:, 2], alpha=0.2, c=s, cmap='coolwarm')

# Title the figure with the species name that corresponds to the encoded id
species_name = le.inverse_transform([sp])[0]
ax.set_title(species_name)
plt.show()

  fig = plt.figure()


In [137]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import matplotlib.cm as cm

# Vertical spacing (in plot rows) inserted between consecutive cluster bands.
# The same value sizes the y-axis below, so the topmost bands are not clipped
# when there are many clusters.
CLUSTER_GAP = 150

# Mean silhouette coefficient over all samples, and the per-sample values
silhouette_avg = silhouette_score(X, y_color)
sample_silhouette_values = silhouette_samples(X, y_color)

# Sorted unique predicted labels, computed once instead of rebuilding a set on every use
cluster_labels = np.unique(y_color)
n_clusters = cluster_labels.size

fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(7, 20)

# Horizontal range of silhouette coefficients to display
ax1.set_xlim([-0.35, 0.8])
# One row per sample plus one gap per cluster (and one spare gap as head room)
ax1.set_ylim([0, len(X) + (n_clusters + 1) * CLUSTER_GAP])

y_lower = 10
for i in cluster_labels:
    # Silhouette values of the samples assigned to label i, in ascending order
    ith_cluster_silhouette_values = np.sort(sample_silhouette_values[y_color == i])

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=0.7)

    # Write the species name next to the middle of its band
    ax1.text(-0.3, y_lower + 0.5 * size_cluster_i, le.inverse_transform([i])[0])

    # Lower edge of the next band
    y_lower = y_upper + CLUSTER_GAP

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# The vertical line marks the average silhouette score of all samples
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])  # Clear the y-axis labels / ticks
# Trailing semicolon keeps the list of tick objects out of the cell output
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]);

  fig, (ax1) = plt.subplots(1, 1)


[<matplotlib.axis.XTick at 0x23682387520>,
 <matplotlib.axis.XTick at 0x236823871f0>,
 <matplotlib.axis.XTick at 0x23685ce7cd0>,
 <matplotlib.axis.XTick at 0x23681053400>,
 <matplotlib.axis.XTick at 0x23681051430>,
 <matplotlib.axis.XTick at 0x236823626d0>,
 <matplotlib.axis.XTick at 0x23681053af0>]

Now that we have the infrastructure of our model and a metric to evaluate it (the mean silhouette coefficient), we can optimize the model for the highest performance. High performance in our case would mean that all sets of mushroom labels are more or less evenly distributed and that each is part of a well-defined cluster.

In [195]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# NOTE(review): this rebinds y to the labels of *all* rows of df, including the rows
# removed as outliers, so y no longer lines up with X. It is not used by the search
# below; kept only to preserve the notebook's existing state -- confirm it is needed.
y = df['Species']


def silh_score(estimator, X, y=None):
    """Scorer for GridSearchCV: mean silhouette coefficient of the predicted labels.

    make_scorer(silhouette_score) cannot be used for this, because it would call the
    metric as silhouette_score(y_true, y_pred), while the metric expects
    (samples, labels). A scorer callable with the (estimator, X, y) signature
    has access to the samples.
    """
    return silhouette_score(X, estimator.predict(X))


parameters = {'kernel': ['rbf'], 'gamma': [10, 20], 'alpha': [0.2, 0.5, 0.7], 'n_jobs': [-1]}

# The search ranks candidates with the estimator's default score, as before.
# Pass scoring=silh_score to rank by mean silhouette coefficient instead.
clf = GridSearchCV(label_prop_model, parameters)

# X is a dense ndarray when it comes from the PCA cell and a sparse matrix when it
# comes straight from the one-hot encoder; only the latter has .toarray().
X_dense = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

# NOTE(review): the targets are the model's own predictions (y_color), so the default
# score measures agreement with those predictions rather than with true labels.
clf.fit(X_dense, y_color)

GridSearchCV(estimator=LabelSpreading(),
             param_grid={'alpha': [0.2, 0.5, 0.7], 'gamma': [10, 20],
                         'kernel': ['rbf'], 'n_jobs': [-1]})

In [196]:
# Per-candidate fit/score timings and parameter values recorded by the grid search
clf.cv_results_

{'mean_fit_time': array([2.14140482, 2.24591174, 2.94278245, 3.28548503, 4.02091799,
        4.56457705]),
 'std_fit_time': array([0.08852708, 0.01276748, 0.10005957, 0.10280252, 0.09108414,
        0.1360113 ]),
 'mean_score_time': array([0.23157973, 0.23590355, 0.22447634, 0.22839527, 0.23010011,
        0.23843503]),
 'std_score_time': array([0.01804504, 0.00619902, 0.01799067, 0.01543918, 0.02035703,
        0.01300039]),
 'param_alpha': masked_array(data=[0.2, 0.2, 0.5, 0.5, 0.7, 0.7],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_gamma': masked_array(data=[10, 20, 10, 20, 10, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'rbf', 'rbf', 'rbf', 'rbf', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_jobs': mas

In [120]:
# Best score found by the grid search
clf.best_score_

0.8714673490949123

In [194]:
# Parameter combination that produced the best score
clf.best_params_

{'alpha': 0.2, 'gamma': 20, 'kernel': 'rbf', 'n_jobs': -1, 'n_neighbors': 7}