In [481]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

In [134]:
df = pd.read_csv('https://raw.githubusercontent.com/DSEI21000-S21/project-mushroom-data-classification/main/Data/Agaricus_Lepiota_Labeled_FINAL.csv')

In [135]:
df

Unnamed: 0,Edible?,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attacment,Gill-spacing,Gill-size,Gill-color,...,Stalk-color-above-ring,Stalk-color-below-ring,Veil-Type,Veil-Color,Ring-number,Ring-Type,Spore-print-color,Population,Habitat,Species
0,p,x,f,n,f,n,f,w,n,w,...,n,n,p,w,o,e,w,y,l,Lepiota acutesquamosa
1,,f,y,c,,,,,,,...,,,,,,,,s,,Lepiota acutesquamosa
2,,k,,,,,,,,,...,,,,,,,,,,Lepiota acutesquamosa
3,e,k,s,p,t,n,f,c,b,w,...,w,w,p,w,o,e,w,c,w,Lepiota americana
4,,x,y,c,,,,,,e,...,,,,,,,,y,,Lepiota americana
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,,,,,,,,,,u,...,,,,,,,,n,,Agaricus subrutilescens
78,,,,,,,,,,h,...,,,,,,,,,,Agaricus subrutilescens
79,p,x,s,w,t,f,f,c,b,w,...,w,w,p,w,o,f,h,s,u,Agaricus xanthodermus
80,,f,,g,,,,,,p,...,,,,,,p,,v,p,Agaricus xanthodermus


In [136]:
# Checking number of species

len(df.Species.value_counts())

23

In [265]:
# Choosing 350 to have enough training set (350 * 23) 

samples_per_species = 350

In [266]:
species = list(set(df.iloc[:,-1].values));

In [267]:
df_bin = [];

for sp in species:
    temp = df.loc[df.iloc[:,-1] == sp,:] 
    mask = temp.notna()
    dummy = pd.DataFrame(list(range(samples_per_species)))    
        
    for ft in df.columns:
        col = [];
            
        if (mask.loc[:,ft].values.sum() > 0):  
            bin_val = temp.loc[mask.loc[:,ft]==True,ft]
        
            for i in range(samples_per_species):
                rsample = bin_val.sample(n=1,replace = True).values[0]
                col.append([rsample])
                
        else:
            
            for i in range(samples_per_species):
                col.append([np.nan])
                
        dummy = pd.concat([dummy,pd.DataFrame(col)],axis=1)
                
    chunk = dummy.iloc[:,1:np.size(dummy.columns)]
    df_bin.append(chunk)
                
    
out = pd.concat(df_bin)
out.columns = df.columns
df.dtypes
out.head()

Unnamed: 0,Edible?,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attacment,Gill-spacing,Gill-size,Gill-color,...,Stalk-color-above-ring,Stalk-color-below-ring,Veil-Type,Veil-Color,Ring-number,Ring-Type,Spore-print-color,Population,Habitat,Species
0,p,f,s,w,f,n,f,c,n,w,...,w,w,p,w,o,p,w,y,d,Lepiota rubrotincta
1,p,x,y,p,f,n,f,c,n,w,...,w,w,p,w,o,p,w,y,d,Lepiota rubrotincta
2,p,k,s,w,f,n,f,c,n,w,...,w,w,p,w,o,p,w,c,w,Lepiota rubrotincta
3,p,k,s,p,f,n,f,c,n,w,...,w,w,p,w,o,p,w,s,d,Lepiota rubrotincta
4,p,f,y,w,f,n,f,c,n,w,...,w,w,p,w,o,p,w,c,l,Lepiota rubrotincta


In [268]:
len(out)

8050

In [518]:
df_unlabeled = pd.read_csv('Label_Spreading_Output.csv')

In [519]:
df_unlabeled

Unnamed: 0,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attacment,Gill-spacing,Gill-size,Gill-color,Stalk-shape,...,Stalk-color-above-ring,Stalk-color-below-ring,Veil-Type,Veil-Color,Ring-number,Ring-Type,Spore-print-color,Population,Habitat,Species
0,b,f,n,f,n,f,c,n,p,e,...,w,w,p,w,o,p,k,v,u,Agaricus bitorquis
1,x,y,g,t,n,f,c,b,n,t,...,g,g,p,w,o,p,n,y,d,Agaricus haemorrhoidarius
2,k,y,n,f,y,f,c,n,b,t,...,w,w,p,w,o,e,w,v,p,Lepiota cristata
3,x,s,n,f,s,f,c,n,b,t,...,w,p,p,w,o,e,w,v,p,Lepiota cristata
4,x,y,g,f,f,f,c,b,g,e,...,p,n,p,w,o,l,h,v,d,Agaricus meleagris
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8234,f,y,n,f,s,f,c,n,b,t,...,w,w,p,w,o,e,w,v,p,Lepiota cristata
8235,x,s,y,t,a,f,c,b,w,e,...,w,w,p,w,o,p,n,s,m,Agaricus arvensis
8236,b,y,y,t,a,f,c,b,n,e,...,w,w,p,w,o,p,n,s,g,Agaricus arvensis
8237,f,f,n,t,n,f,c,b,n,t,...,g,g,p,w,o,p,k,v,d,Agaricus haemorrhoidarius


In [370]:
# Dropping edible column because of it's high collinearlity with Species type

labeled_df_no_edible = out.drop(['Edible?'], axis=1)

In [520]:
labeled_df_no_edible

Unnamed: 0,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attacment,Gill-spacing,Gill-size,Gill-color,Stalk-shape,...,Stalk-color-above-ring,Stalk-color-below-ring,Veil-Type,Veil-Color,Ring-number,Ring-Type,Spore-print-color,Population,Habitat,Species
0,f,s,w,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,y,d,Lepiota rubrotincta
1,x,y,p,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,y,d,Lepiota rubrotincta
2,k,s,w,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,c,w,Lepiota rubrotincta
3,k,s,p,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,s,d,Lepiota rubrotincta
4,f,y,w,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,c,l,Lepiota rubrotincta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,f,s,w,t,l,f,c,n,w,t,...,w,w,p,w,o,p,n,c,d,Agaricus abruptibulbus
346,x,s,w,t,a,f,c,n,p,t,...,w,w,p,w,o,p,u,c,d,Agaricus abruptibulbus
347,x,s,w,t,l,f,c,n,n,t,...,w,w,p,w,o,p,n,c,d,Agaricus abruptibulbus
348,x,f,w,t,l,f,c,n,w,t,...,w,w,p,w,o,p,n,c,d,Agaricus abruptibulbus


In [521]:
df_all = pd.concat([labeled_df_no_edible, df_unlabeled])
df_all.reset_index(drop=True, inplace=True)
df_all

Unnamed: 0,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attacment,Gill-spacing,Gill-size,Gill-color,Stalk-shape,...,Stalk-color-above-ring,Stalk-color-below-ring,Veil-Type,Veil-Color,Ring-number,Ring-Type,Spore-print-color,Population,Habitat,Species
0,f,s,w,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,y,d,Lepiota rubrotincta
1,x,y,p,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,y,d,Lepiota rubrotincta
2,k,s,w,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,c,w,Lepiota rubrotincta
3,k,s,p,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,s,d,Lepiota rubrotincta
4,f,y,w,f,n,f,c,n,w,e,...,w,w,p,w,o,p,w,c,l,Lepiota rubrotincta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16284,f,y,n,f,s,f,c,n,b,t,...,w,w,p,w,o,e,w,v,p,Lepiota cristata
16285,x,s,y,t,a,f,c,b,w,e,...,w,w,p,w,o,p,n,s,m,Agaricus arvensis
16286,b,y,y,t,a,f,c,b,n,e,...,w,w,p,w,o,p,n,s,g,Agaricus arvensis
16287,f,f,n,t,n,f,c,b,n,t,...,g,g,p,w,o,p,k,v,d,Agaricus haemorrhoidarius


In [522]:
X = df_all.drop(['Species'],axis=True)
X

Unnamed: 0,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attacment,Gill-spacing,Gill-size,Gill-color,Stalk-shape,...,Stalk-surface-below-ring,Stalk-color-above-ring,Stalk-color-below-ring,Veil-Type,Veil-Color,Ring-number,Ring-Type,Spore-print-color,Population,Habitat
0,f,s,w,f,n,f,c,n,w,e,...,k,w,w,p,w,o,p,w,y,d
1,x,y,p,f,n,f,c,n,w,e,...,s,w,w,p,w,o,p,w,y,d
2,k,s,w,f,n,f,c,n,w,e,...,k,w,w,p,w,o,p,w,c,w
3,k,s,p,f,n,f,c,n,w,e,...,s,w,w,p,w,o,p,w,s,d
4,f,y,w,f,n,f,c,n,w,e,...,s,w,w,p,w,o,p,w,c,l
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16284,f,y,n,f,s,f,c,n,b,t,...,s,w,w,p,w,o,e,w,v,p
16285,x,s,y,t,a,f,c,b,w,e,...,s,w,w,p,w,o,p,n,s,m
16286,b,y,y,t,a,f,c,b,n,e,...,s,w,w,p,w,o,p,n,s,g
16287,f,f,n,t,n,f,c,b,n,t,...,s,g,g,p,w,o,p,k,v,d


In [523]:
X = pd.get_dummies(X)
X

Unnamed: 0,Cap-shape_b,Cap-shape_c,Cap-shape_f,Cap-shape_k,Cap-shape_s,Cap-shape_x,Cap-surface_f,Cap-surface_g,Cap-surface_s,Cap-surface_y,...,Population_s,Population_v,Population_y,Habitat_d,Habitat_g,Habitat_l,Habitat_m,Habitat_p,Habitat_u,Habitat_w
0,0,0,1,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16284,0,0,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
16285,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
16286,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
16287,0,0,1,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0


In [524]:
y = df_all['Species']
y

0              Lepiota rubrotincta
1              Lepiota rubrotincta
2              Lepiota rubrotincta
3              Lepiota rubrotincta
4              Lepiota rubrotincta
                   ...            
16284             Lepiota cristata
16285            Agaricus arvensis
16286            Agaricus arvensis
16287    Agaricus haemorrhoidarius
16288    Agaricus haemorrhoidarius
Name: Species, Length: 16289, dtype: object

In [525]:
X_train = X.iloc[0:8050,:]
print(len(X_train))
X_train.head()

8050


Unnamed: 0,Cap-shape_b,Cap-shape_c,Cap-shape_f,Cap-shape_k,Cap-shape_s,Cap-shape_x,Cap-surface_f,Cap-surface_g,Cap-surface_s,Cap-surface_y,...,Population_s,Population_v,Population_y,Habitat_d,Habitat_g,Habitat_l,Habitat_m,Habitat_p,Habitat_u,Habitat_w
0,0,0,1,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [526]:
y_train = df_all.iloc[0:8050,:]['Species']
print(len(y_train))
y_train

8050


0          Lepiota rubrotincta
1          Lepiota rubrotincta
2          Lepiota rubrotincta
3          Lepiota rubrotincta
4          Lepiota rubrotincta
                 ...          
8045    Agaricus abruptibulbus
8046    Agaricus abruptibulbus
8047    Agaricus abruptibulbus
8048    Agaricus abruptibulbus
8049    Agaricus abruptibulbus
Name: Species, Length: 8050, dtype: object

In [528]:
X_test = X.iloc[8050:,:]
print(len(X_test))
X_test.head()

8239


Unnamed: 0,Cap-shape_b,Cap-shape_c,Cap-shape_f,Cap-shape_k,Cap-shape_s,Cap-shape_x,Cap-surface_f,Cap-surface_g,Cap-surface_s,Cap-surface_y,...,Population_s,Population_v,Population_y,Habitat_d,Habitat_g,Habitat_l,Habitat_m,Habitat_p,Habitat_u,Habitat_w
8050,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
8051,0,0,0,0,0,1,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
8052,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
8053,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
8054,0,0,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0


In [530]:
y_test = df_all.iloc[8050:,:]['Species']
print(len(y_test))
y_test

8239


8050            Agaricus bitorquis
8051     Agaricus haemorrhoidarius
8052              Lepiota cristata
8053              Lepiota cristata
8054            Agaricus meleagris
                   ...            
16284             Lepiota cristata
16285            Agaricus arvensis
16286            Agaricus arvensis
16287    Agaricus haemorrhoidarius
16288    Agaricus haemorrhoidarius
Name: Species, Length: 8239, dtype: object

In [407]:
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [531]:
params = {
    'C' : np.linspace(0.001,0.5,5)
         }
 
grid_search_cv =  GridSearchCV( 
    estimator = LogisticRegression(), 
    param_grid = params)


In [532]:
grid_search_cv.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': array([0.001  , 0.12575, 0.2505 , 0.37525, 0.5    ])})

In [533]:
print(grid_search_cv.best_params_)

{'C': 0.12575}


In [534]:
model = grid_search_cv.best_estimator_

In [535]:
accuracy = model.score(X_test, y_test)
print("\n\nAccuracy %f" % accuracy)



Accuracy 0.928511


In [536]:
y_pred = lr.predict(X_test)

In [537]:
len(y_pred)

8239

In [539]:
print(metrics.classification_report(y_test, y_pred))

                           precision    recall  f1-score   support

   Agaricus abruptibulbus       0.20      0.92      0.33       113
        Agaricus arvensis       0.94      1.00      0.97       494
        Agaricus augustus       0.98      0.88      0.93       220
       Agaricus bitorquis       1.00      1.00      1.00       101
    Agaricus californicus       0.94      1.00      0.97       261
      Agaricus campestris       1.00      1.00      1.00       773
Agaricus haemorrhoidarius       0.99      0.76      0.86      1719
       Agaricus hondensis       0.99      0.99      0.99       199
       Agaricus meleagris       1.00      1.00      1.00      1301
  Agaricus subrutilescens       1.00      1.00      1.00        53
    Agaricus xanthodermus       1.00      0.91      0.95       365
    Lepiota acutesquamosa       0.82      1.00      0.90        53
        Lepiota americana       0.89      1.00      0.94       197
        Lepiota artodisca       0.83      1.00      0.91     