 # SoyBean Predictions

In [1]:
import pandas as pd
import numpy as np
import sklearn as sl
from collections import defaultdict


 ## Load and Transform Data

 ### Create Column Names

In [2]:
Columns = ['Disease','date','plant-stand','precip','temp','hail','crop-hist','area-damaged','severity','seed-tmt',
           'germination','plant-growth','leaves','leafspots-halo','leafspots-marg','leafspots-size','leaf-shread',
           'leaf-malf','leaf-mild','stem','lodging','stem-cankers','canker-lesion','fruiting-bodies','external decay',
           'mycelium','int-discolor','sclerotia','fruit-pods','fruit spots','seed','mold-growth','seed-discolor',
           'seed-size','shriveling','roots']


 one thing to note, the missing values in this dataset are denoted by a "?", so we will be changing that to a NaN type

 ### Read Data

In [3]:
soybeansU = pd.read_csv('Soybean.csv')
soybeansU.columns = Columns
soybeansU = soybeansU.replace('?',np.nan)


 Now lets check the info about this data

In [4]:
soybeansU.info()
soybeansU.shape


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 36 columns):
Disease            306 non-null object
date               305 non-null object
plant-stand        298 non-null object
precip             295 non-null object
temp               299 non-null object
hail               265 non-null object
crop-hist          305 non-null object
area-damaged       305 non-null object
severity           265 non-null object
seed-tmt           265 non-null object
germination        270 non-null object
plant-growth       305 non-null object
leaves             306 non-null int64
leafspots-halo     281 non-null object
leafspots-marg     281 non-null object
leafspots-size     281 non-null object
leaf-shread        280 non-null object
leaf-malf          281 non-null object
leaf-mild          276 non-null object
stem               305 non-null object
lodging            265 non-null object
stem-cankers       295 non-null object
canker-lesion      295 non-null object

(306, 36)

 So the dataframe is 306 rows by 36 columns

In [5]:
soybeansU.head(35) # Check to make sure it was imported prpperly 


Unnamed: 0,Disease,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,4,0,2,1,0.0,2,0,2.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
1,diaporthe-stem-canker,3,0,2,1,0.0,1,0,2.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
2,diaporthe-stem-canker,3,0,2,1,0.0,1,0,2.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
3,diaporthe-stem-canker,6,0,2,1,0.0,2,0,1.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
4,diaporthe-stem-canker,5,0,2,1,0.0,3,0,1.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
5,diaporthe-stem-canker,5,0,2,1,0.0,2,0,1.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
6,diaporthe-stem-canker,4,0,2,1,1.0,1,0,1.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
7,diaporthe-stem-canker,6,0,2,1,0.0,3,0,1.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
8,diaporthe-stem-canker,4,0,2,1,0.0,2,0,2.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
9,charcoal-rot,6,0,0,2,0.0,1,3,1.0,1.0,...,2,1,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0


 If you look at the data you notice all the of data are numerical. This is actually categories decoded by numbers. 0 to represent the first category, 1 to represent the 2 category and so on. This is the breakdown

    1. date:april,may,june,july,august,september,october,?.
    2. plant-stand:	normal,lt-normal,?.
    3. precip:		lt-norm,norm,gt-norm,?.
    4. temp:		lt-norm,norm,gt-norm,?.
    5. hail:		yes,no,?.
    6. crop-hist:	diff-lst-year,same-lst-yr,same-lst-two-yrs,same-lst-sev-yrs,?.
    7. area-damaged:	scattered,low-areas,upper-areas,whole-field,?.
    8. severity:	minor,pot-severe,severe,?.
    9. seed-tmt:	none,fungicide,other,?.
    10. germination:	90-100%,80-89%,lt-80%,?.
    11. plant-growth:	norm,abnorm,?.
    12. leaves:		norm,abnorm.
    13. leafspots-halo:	absent,yellow-halos,no-yellow-halos,?.
    14. leafspots-marg:	w-s-marg,no-w-s-marg,dna,?.
    15. leafspot-size:	lt-1/8,gt-1/8,dna,?.
    16. leaf-shread:	absent,present,?.
    17. leaf-malf:	absent,present,?.
    18. leaf-mild:	absent,upper-surf,lower-surf,?.
    19. stem:		norm,abnorm,?.
    20. lodging:    	yes,no,?.
    21. stem-cankers:	absent,below-soil,above-soil,above-sec-nde,?.
    22. canker-lesion:	dna,brown,dk-brown-blk,tan,?.
    23. fruiting-bodies:	absent,present,?.
    24. external decay:	absent,firm-and-dry,watery,?.
    25. mycelium:	absent,present,?.
    26. int-discolor:	none,brown,black,?.
    27. sclerotia:	absent,present,?.
    28. fruit-pods:	norm,diseased,few-present,dna,?.
    29. fruit spots:	absent,colored,brown-w/blk-specks,distort,dna,?.
    30. seed:		norm,abnorm,?.
    31. mold-growth:	absent,present,?.
    32. seed-discolor:	absent,present,?.
    33. seed-size:	norm,lt-norm,?.
    34. shriveling:	absent,present,?.
    35. roots:		norm,rotted,galls-cysts,?.


 ### Handling missing values

 Lets check is there is missing values

In [6]:
soybeansU.isna().sum()


Disease             0
date                1
plant-stand         8
precip             11
temp                7
hail               41
crop-hist           1
area-damaged        1
severity           41
seed-tmt           41
germination        36
plant-growth        1
leaves              0
leafspots-halo     25
leafspots-marg     25
leafspots-size     25
leaf-shread        26
leaf-malf          25
leaf-mild          30
stem                1
lodging            41
stem-cankers       11
canker-lesion      11
fruiting-bodies    35
external decay     11
mycelium           11
int-discolor       11
sclerotia          11
fruit-pods         25
fruit spots        35
seed               29
mold-growth        29
seed-discolor      35
seed-size          29
shriveling         35
roots               7
dtype: int64

 There seems to be quite a bit of missing values, with the coloumn with the highest amount of missing values being lodging

 There are 19 classes and they are described as such:

 Class Distribution:
 1. diaporthe-stem-canker: 10
 2. charcoal-rot: 10
 3. rhizoctonia-root-rot: 10
 4. phytophthora-rot: 40
 5. brown-stem-rot: 20
 6. powdery-mildew: 10
 7. downy-mildew: 10
 8. brown-spot: 40
 9. bacterial-blight: 10
 10. bacterial-pustule: 10
 11. purple-seed-stain: 10
 12. anthracnose: 20
 13. phyllosticta-leaf-spot: 10
 14. alternarialeaf-spot: 40
 15. frog-eye-leaf-spot: 40
 16. diaporthe-pod-&-stem-blight: 6
 17. cyst-nematode: 6
 18. 2-4-d-injury: 1
 19. herbicide-injury: 4

 Let's map these classes.

In [7]:
class_mapping = {label:idx for idx, label in 
                 enumerate(np.unique(soybeansU["Disease"]), start = 1)}
class_mapping


{'2-4-d-injury': 1,
 'alternarialeaf-spot': 2,
 'anthracnose': 3,
 'bacterial-blight': 4,
 'bacterial-pustule': 5,
 'brown-spot': 6,
 'brown-stem-rot': 7,
 'charcoal-rot': 8,
 'cyst-nematode': 9,
 'diaporthe-pod-&-stem-blight': 10,
 'diaporthe-stem-canker': 11,
 'downy-mildew': 12,
 'frog-eye-leaf-spot': 13,
 'herbicide-injury': 14,
 'phyllosticta-leaf-spot': 15,
 'phytophthora-rot': 16,
 'powdery-mildew': 17,
 'purple-seed-stain': 18,
 'rhizoctonia-root-rot': 19}

 As you see the right side of each class is their new mapping starting from 1 and ending at 19

 Now let's apply it to the column

In [8]:
soybeansU['Disease'] = soybeansU['Disease'].map(class_mapping)
soybeansU.head(10)


Unnamed: 0,Disease,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,11,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
1,11,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,11,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
3,11,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0
4,11,5,0,2,1,0,3,0,1,0,...,0,0,0,4,0,0,0,0,0,0
5,11,5,0,2,1,0,2,0,1,1,...,0,0,0,4,0,0,0,0,0,0
6,11,4,0,2,1,1,1,0,1,0,...,0,0,0,4,0,0,0,0,0,0
7,11,6,0,2,1,0,3,0,1,1,...,0,0,0,4,0,0,0,0,0,0
8,11,4,0,2,1,0,2,0,2,0,...,0,0,0,4,0,0,0,0,0,0
9,8,6,0,0,2,0,1,3,1,1,...,2,1,0,4,0,0,0,0,0,0


 As you can see, the column `Disease` is now numbers.
 now we will impute the NaNs with the most frequen class in each coloumn

In [9]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values = np.nan, strategy= 'most_frequent')
imp = imp.fit(soybeansU)
imputed_data = imp.transform(soybeansU.values)
imputed_data


array([[11, '4', '0', ..., '0', '0', '0'],
       [11, '3', '0', ..., '0', '0', '0'],
       [11, '3', '0', ..., '0', '0', '0'],
       ...,
       [14, '0', '1', ..., '0', '0', '1'],
       [14, '1', '1', ..., '0', '0', '1'],
       [14, '1', '1', ..., '0', '0', '1']], dtype=object)

 Now, lets split the data, to X and Y

In [16]:
Y = imputed_data[:,0]
X = imputed_data[:,1:]
oldx = X
print(X)
print(Y)


[['4' '0' '2' ... '0' '0' '0']
 ['3' '0' '2' ... '0' '0' '0']
 ['3' '0' '2' ... '0' '0' '0']
 ...
 ['0' '1' '2' ... '0' '0' '1']
 ['1' '1' '2' ... '0' '0' '1']
 ['1' '1' '2' ... '0' '0' '1']]
[11 11 11 11 11 11 11 11 11 8 8 8 8 8 8 8 8 8 8 19 19 19 19 19 19 19 19 19
 19 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 17 17 17 17 17 17 17 17 17 17 12 12 12 12 12 12 12 12
 12 12 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 18 18 18 18 18 18 18
 18 18 18 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 15 15 15 15 15 15 15 15
 15 15 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 10 10 10 10
 10 10 9 9 9 9 9 9 1 14 14 14 14]


 Now we do a one hot encoder to binarize the equations

In [11]:
X[0].dtype

dtype('O')

In [19]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder_X = OneHotEncoder(categorical_features = 'all')
X = onehotencoder_X.fit_transform(X).toarray()
print(oldx)
X




[['4' '0' '2' ... '0' '0' '0']
 ['3' '0' '2' ... '0' '0' '0']
 ['3' '0' '2' ... '0' '0' '0']
 ...
 ['0' '1' '2' ... '0' '0' '1']
 ['1' '1' '2' ... '0' '0' '1']
 ['1' '1' '2' ... '0' '0' '1']]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 1., 1., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.],
       ...,
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.]])

In [23]:
print(oldx.shape)
print(X.shape)

(306, 35)
(306, 392)
