# SoyBean Predictions

In [2]:
import pandas as pd
import numpy as np
import sklearn as sl
from collections import defaultdict

## Load and Transform Data

### Create Column Names

In [20]:
Columns = ['Disease','date','plant-stand','precip','temp','hail','crop-hist','area-damaged','severity','seed-tmt',
           'germination','plant-growth','leaves','leafspots-halo','leafspots-marg','leafspots-size','leaf-shread',
           'leaf-malf','leaf-mild','stem','lodging','stem-cankers','canker-lesion','fruiting-bodies','external decay',
           'mycelium','int-discolor','sclerotia','fruit-pods','fruit spots','seed','mold-growth','seed-discolor',
           'seed-size','shriveling','roots']

one thing to note, the missing values in this dataset are denoted by a "?", so we will be changing that to a NaN type

### Read Data

In [21]:
soybeansU = pd.read_csv('Soybean.csv')
soybeansU.columns = Columns
soybeansU = soybeansU.replace('?',np.nan)

Now lets check the info about this data

In [22]:
soybeansU.info()
soybeansU.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 36 columns):
Disease            306 non-null object
date               305 non-null object
plant-stand        298 non-null object
precip             295 non-null object
temp               299 non-null object
hail               265 non-null object
crop-hist          305 non-null object
area-damaged       305 non-null object
severity           265 non-null object
seed-tmt           265 non-null object
germination        270 non-null object
plant-growth       305 non-null object
leaves             306 non-null int64
leafspots-halo     281 non-null object
leafspots-marg     281 non-null object
leafspots-size     281 non-null object
leaf-shread        280 non-null object
leaf-malf          281 non-null object
leaf-mild          276 non-null object
stem               305 non-null object
lodging            265 non-null object
stem-cankers       295 non-null object
canker-lesion      295 non-null object

(306, 36)

So the dataframe is 306 rows by 36 columns

In [7]:
soybeansU.head(35) # Check to make sure it was imported prpperly 

Unnamed: 0,Disease,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,4,0,2,1,0.0,2,0,2.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
1,diaporthe-stem-canker,3,0,2,1,0.0,1,0,2.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
2,diaporthe-stem-canker,3,0,2,1,0.0,1,0,2.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
3,diaporthe-stem-canker,6,0,2,1,0.0,2,0,1.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
4,diaporthe-stem-canker,5,0,2,1,0.0,3,0,1.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
5,diaporthe-stem-canker,5,0,2,1,0.0,2,0,1.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
6,diaporthe-stem-canker,4,0,2,1,1.0,1,0,1.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
7,diaporthe-stem-canker,6,0,2,1,0.0,3,0,1.0,1.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
8,diaporthe-stem-canker,4,0,2,1,0.0,2,0,2.0,0.0,...,0,0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0
9,charcoal-rot,6,0,0,2,0.0,1,3,1.0,1.0,...,2,1,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0


### Handling missing values

Lets check is there is missing values

In [26]:
soybeansU.isna().sum()


Disease             0
date                1
plant-stand         8
precip             11
temp                7
hail               41
crop-hist           1
area-damaged        1
severity           41
seed-tmt           41
germination        36
plant-growth        1
leaves              0
leafspots-halo     25
leafspots-marg     25
leafspots-size     25
leaf-shread        26
leaf-malf          25
leaf-mild          30
stem                1
lodging            41
stem-cankers       11
canker-lesion      11
fruiting-bodies    35
external decay     11
mycelium           11
int-discolor       11
sclerotia          11
fruit-pods         25
fruit spots        35
seed               29
mold-growth        29
seed-discolor      35
seed-size          29
shriveling         35
roots               7
dtype: int64