# SoyBean Predictions

## Load Libraries

In [3]:
import pandas as pd
import numpy as np
import sklearn as sl
from collections import defaultdict
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
%matplotlib inline

## Load and Transform Data

### Create Column Names

In [4]:
Columns = ['Disease','date','plant-stand','precip','temp','hail','crop-hist','area-damaged','severity','seed-tmt',
           'germination','plant-growth','leaves','leafspots-halo','leafspots-marg','leafspots-size','leaf-shread',
           'leaf-malf','leaf-mild','stem','lodging','stem-cankers','canker-lesion','fruiting-bodies','external decay',
           'mycelium','int-discolor','sclerotia','fruit-pods','fruit spots','seed','mold-growth','seed-discolor',
           'seed-size','shriveling','roots']

### Define labels and get data

In [19]:
diseases = ['diaporthe-stem-canker', 'charcoal-rot', 'rhizoctonia-root-rot',
       'phytophthora-rot', 'brown-stem-rot', 'powdery-mildew',
       'downy-mildew', 'brown-spot', 'bacterial-blight',
       'bacterial-pustule', 'purple-seed-stain', 'anthracnose',
       'phyllosticta-leaf-spot', 'alternarialeaf-spot',
       'frog-eye-leaf-spot', 'diaporthe-pod-&-stem-blight',
       'cyst-nematode', '2-4-d-injury', 'herbicide-injury']

In [32]:
soybeans = pd.read_csv('Soybean.csv') # Read the data
soybeans.columns = Columns # Assign the columns

### Guess Missing Values via KNN Interpolation.

In [33]:
soybeans = soybeans.replace('?','NaN') # Replace missing values with NaN for the purposes of KNN

In [34]:
soybeans = soybeans.groupby(['Disease'], as_index=False).fillna(method = 'pad') # Perform KNN

In [35]:
list (soybeans)

['Disease',
 'date',
 'plant-stand',
 'precip',
 'temp',
 'hail',
 'crop-hist',
 'area-damaged',
 'severity',
 'seed-tmt',
 'germination',
 'plant-growth',
 'leaves',
 'leafspots-halo',
 'leafspots-marg',
 'leafspots-size',
 'leaf-shread',
 'leaf-malf',
 'leaf-mild',
 'stem',
 'lodging',
 'stem-cankers',
 'canker-lesion',
 'fruiting-bodies',
 'external decay',
 'mycelium',
 'int-discolor',
 'sclerotia',
 'fruit-pods',
 'fruit spots',
 'seed',
 'mold-growth',
 'seed-discolor',
 'seed-size',
 'shriveling',
 'roots']

## Test and Train Set Setup

In [24]:
# Set up Labels by encoding diseases
le = sl.preprocessing.LabelEncoder()
labels = le.fit(soybeans['Disease'])

# Training set
XTrain, YTrain, XTest, YTest = train_test_split(soybeans, labels, test_size=0.3)

KeyError: 'Disease'

In [27]:
list (soybeans)

['area-damaged',
 'canker-lesion',
 'crop-hist',
 'date',
 'external decay',
 'fruit spots',
 'fruit-pods',
 'fruiting-bodies',
 'germination',
 'hail',
 'int-discolor',
 'leaf-malf',
 'leaf-mild',
 'leaf-shread',
 'leafspots-halo',
 'leafspots-marg',
 'leafspots-size',
 'leaves',
 'lodging',
 'mold-growth',
 'mycelium',
 'plant-growth',
 'plant-stand',
 'precip',
 'roots',
 'sclerotia',
 'seed',
 'seed-discolor',
 'seed-size',
 'seed-tmt',
 'severity',
 'shriveling',
 'stem',
 'stem-cankers',
 'temp']