# Crop Identification and Disease Recognition
## Machine Learning Models
## Image Classification with Naive Bayes and Random Forest
#### Analysis by Ben Geissel

In [3]:
# Import necessary packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL.Image import core as image
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import image_processing

In [4]:
# Scan every image file, report any whose pixel array is not 256x256 with 3 channels,
# and count images overall and per crop/disease folder.
data_path = 'PlantVillage-Dataset/raw_image_data/color'
diff_shape_count = 0
img_count = 0
leaf_type_img_count_dict = {}
# os.listdir returns entries in arbitrary order; sort so the report is stable run to run
for folder in sorted(os.listdir(data_path)):
    leaf_type_img_count = 0
    # The loop variable is deliberately not called `image`: that would clobber the
    # `image` name imported in the notebook's import cell
    for img_name in sorted(os.listdir('%s/%s' % (data_path, folder))):
        img_loc = '%s/%s/%s' % (data_path, folder, img_name)
        # Context manager releases the file handle as soon as the pixels are read
        with Image.open(img_loc) as img:
            img_shape = np.array(img).shape
        img_count += 1
        leaf_type_img_count += 1
        if img_shape != (256, 256, 3):
            # Print the offending file and its shape so it can be inspected
            diff_shape_count += 1
            print(img_loc)
            print(img_shape)
    leaf_type_img_count_dict[folder] = leaf_type_img_count
print('Wrong Shape Image Count: %d' % (diff_shape_count))
print('Total Color Image Count: %d' % (img_count))
print()
print('Color Image Count by Class:')
leaf_type_img_count_dict

Wrong Shape Image Count: 0
Total Color Image Count: 54304

Color Image Count by Class:


{'Apple___Apple_scab': 630,
 'Apple___Black_rot': 621,
 'Apple___Cedar_apple_rust': 275,
 'Apple___healthy': 1645,
 'Blueberry___healthy': 1502,
 'Cherry_(including_sour)___Powdery_mildew': 1052,
 'Cherry_(including_sour)___healthy': 854,
 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot': 513,
 'Corn_(maize)___Common_rust_': 1192,
 'Corn_(maize)___Northern_Leaf_Blight': 985,
 'Corn_(maize)___healthy': 1162,
 'Grape___Black_rot': 1180,
 'Grape___Esca_(Black_Measles)': 1383,
 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)': 1076,
 'Grape___healthy': 423,
 'Orange___Haunglongbing_(Citrus_greening)': 5507,
 'Peach___Bacterial_spot': 2297,
 'Peach___healthy': 360,
 'Pepper,_bell___Bacterial_spot': 997,
 'Pepper,_bell___healthy': 1477,
 'Potato___Early_blight': 1000,
 'Potato___Late_blight': 1000,
 'Potato___healthy': 152,
 'Raspberry___healthy': 371,
 'Soybean___healthy': 5090,
 'Squash___Powdery_mildew': 1835,
 'Strawberry___Leaf_scorch': 1109,
 'Strawberry___healthy': 456,
 'Tomato___Bac

In [5]:
# Build the feature list (one entry per image) and the matching label list.
# Each image path is handed to image_processing.image_to_flat_array
# (presumably returns the flattened pixel array - confirm against the helper);
# the label is the name of the folder the image lives in.
data_list = []
target_list = []
data_path = 'PlantVillage-Dataset/raw_image_data/color'
# os.listdir order is arbitrary and filesystem dependent. Sorting fixes the sample
# order, so the seeded train_test_split later yields the same split on every machine.
for folder in sorted(os.listdir(data_path)):
    # Loop variable is not named `image`, to avoid clobbering the imported `image` name
    for img_name in sorted(os.listdir('%s/%s' % (data_path, folder))):
        img_loc = '%s/%s/%s' % (data_path, folder, img_name)
        data_list.append(image_processing.image_to_flat_array(img_loc))
        target_list.append(folder)

In [6]:
# Pack the Python lists into NumPy arrays: labels first, then the per-image features
target_array = np.array(target_list)
data_array = np.array(data_list)

In [7]:
# Normalize Arrays
# Pass each row of data_array through the project's pixel_normalization helper
norm_list = [
    image_processing.pixel_normalization(flat_img)
    for flat_img in data_array
]

In [8]:
# Standardize Arrays
# Apply the project's pixel_centering helper to every normalized image, in order
standardized_list = list(map(image_processing.pixel_centering, norm_list))

In [9]:
# Convert the list of standardized per-image arrays into a single NumPy array;
# this array is the feature input to the train/validation/test split.
standardized_data_array = np.array(standardized_list)

In [10]:
# Train / validation / test split: 70% / 15% / 15% of the full dataset.
# Stage 1 holds out 15% of all samples for testing.
# Stage 2 takes the validation set from the remaining 85%; the fraction
# 1 - .7/.85 of that remainder works out to 15% of the original data.
# NOTE(review): neither split is stratified although the class counts differ
# widely - consider `stratify=` if per-class proportions matter.
X_train, X_test, y_train, y_test = train_test_split(
    standardized_data_array,
    target_array,
    test_size=.15,
    random_state=14,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=(1 - .7/.85),
    random_state=14,
)

In [20]:
# Sanity check: show the shape of the held-out test features, then of the test labels
for held_out in (X_test, y_test):
    print(held_out.shape)

(8146, 196608)
(8146,)


UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U50'), dtype('<U50')) -> dtype('<U50')

In [14]:
# Toy example: append a (2, 1) column to a (2, 3) matrix along axis 1
x = np.array([
    [10, 20, 30],
    [40, 50, 60],
])
y = np.array([
    [100],
    [200],
])
for toy in (x, y):
    print(toy.shape)
widened = np.append(x, y, axis=1)
print(widened)

(2, 3)
(2, 1)
[[ 10  20  30 100]
 [ 40  50  60 200]]


In [None]:
# Apply Principal Component Analysis (PCA) to reduce dimensionality.
# Fit on the training split only; validation and test data are transformed
# with this same fitted estimator afterwards.
# A float n_components between 0 and 1 is documented by scikit-learn as a target
# fraction of explained variance (here 10%) - confirm this is the intended setting.
# fit() returns the estimator itself, not transformed data, so the result is bound
# to `pca` rather than to a name that reads like a reduced feature matrix.
pca = PCA(n_components=.10).fit(X_train)

In [None]:
# Project the train, validation and test splits with the already-fitted PCA,
# in that order, and unpack the three results
X_train_pca, X_val_pca, X_test_pca = (
    pca.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Report the shape of each PCA-transformed split: train, validation, test
for reduced in (X_train_pca, X_val_pca, X_test_pca):
    print(reduced.shape)