# Convolutional Neural Network for House Plant Identification

In [1]:
import pandas as pd
import numpy as np

import re
import time
import cv2

import os
import requests
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
import contextlib

from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation, Flatten, Dropout, Dense
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam

import tensorflow as tf
import keras
from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten, Conv3D, MaxPool3D, AveragePooling2D
from keras.layers import Conv2D, MaxPooling2D, Input, Concatenate, Conv1D

from PIL import Image
import urllib
from sklearn.model_selection import train_test_split

from sklearn import preprocessing

## Web Scraping Google Images

### Basic scraper to collect common houseplant names and relevant information

In [2]:
# Scrape the A-Z encyclopedia page. Every <i> element is treated as a
# botanical (latin) name, and the element just before it is read for the
# common name and the link to the plant's own page.
page = "https://www.guide-to-houseplants.com/house-plants-encyclopedia-a-z.html"
result = requests.get(page)

soup = BeautifulSoup(result.content, "html.parser")

plants = []
plants_true = []
plant_link = []

for plant in soup.find_all('i'):
    sip = plant.find_previous_sibling()
    try:
        # Read all three values BEFORE appending anything, so a failure can
        # never leave the three lists with different lengths.
        common_name = sip.get_text()
        latin_name = plant.get_text()
        link = sip.get('href')
    except AttributeError:
        # No previous sibling (None) or a sibling without tag methods: skip it.
        continue
    plants.append(common_name)
    plants_true.append(latin_name)
    plant_link.append(link)

# Build the table from all three aligned lists first and filter afterwards;
# assigning 'url' after filtering would fail on a length mismatch as soon as
# a row is dropped.
info_df = pd.DataFrame({
    'plant': plants,
    'latin': plants_true,
    'url': plant_link,
})
# Drop the table-header rows; .copy() so later column assignments write to a
# real frame rather than a slice of the unfiltered one.
info_df = info_df[info_df.plant != 'Botanical Name'].copy()

In [9]:
def create_features(url):
    """Scrape the care details of one plant from its page.

    The page is searched for ``<p>`` elements that contain a ``<b>`` heading
    reading exactly ``Water:``, ``Light:`` or ``Soil:``; the paragraph text
    with the ``"<heading> "`` prefix removed is stored under the matching key.

    Args:
        url: Address of the plant page to download.

    Returns:
        A dict that always contains ``'url'`` and, for each heading found on
        the page, one of ``'Water'``, ``'Light'``, ``'Soil'``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the server does not answer within the timeout).
    """
    # A timeout keeps one unresponsive page from hanging the whole scrape.
    result = requests.get(url, timeout=30)
    soup = BeautifulSoup(result.content, "html.parser")
    final = {}

    for row in soup.find_all('p'):
        heading = row.find('b')
        if heading is None:
            continue
        label = heading.get_text()
        for feature in ('Water', 'Light', 'Soil'):
            if label == feature + ':':
                final[feature] = row.get_text().replace(feature + ": ", "")

    # Always record the URL so the result can be indexed by / joined on 'url'
    # even when the page had none of the expected headings.
    final['url'] = url
    return final

# Download the care details for every plant page (one dict per row).
info_df['info'] = info_df['url'].apply(create_features)

# Expand the dicts into one column per feature, index that table by URL and
# attach it to the plant table by matching on the 'url' column.
info_series = info_df['info'].dropna().apply(pd.Series)
care_by_url = pd.DataFrame(info_series).set_index('url')
info_df = info_df.join(care_by_url, on='url')

# Persist the combined table (written as CSV to a file literally named 'info_df').
info_df.to_csv('info_df')

### Scraping Google Images

In [24]:
# Location of the chromedriver executable on the author's machine; change this
# to wherever chromedriver lives on yours.
DRIVER_PATH = r'C:\Users\David\Downloads\chromedriver_win32\chromedriver'
# Start a Selenium-controlled Chrome session; the image-scraping cells use `wd`.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

In [25]:
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: "webdriver.Chrome",
                     sleep_between_interactions: int = 1, exhausted_pause: int = 30):
    """Collect full-size image URLs from a Google Images search.

    Thumbnails are clicked one by one and the ``src`` of the enlarged image is
    recorded. When the thumbnails currently on the page run out before enough
    links were gathered, the function pauses, clicks the "load more" button if
    one is present, and tries again; it stops once no new thumbnails show up.

    Args:
        query: Search term; also used as the value of the ``label`` column.
        max_links_to_fetch: Stop once at least this many distinct URLs were
            collected (one click can reveal several, so slightly more may be
            returned).
        wd: Selenium web driver used to drive the browser.
        sleep_between_interactions: Seconds to wait after each scroll/click.
        exhausted_pause: Seconds to wait before asking for more results when
            the visible thumbnails have all been processed.

    Returns:
        A DataFrame with the columns ``image_url`` and ``label``. It holds
        whatever was collected, and is empty (never ``None``) if nothing was.
    """
    from urllib.parse import quote_plus

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # Load the right page; the query is URL-encoded so spaces, '&', etc. in a
    # plant name cannot break the search URL.
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=quote_plus(query)))

    # NOTE(review): the CSS class names below presumably mirror Google's markup
    # at the time of writing - confirm they still match before a long run.
    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)
        if number_results <= results_start:
            # Nothing new appeared since the last pass: results are exhausted.
            break

        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # Best effort: a thumbnail that cannot be clicked is skipped.
                continue

            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                # Keep real links only (skips e.g. inline data: URIs).
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                break
        else:
            # Every visible thumbnail was processed without reaching the
            # target: pause, then ask the page for more results.
            print("Gotta sleep for {}".format(exhausted_pause))
            time.sleep(exhausted_pause)
            print("Back at it")
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")

        results_start = number_results

    # Built after the loop so the name is always bound; sorted so the row
    # order does not depend on set iteration order.
    df = pd.DataFrame(sorted(image_urls), columns=['image_url'])
    df['label'] = query
    return df

In [26]:
# Scrape up to 150 image links for every botanical name and stack the
# per-plant tables into one frame. The list starts with an empty frame so the
# final concat has the same inputs as growing `df` one plant at a time.
scraped_frames = [pd.DataFrame()]
for plant in plants_true:
    scraped_frames.append(fetch_image_urls(plant, 150, wd, 1))

df = pd.concat(scraped_frames, ignore_index=True)

# Save the collected links (pickle file named 'cucumber').
df.to_pickle('cucumber')

# Convolutional Neural Network for House Plant Identification

## Preparing the data

In [27]:
def url_to_array(url):
    """Download an image and return it as a 32x32 colour pixel array.

    Args:
        url: Address of the image to download.

    Returns:
        The decoded image resized to 32x32 as a numpy array, or ``np.nan`` if
        the download, decoding or resizing fails for any reason (so that
        failed rows can be removed later with ``dropna``).
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.request` is loaded.
    import urllib.request

    try:
        # The context manager closes the connection even when reading fails.
        with urllib.request.urlopen(url, timeout=10) as resp:
            raw = np.frombuffer(resp.read(), dtype="uint8")
        image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        if image is None:
            # The payload was not a decodable image.
            return np.nan
        # Interpolation is passed by keyword; positionally it would land in
        # the `fy` slot of cv2.resize.
        return cv2.resize(image, (32, 32), interpolation=cv2.INTER_LINEAR)
    except Exception:
        # Deliberately broad (network, HTTP, decoding errors are all "no
        # image"), but unlike a bare `except` it lets Ctrl-C through.
        return np.nan

# Download every image and store its pixel array next to its URL; rows whose
# image could not be fetched get NaN instead of an array.
df['array'] = df.image_url.apply(url_to_array)

# Overwrite the saved pickle so it now also carries the 'array' column.
df.to_pickle('cucumber')

In [28]:
def return_predition(img):
    """Predict the plant label for an image file on disk.

    Uses the module-level ``model`` (the trained network) and ``le`` (the
    fitted label encoder), so both must exist before this is called.

    Args:
        img: Path of the image file to classify.

    Returns:
        The original (string) label of the highest-scoring class.

    Raises:
        ValueError: if no image could be read from ``img``.
    """
    image = cv2.imread(img, cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None;
        # fail here with the path instead of a cryptic resize assertion.
        raise ValueError("Could not read an image from {!r}".format(img))
    # Same 32x32 size the training images were resized to.
    image = cv2.resize(image, (32, 32), interpolation=cv2.INTER_LINEAR)
    # Add the leading batch dimension expected by model.predict.
    X = image.reshape(-1, 32, 32, 3)
    predictions = np.argmax(model.predict(X), axis=-1)
    pred = le.inverse_transform(predictions)
    return pred[0]

# Example: classify a single local photo (the path is specific to the author's machine).
return_predition(r'C:\Users\David\Documents\code\Final_Project\images.jpg')



'Selaginella kraussiana'

In [5]:
# Some image arrays could not be retrieved; discard those rows before the
# labels are prepared, and renumber the rows that remain.
df = df.dropna().reset_index(drop=True)

# Combine the per-image arrays into one array and remember the shape of a
# single image for the network's input layer.
X = np.array(list(df['array']))
input_shape = X[0].shape

## Label Prep

In [6]:
# Label prep: turn the plant names into integer class ids. The fitted encoder
# is kept around because it is needed to map predictions back to names.
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['label'])

# Number of distinct classes = size of the network's output layer.
output_shape = len(np.unique(y))

# One-hot encode the class ids.
y = to_categorical(y)

In [7]:
# Batch normalisation is applied over the last axis of each feature map.
chanDim = -1

model = Sequential([
    # Stage 1: one 32-filter convolution, then 3x3 max-pooling.
    Conv2D(32, (3, 3), padding="same", input_shape=input_shape),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    MaxPooling2D(pool_size=(3, 3)),
    Dropout(0.25),

    # Stage 2: two 64-filter convolutions, then 2x2 max-pooling.
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Stage 3: two 128-filter convolutions, then 2x2 max-pooling.
    Conv2D(128, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    Conv2D(128, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(axis=chanDim),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Classifier head: a 1024-unit dense layer, then one softmax unit per class.
    Flatten(),
    Dense(1024),
    Activation("relu"),
    BatchNormalization(),
    Dropout(0.5),
    Dense(output_shape, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer="Adam", metrics=[["accuracy"]])

# Train for 40 epochs in batches of 150, holding out 10% of the data for validation.
model.fit(X, y, batch_size=150, epochs=40, validation_split=0.1)

Epoch 1/40

KeyboardInterrupt: 

In [46]:
# Save the trained model to disk (the directory is specific to the author's machine).
model.save(r'C:\Users\David\Documents\code\What_Plant')

INFO:tensorflow:Assets written to: C:\Users\David\Documents\code\What_Plant\assets


In [4]:
# Reload the scraped image table from disk - presumably the 'cucumber' pickle
# written by the scraping cells; confirm the working directory matches this path.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files you created yourself.
df = pd.read_pickle(r'C:\Users\David\Documents\code\What_Plant\cucumber')