In [9]:
!pip install pandas numpy requests opencv-python imutils scikit-learn scipy Pillow keras tensorflow scikit-image



In [1]:
import random
from io import BytesIO

import cv2
import imutils
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.spatial.distance
import sklearn.preprocessing as preprocessing
from keras import optimizers, regularizers
from keras.applications import vgg16
from PIL import Image
from skimage import transform
from skimage.util import random_noise
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Flatten, InputLayer
from tensorflow.keras.models import Model, Sequential

In [2]:
def get_img_arrays(df_l):
    """Download the image behind every ``df_l['url']`` as a 150x150 array.

    A row is dropped when its image cannot be fetched or decoded, or when the
    resized array is not exactly ``(150, 150, 3)`` (for example single-channel
    or four-channel images).

    Parameters
    ----------
    df_l : pandas.DataFrame
        Frame with a ``'url'`` column. It is not modified.

    Returns
    -------
    X : list of numpy.ndarray
        One ``(150, 150, 3)`` array per kept row, in row order.
    df : pandas.DataFrame
        Copy of ``df_l`` without the dropped rows. The original index labels
        are kept, so they no longer match positions in ``X`` once a row has
        been dropped.

    Notes
    -----
    The request headers come from the module-level ``headers`` name, which
    must be defined before this function is called.
    """
    # Resolve the global once, outside the try block: a missing ``headers``
    # must surface as a NameError instead of silently dropping every row.
    request_headers = headers

    X = []
    rejected = []
    for index, row in df_l.iterrows():
        try:
            # The timeout keeps one unresponsive host from hanging the cell.
            response = requests.get(row['url'], headers=request_headers, timeout=30)
            with Image.open(BytesIO(response.content)) as img:
                arr = np.array(img.resize((150, 150)))  # resize to 150x150x3
        except Exception:
            # Best effort: any per-image failure just drops that row.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            rejected.append(index)
            continue
        if arr.shape == (150, 150, 3):
            X.append(arr)
        else:
            rejected.append(index)
    return X, df_l.drop(index=rejected)

# NOTE(review): ``df`` must already hold the attraction table (with 'url' and
# 'category' columns) when this cell runs; it is not created in the visible
# notebook. The cell also overwrites ``df``, so it is not safe to run twice.
X_list, df = get_img_arrays(df)
# Renumber the surviving rows so index label i refers to X[i]. The positional
# np.delete below is only correct when labels and positions coincide.
df = df.reset_index(drop=True)
X = np.array(X_list)

# Fold 'sports' into 'entertainment' and 'art' into 'museums'
# (str.replace substitutes substrings, not only whole values).
df['category'] = df['category'].str.replace('sports','entertainment')
df['category'] = df['category'].str.replace('art','museums')

# Undersample 'parks' down to the number of 'museums' rows.
df_under = df.copy()
grouped = df_under.groupby('category')
museum_len = len(grouped.get_group('museums'))
parks_len = len(grouped.get_group('parks'))
# Never negative: random.sample raises on a negative sample size.
num_to_remove = max(parks_len - museum_len, 0)

parks_indexes = grouped.get_group('parks').index.tolist()
indexes_to_remove = random.sample(parks_indexes, num_to_remove)

# Drop the same rows, by label in the frame and by position in the array.
df_under = df_under.drop(index=indexes_to_remove)
X_under = np.delete(X, indexes_to_remove, 0)
y = df_under['category']

# Scale pixel values from 0-255 to 0-1.
X_std = X_under / 255

# One-hot labels as floats (get_dummies returns booleans on recent pandas).
y_ohe = pd.get_dummies(y, dtype=float)

# 60 / 20 / 20 split: 20% test, then 25% of the remainder as validation.
X_train, X_test, y_train, y_test = train_test_split(X_std, y_ohe, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)


def get_img_flip(img):
    """Return ``img`` mirrored left-to-right (second axis reversed)."""
    mirrored = np.flip(img, axis=1)
    return mirrored

def get_img_transform(img):
    """Return ``img`` rotated by a random angle drawn uniformly from [-30, 30]."""
    angle = random.uniform(-30,30)
    rotated = transform.rotate(img, angle)
    return rotated

def get_img_noise(img):
    """Return a copy of ``img`` with salt-and-pepper noise, clipped to the valid range."""
    noisy = random_noise(img, mode='s&p', clip=True)
    return noisy

# Augment the training set 4x: every image contributes itself, a horizontal
# flip, a random rotation and a salt-and-pepper-noise copy, in that order.
X_train_aug = []
for img in X_train:
    X_train_aug.extend([
        img,
        get_img_flip(img),
        get_img_transform(img),
        get_img_noise(img),
    ])
# A single ndarray: Keras treats a Python list as several separate inputs.
X_train_aug = np.array(X_train_aug)

# Each of the four variants keeps the complete one-hot row of its source
# image (all class columns, not only the first one), in the same order as
# the images above.
y_train_aug = pd.DataFrame(
    np.repeat(y_train.to_numpy(dtype='float32'), 4, axis=0),
    columns=y_train.columns,
)


def get_bottleneck_features(model, input_imgs):
    """Run ``input_imgs`` through ``model`` and return its predictions.

    Parameters
    ----------
    model : object with a Keras-style ``predict(x, verbose=...)`` method
    input_imgs : array-like
        Batch of images. A list of equally shaped arrays is stacked into one
        ndarray first, because Keras interprets a Python list as multiple
        model inputs rather than as one batch.

    Returns
    -------
    Whatever ``model.predict`` returns for the batch.
    """
    batch = np.asarray(input_imgs)
    features = model.predict(batch, verbose=0)
    return features

# VGG16 convolutional base (ImageNet weights, no classifier head) used as a
# fixed feature extractor for 150x150 RGB images.
inputs = (150, 150, 3)
vgg = vgg16.VGG16(include_top=False, weights='imagenet', input_shape=inputs)

# Flatten the last layer's output into one feature vector per image.
output = vgg.layers[-1].output
output = Flatten()(output)
vgg_model = Model(vgg.input, output)

# Freeze every layer; the extractor is never trained.
vgg_model.trainable = False
for layer in vgg_model.layers:
    layer.trainable = False


# Pre-compute the bottleneck features for each split. np.asarray guards
# against the augmented set still being a Python list.
X_train_aug_vgg = get_bottleneck_features(vgg_model, np.asarray(X_train_aug))
X_val_vgg = get_bottleneck_features(vgg_model, X_val)
X_test_vgg = get_bottleneck_features(vgg_model, X_test)

# Hyper-parameters of the dense classifier trained on the VGG features.
batch_s = 400
learning_rate = 0.00005
dropout = 0.55
epochs_num = 10
l2_loss_lambda = 0.00000001
# Number of classes = number of one-hot label columns.
n_classes = y_train.shape[1]
l2 = regularizers.l2(l2_loss_lambda)
input_shape = vgg_model.output_shape[1]

# Four 1000-unit ReLU layers, each followed by dropout, then the class layer.
model = Sequential()
model.add(InputLayer(input_shape=(input_shape,)))
model.add(Dense(1000, activation='relu', kernel_regularizer=l2))
model.add(Dropout(dropout))
model.add(Dense(1000, activation='relu', kernel_regularizer=l2))
model.add(Dropout(dropout))
model.add(Dense(1000, activation='relu', kernel_regularizer=l2))
model.add(Dropout(dropout))
model.add(Dense(1000, activation='relu', kernel_regularizer=l2))
model.add(Dropout(dropout))
# Softmax: the classes are mutually exclusive and the loss is categorical
# cross-entropy, which expects a probability distribution over classes.
model.add(Dense(n_classes, activation='softmax', kernel_regularizer=l2))
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.RMSprop(learning_rate=learning_rate),
              metrics=['accuracy'])

# Labels are passed as float32 arrays so the frame's dtype cannot trip Keras.
history_vgg = model.fit(X_train_aug_vgg, np.asarray(y_train_aug, dtype='float32'),
                        batch_size=batch_s,
                        epochs=epochs_num,
                        validation_data=(X_val_vgg, np.asarray(y_val, dtype='float32')),
                        verbose=1)


def histogram(image, mask, bins):
    '''Return the normalised, flattened 3-channel histogram of a masked image region.

    image -- 3-channel image; the channel ranges used are [0, 180), [0, 256)
             and [0, 256), i.e. OpenCV's 8-bit HSV layout
    mask  -- uint8 mask selecting the region to measure
    bins  -- sequence giving the number of bins for each of the 3 channels

    The joint 3-D histogram is normalised and returned as one 1-D feature
    vector of length bins[0] * bins[1] * bins[2].
    '''
    channel_ranges = [0, 180, 0, 256, 0, 256]
    bin_counts = [bins[0], bins[1], bins[2]]
    counts = cv2.calcHist([image], [0, 1, 2], mask, bin_counts, channel_ranges)
    # OpenCV 2 returns the normalised array; later versions take a destination.
    if not imutils.is_cv2():
        return cv2.normalize(counts, counts).flatten()
    return cv2.normalize(counts).flatten()

def get_color_description(img_array, bins):
    '''
    Build the colour feature vector of an RGB image.

    The image is converted to HSV and split into 5 regions: a centre ellipse
    and the 4 corner quadrants with the ellipse cut out. Each region
    contributes one normalised HSV histogram; the 5 histograms are
    concatenated, corners first and the ellipse last.

    img_array -- RGB image, either floats scaled to 0-1 or integers in 0-255
    bins      -- number of histogram bins for the H, S and V channels

    Returns a flat list of 5 * bins[0] * bins[1] * bins[2] values.
    '''
    img = np.asarray(img_array)
    # Only float images are scaled to 0-255. Integer images are taken as
    # already being in that range (scaling a uint8 array would overflow).
    if np.issubdtype(img.dtype, np.floating):
        img = np.rint(img * 255)
    # cvtColor rejects float64; 8-bit input also gives the 0-180 / 0-256 HSV
    # ranges that histogram() bins over.
    img = np.clip(img, 0, 255).astype(np.uint8)
    # The images in this notebook are loaded with PIL, i.e. in RGB order.
    image = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)

    features = []
    (h, w) = image.shape[:2]
    (cX, cY) = (int(w * 0.5), int(h * 0.5))
    # Quadrants: top-left, top-right, bottom-right, bottom-left.
    segments = [(0, cX, 0, cY), (cX, w, 0, cY), (cX, w, cY, h), (0, cX, cY, h)]
    # Centre ellipse covering 75% of the width and height.
    (axesX, axesY) = (int(w * 0.75) // 2, int(h * 0.75) // 2)
    ellipMask = np.zeros(image.shape[:2], dtype="uint8")
    cv2.ellipse(ellipMask, (cX, cY), (axesX, axesY), 0, 0, 360, 255, -1)
    for (startX, endX, startY, endY) in segments:
        cornerMask = np.zeros(image.shape[:2], dtype="uint8")
        cv2.rectangle(cornerMask, (startX, startY), (endX, endY), 255, -1)
        # Remove the part of the quadrant that the ellipse already covers.
        cornerMask = cv2.subtract(cornerMask, ellipMask)
        features.extend(histogram(image, cornerMask, bins))
    features.extend(histogram(image, ellipMask, bins))
    return features

# Colour descriptor for every downloaded image (X holds the raw pixel arrays).
bins = [8, 8, 8]
color_feats = [get_color_description(x, bins) for x in X]

# VGG features on pixels scaled to 0-1, the same scaling used for the
# training data (X_std) and for the query image at inference time.
feats_arr = get_bottleneck_features(vgg_model, X / 255)

df['color_feats'] = color_feats
# One feature vector per row; a 2-D array cannot be assigned to one column.
df['vgg_feats'] = list(feats_arr)

grouped = df.groupby('category')


def classify(img_vgg, model):
    '''
    Predict the category of an image from its VGG feature vector.

    Feeds img_vgg to model.predict and returns the category name at the
    position of the largest value in the (flattened) prediction output.
    '''
    category_names = [
        'beaches/ocean',
        'entertainment',
        'gardens/zoo',
        'landmarks',
        'museums',
        'parks',
    ]
    scores = np.array(model.predict(img_vgg))
    best = np.argmax(scores)
    return category_names[best]

def get_distance(img_feats, feats):
    '''
    Return the cosine distance between two feature vectors.

    Both inputs are flattened first, so a (1, D) batch-of-one vector (as
    returned by a model's predict) can be compared with a (D,) vector;
    scipy's cosine only accepts 1-D input.
    '''
    return scipy.spatial.distance.cosine(np.ravel(img_feats), np.ravel(feats))

def get_recommendations(img_class, img_array, img_vgg, df):
    '''
    Rank attractions by similarity to a query image and show the best ones.

    img_class -- predicted category of the query image; selects how strongly
                 the VGG distance is weighted against the colour distance
    img_array -- query image, passed to get_color_description
    img_vgg   -- VGG feature vector of the query image
    df        -- candidate rows with 'name', 'location', 'url',
                 'color_feats' and 'vgg_feats' columns; it is not modified

    Returns a frame with the (up to) 3 closest attractions, holding the
    columns 'index', 'name', 'location', 'distance' and 'length'. Images of
    these attractions are displayed through show_recommendations.
    '''
    bins = [8, 8, 8]
    img_color_des = get_color_description(img_array, bins)
    # predict() returns a (1, D) batch; cosine distance needs 1-D vectors.
    query_vgg = np.ravel(img_vgg)

    # Per-row cosine distances, kept in local Series so that the caller's
    # feature columns are not overwritten.
    color_dist = df['color_feats'].apply(
        lambda feats: get_distance(img_color_des, feats))
    vgg_dist = df['vgg_feats'].apply(
        lambda feats: get_distance(query_vgg, np.ravel(feats)))

    # Scale each distance to 0-1 independently (fit_transform refits).
    min_max_scaler = preprocessing.MinMaxScaler()
    color_array = color_dist.values.astype(float).reshape(-1, 1)
    scaled_color_array = min_max_scaler.fit_transform(color_array).ravel()
    vgg_array = vgg_dist.values.astype(float).reshape(-1, 1)
    scaled_vgg_array = min_max_scaler.fit_transform(vgg_array).ravel()

    # Weight of the VGG distance per category; unlisted categories use 1.
    # classify() emits 'beaches/ocean'; the 'beaches/oceans' spelling is
    # kept as well for callers that pass it.
    vgg_weights = {
        'beaches/ocean': 0.5,
        'beaches/oceans': 0.5,
        'gardens/zoo': 10,
        'entertainment': 20,
        'landmarks': 20,
        'museums': 20,
    }
    total_distance = vgg_weights.get(img_class, 1) * scaled_vgg_array + scaled_color_array

    scored = df.drop(['color_feats', 'vgg_feats'], axis=1).assign(distance=total_distance)

    # Average the image distances per attraction.
    grouped_df = scored.groupby(['name', 'location'])['distance'].mean()
    grouped_df = pd.DataFrame(grouped_df).reset_index()

    # Keep only attractions whose location string is longer than 3 characters.
    grouped_df['length'] = grouped_df.location.str.len()
    grouped_df = grouped_df[grouped_df.length > 3]

    grouped_df = grouped_df.sort_values(by=['distance'], ascending=True)

    # May hold fewer than 3 rows when few attractions qualify.
    top_df = grouped_df[:3].reset_index()
    atts = top_df['name'].tolist()

    grouped = scored.groupby('name')
    groups = [grouped.get_group(attraction) for attraction in atts]

    show_recommendations(groups, atts)
    return top_df

def show_recommendations(groups, atts):
    '''
    Show up to 3 images for each recommended attraction.

    groups -- one frame per attraction, each with a 'url' column
    atts   -- attraction names, aligned with groups; used as figure titles

    One figure is drawn per attraction using the rows at positions 0, 2 and
    5 of its group; positions beyond the end of a small group are skipped.
    '''
    sample_positions = (0, 2, 5)
    for idx, group in enumerate(groups):
        df = pd.DataFrame(group).reset_index()
        # Skip positions the group does not have instead of raising KeyError.
        imgs = [df.loc[pos, 'url'] for pos in sample_positions if pos < len(df)]
        fig = plt.figure()
        fig.suptitle(atts[idx], fontsize="x-large")
        for i, url in enumerate(imgs):
            ax = fig.add_subplot(1, 3, i+1)
            # NOTE(review): load_image is not defined in the visible notebook;
            # presumably it fetches the image behind a URL - confirm it exists.
            image = load_image(url)
            ax.imshow(image, cmap='Greys_r')
            ax.axis('off')

def input_img_load_to_array(path):
    """Load the image at ``path`` as a ``(150, 150, 3)`` array scaled to 0-1.

    The image is converted to RGB first, so greyscale, palette and RGBA files
    still give the three-channel array the VGG extractor expects. The file
    handle is closed before returning.
    """
    with Image.open(path) as image:
        img = image.convert('RGB').resize((150, 150))
    return np.array(img) / 255

# Query image: load it, extract VGG features for a batch of one, classify it,
# then recommend attractions from the predicted category.
path = 'path to test img'
input_img = input_img_load_to_array(path)
img_vgg = get_bottleneck_features(vgg_model, np.array([input_img]))

label = classify(img_vgg, model)
group = grouped.get_group(label)
# The query image array is ``input_img`` (``image_array`` was never defined).
top_3 = get_recommendations(label, input_img, img_vgg, group)

NameError: name 'df' is not defined