# Book Cover in Recommender System

### import requirements

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random
import cv2
import urllib.request

In [2]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

In [3]:
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras.layers import concatenate
from keras.layers import Flatten
from keras.layers import Input
from keras.models import Model

### --- classes and functions ---

In [6]:
class NN_models:
    """Builders for the two Keras sub-networks defined in this notebook.

    The class keeps no state; each method only constructs and returns an
    uncompiled model.
    """

    def create_mlp(self, dim, regress=False):
        """Build a small fully-connected network.

        Args:
            dim: number of input features (passed to the first layer as
                ``input_dim``).
            regress: when True, append a single linear output node.

        Returns:
            A ``Sequential`` model: Dense(8, relu) -> Dense(4, relu),
            optionally followed by Dense(1, linear).
        """
        model = Sequential()
        model.add(Dense(8, input_dim=dim, activation="relu"))
        model.add(Dense(4, activation="relu"))

        # optional regression head
        if regress:
            model.add(Dense(1, activation="linear"))

        return model

    def create_cnn(self, height, width, depth, filters=(16, 32, 64), regress=False):
        """Build a small convolutional network for image input.

        Args:
            height: image height in pixels.
            width: image width in pixels.
            depth: number of image channels.
            filters: sequence with the number of filters of each
                CONV => RELU => BN => POOL stage, in order. An empty
                sequence skips the convolutional stages entirely.
            regress: when True, append a single linear output node.

        Returns:
            A functional ``Model`` mapping an image of shape
            ``(height, width, depth)`` to a 4-node ReLU layer (or to one
            linear node when ``regress`` is True).
        """
        # channels-last ordering: the channel axis is the last one
        chan_dim = -1

        inputs = Input(shape=(height, width, depth))

        # Start from the input tensor so that ``x`` is always bound, even
        # when ``filters`` is empty.
        x = inputs
        for n_filters in filters:
            # CONV => RELU => BN => POOL
            x = Conv2D(int(n_filters), (3, 3), padding="same")(x)
            x = Activation("relu")(x)
            x = BatchNormalization(axis=chan_dim)(x)
            x = MaxPooling2D(pool_size=(2, 2))(x)

        # flatten the volume, then FC => RELU => BN => DROPOUT
        x = Flatten()(x)
        x = Dense(16)(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=chan_dim)(x)
        x = Dropout(0.5)(x)

        # second FC layer with 4 nodes, the same width as the last hidden
        # layer built by create_mlp
        x = Dense(4)(x)
        x = Activation("relu")(x)

        # optional regression head
        if regress:
            x = Dense(1, activation="linear")(x)

        return Model(inputs, x)

In [72]:
def download_images(df, input_path):
    """Download every cover image listed in ``df`` into ``input_path``.

    Each image is saved as ``<index label>.jpg``, where the index label is
    the row's label in ``df.index``.

    Args:
        df: DataFrame with an ``image`` column holding one URL per row.
        input_path: existing directory that receives the downloaded files.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for a failing URL;
        the download stops at the first failure.
    """
    # NOTE(review): files are named after the index label, while
    # load_cover_images in this file looks them up by ISBN value -- confirm
    # the two agree for the frames that are actually passed in.
    # Iterating the column directly avoids building a full row object per
    # book and also works when index labels are not unique.
    for label, url in df['image'].items():
        target = os.path.join(input_path, f"{label}.jpg")
        urllib.request.urlretrieve(url, target)


def load_cover_images(df, input_path, size=(32, 32)):
    """Load the cover image of every book in ``df`` and resize it.

    Images are read from ``<input_path>//<isbn>.jpg``, one per value of the
    ``isbn`` column, in column order. Each path is printed before loading.

    Args:
        df: DataFrame with an ``isbn`` column.
        input_path: directory containing the ``.jpg`` files.
        size: target size handed to ``cv2.resize`` as ``(width, height)``;
            defaults to 32x32.

    Returns:
        A NumPy array stacking the resized images in the order of
        ``df['isbn']``.

    Raises:
        OSError: if an image file is missing or cannot be decoded.
    """
    images = []

    for isbn in df['isbn']:
        path = input_path + "//" + str(isbn) + '.jpg'
        print(path)
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports failure by returning None rather than
            # raising; fail here with the offending path instead of with an
            # opaque error inside cv2.resize.
            raise OSError(f"could not read cover image: {path}")
        images.append(cv2.resize(image, size))

    return np.array(images)

In [37]:
def Create_cover_image_data(train_data, test_data, images, scaling):
    """Assemble image inputs and scaled rating labels for the CNN.

    Args:
        train_data: frame with ``isbn`` and ``average_rating`` columns.
        test_data: frame with the same columns.
        images: indexable collection of images; the image for a book is
            taken from position ``isbn - 1``.
        scaling: divisor applied to the ratings.

    Returns:
        ``(train_images, train_Y, test_images, test_Y)`` as NumPy arrays.
    """
    def _images_for(frame):
        # the isbn values are used as 1-based positions into ``images``
        return np.asarray([images[book_id - 1] for book_id in frame['isbn']])

    def _scaled_ratings(frame):
        return np.asarray(frame.average_rating) / scaling

    return (
        _images_for(train_data),
        _scaled_ratings(train_data),
        _images_for(test_data),
        _scaled_ratings(test_data),
    )


def Create_user_book_data(train_data, test_data, data, scaling):
    """Assemble tabular features and scaled rating labels for the MLP.

    The feature matrix is the min-max scaled ``year`` and ``pages`` columns
    followed by the binarized ``author``, ``title`` and ``publisher``
    columns, in that order.

    Args:
        train_data: training frame; the scaler is fitted on it.
        test_data: test frame; transformed with the fitted scaler.
        data: frame whose categorical columns are used to fit the label
            binarizers.
        scaling: divisor applied to the ratings.

    Returns:
        ``(train_features, train_Y, test_features, test_Y)``.
    """
    # continuous columns: fit the scaler on the training rows only
    numeric_cols = ['year', 'pages']
    scaler = MinMaxScaler()
    train_blocks = [scaler.fit_transform(train_data[numeric_cols])]
    test_blocks = [scaler.transform(test_data[numeric_cols])]

    # categorical columns (a commented-out earlier variant listed
    # 'firstgenre' in place of 'publisher')
    for column in ('author', 'title', 'publisher'):
        encoder = LabelBinarizer().fit(data[column])
        train_blocks.append(encoder.transform(train_data[column]))
        test_blocks.append(encoder.transform(test_data[column]))

    train_features = np.hstack(train_blocks)
    test_features = np.hstack(test_blocks)

    train_labels = np.asarray(train_data.average_rating) / scaling
    test_labels = np.asarray(test_data.average_rating) / scaling

    return train_features, train_labels, test_features, test_labels

In [36]:
def split_data(df, split, n):
    """Randomly split ``n`` books of ``df`` into train and test frames.

    ``n`` ISBN values are sampled without replacement; the first
    ``round(n * split)`` of them form the training set and the remainder
    the test set, so every sampled book lands in exactly one of the two.

    Args:
        df: DataFrame with an ``isbn`` column.
        split: fraction of the sampled books assigned to training.
        n: number of books to sample; must not exceed ``len(df)``.

    Returns:
        ``(test, train)`` -- note the order: the test frame comes first.

    Raises:
        ValueError: from ``random.sample`` when ``n`` is larger than the
            number of rows.
    """
    sampled = random.sample(list(df['isbn']), n)
    n_train = int(np.around(n * split))

    # Slice at n_train on both sides; an end index of ``n_train - 1`` would
    # leave one sampled book out of both frames.
    train = df[df['isbn'].isin(sampled[:n_train])]
    test = df[df['isbn'].isin(sampled[n_train:])]

    return test, train

In [11]:
def NN_fit(train_data, train_Y, test_data, test_Y, model, epochs=25, batch_size=8):
    """Train ``model`` and report its error on the test set.

    Args:
        train_data: training inputs passed to ``model.fit``.
        train_Y: training targets.
        test_data: test inputs, also used as validation data during fitting.
        test_Y: test targets.
        model: object exposing ``fit`` and ``predict``; ``fit`` must return
            an object with a ``history`` mapping containing ``'val_loss'``.
        epochs: number of training epochs (default 25).
        batch_size: mini-batch size (default 8).

    Returns:
        ``(preds_test, preds_train, mean, std, val_loss)`` where ``mean``
        and ``std`` are the mean and standard deviation of the absolute
        percentage difference between test predictions and ``test_Y``, and
        ``val_loss`` is the per-epoch validation loss recorded by ``fit``.
    """
    # train the model
    print("[INFO] training model...")
    history = model.fit(
        train_data,
        train_Y,
        validation_data=(test_data, test_Y),
        epochs=epochs,
        batch_size=batch_size,
    )

    # make predictions on the testing data
    print("[INFO] predicting book ratings...")
    preds_test = model.predict(test_data)
    preds_train = model.predict(train_data)

    # Percentage error relative to the actual rating. The division is by
    # test_Y itself, so a zero label yields inf/nan for that entry.
    diff = preds_test.flatten() - test_Y
    abs_percent_diff = np.abs((diff / test_Y) * 100)

    mean = np.mean(abs_percent_diff)
    std = np.std(abs_percent_diff)

    return preds_test, preds_train, mean, std, history.history['val_loss']

## --- main ---

### load dataset

In [22]:
# Load books_agg.csv, using its first column as the row index, then preview
# the first three rows.
df1 = pd.read_csv('books_agg.csv', index_col=0)
df1.head(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,Location,Age,City,State,Country
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2.0,0.0,"stockton, california, usa",18.0,stockton,california,usa
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8.0,5.0,"timmins, ontario, canada",,timmins,ontario,canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,8.0,0.0,"timmins, ontario, canada",,timmins,ontario,canada


In [15]:
# Load the GoodReads table, skipping (with a warning) any row that has the
# wrong number of fields.
try:
    df2 = pd.read_csv('books/GoodReadBooks.csv', on_bad_lines='warn')
except TypeError:
    # pandas older than 1.3 has no ``on_bad_lines`` keyword; fall back to
    # the legacy flag, which newer pandas releases no longer accept.
    df2 = pd.read_csv('books/GoodReadBooks.csv', error_bad_lines=False)
df2.head(3)

b'Skipping line 3350: expected 12 fields, saw 13\nSkipping line 4704: expected 12 fields, saw 13\nSkipping line 5879: expected 12 fields, saw 13\nSkipping line 8981: expected 12 fields, saw 13\n'


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic


In [23]:
# Join the two tables on ISBN (df1's 'ISBN' against df2's 'isbn'). merge's
# default join type keeps only rows whose key appears in both frames.
df = df1.merge(df2, left_on='ISBN', right_on='isbn')
df.shape

(55412, 27)

In [32]:
# Keep a reference to the frame as it was before cleaning.
df_orig = df
# Drop every row that contains a missing value, then keep one row per ISBN.
# NOTE(review): the execution counter of this cell (In [32]) and the
# 11-column shape in its output suggest it was run after the
# column-selection cell (In [26]). Run top-to-bottom instead, dropna()
# would consider every merged column -- confirm the intended order.
df = df.dropna()
df = df.drop_duplicates(subset='isbn')
df.shape

(3382, 11)

In [26]:
# Narrow the frame to the eleven columns used from here on and give them
# short lowercase names. NOTE: the '  num_pages' key is spelled with two
# leading spaces; keep it byte-for-byte.
df = df[[
    'ISBN',
    'Book-Title',
    'Book-Author',
    'Year-Of-Publication',
    'Publisher',
    'Image-URL-S',
    'average_rating',
    'language_code',
    '  num_pages',
    'ratings_count',
    'text_reviews_count',
]].rename(columns={
    'ISBN': 'isbn',
    'Book-Title': 'title',
    'Book-Author': 'author',
    'Year-Of-Publication': 'year',
    'Publisher': 'publisher',
    'Image-URL-S': 'image',
    '  num_pages': 'pages',
})

In [28]:
# Count the distinct ISBN values in the frame and report the total.
n_books = len(set(df['isbn']))
print(f'There are {n_books} books in the dataset.')

There are 3382 books in the dataset.


In [57]:
# root_path must be assigned before anything is derived from it; a clean
# top-to-bottom run would otherwise fail with a NameError.
root_path = r'/Users/cinny/GitHub/ml_ecommerce/books_amazon/'

# directory named NN under root_path, created if it does not exist yet
save_path = root_path + r'/NN'
os.makedirs(save_path, exist_ok=True)

# download images from web into the img_s directory under root_path
input_path = root_path + r'/img_s'
os.makedirs(input_path, exist_ok=True)
download_images(df, input_path)

KeyboardInterrupt: 