In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

#keras
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam

# Input data files are available in the read-only "../input/" directory.
# (The boilerplate os.walk listing was dropped: its print was commented out,
# so it traversed the whole input tree without producing anything.)

import os

# Root folder of the competition data; every other path is built from it.
dataset_path = '/kaggle/input/petfinder-pawpularity-score/'

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Taking a look at the Data - (EDA)**

In [2]:
# Read the three competition tables in one pass: training data, test data
# and the sample submission file.
train_df, test_df, submission_df = (
    pd.read_csv(f"{dataset_path}{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

train_df.head()

In [3]:
submission_df.head()

In [4]:
test_df.head()

In [5]:
# Attach the full image path to every row.
# The 'Id' column is kept on purpose: the test Ids are needed for the submission.
train_df['Img'] = dataset_path + 'train/' + train_df['Id'] + '.jpg'
test_df['Img'] = dataset_path + 'test/' + test_df['Id'] + '.jpg'

train_df.head()

**Checking the distribution of the target variable - Pawpularity**

In [6]:
train_df['Pawpularity'].describe()

In [7]:
train_df['Pawpularity'].hist(figsize=(10,5))

The above looks skewed

**Checking to see the distribution**

In [8]:
sns.set(rc={'figure.figsize':(14,9)})

# Histogram with a KDE overlay; the green vertical line marks the mean score.
fig, ax = plt.subplots()
sns.histplot(data=train_df, x='Pawpularity', kde=True, ax=ax)
ax.axvline(train_df['Pawpularity'].mean(), c='green', ls='-', lw=3, label="Mean Pawpularity")
ax.set_title('Pawpularity score Histogram', fontsize=20, fontweight='bold')
ax.legend()
plt.show()

The histogram alone is not sufficient to decide whether the scores are normally distributed; it only shows that they are centred around 38. We will check the normality of the distribution with a quantile-quantile (Q-Q) plot.

In [9]:
from statsmodels.graphics.gofplots import qqplot

# Draw on an explicit axes: qqplot opens a figure of its own when no `ax` is
# given, which would leave a separately created figure empty.
fig, ax = plt.subplots()
qqplot(train_df['Pawpularity'], line='s', ax=ax)
ax.set_title('Quantile-Quantile plot of Pawpularity distribution')
plt.show()

From the QQPlot above, we can also see the target column is not normally distributed. We will now check with the Kolmogorov-Smirnov test to further confirm

In [10]:
from scipy.stats import kstest

# Compare the scores with a normal distribution that has the sample's own mean
# and standard deviation. Passing a bare 'norm' tests against N(0, 1), which
# any variable centred far from 0 rejects regardless of its shape.
# NOTE: the parameters are estimated from the same data, so the p-value is
# only approximate (this is the situation the Lilliefors correction addresses).
pawpularity = train_df['Pawpularity']
stat, p = kstest(pawpularity, 'norm', args=(pawpularity.mean(), pawpularity.std(ddof=1)))
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
    print(f'Data seem Gaussian. We will not reject H0 at {int(alpha*100)}% test level')
else:
    print(f'Data is not Gaussian We will reject H0 at {int(alpha*100)}% test level')

**Let us now take a look at the distribution of the rest of the parameters**

In [11]:
train_df.info()

**There are no missing values**

In [12]:
# Every column except the first one and the last two
# (presumably 'Id' in front, 'Pawpularity' and the added 'Img' at the end).
predictor = train_df.columns[1:-2]

# One count plot per predictor on a 3 x 4 grid.
fig = plt.figure(figsize=(25,20))
for position, column in enumerate(predictor, start=1):
    panel = fig.add_subplot(3, 4, position)
    sns.countplot(data=train_df, x=column, ax=panel)
    panel.set_xlabel(None)
    panel.set_title(column, fontweight='bold', color="#e7323f")

fig.suptitle("Predictor distribution", y=0.93,
             fontsize=20, fontweight='bold')
plt.show()

In [13]:
# Correlation heatmap of the numeric columns.
# `train_df[:-1]` dropped the last *row*, not a column, and left the string
# columns ('Id', 'Img') in the frame passed to .corr(); select the numeric
# columns explicitly and draw on the axes created here.
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(train_df.select_dtypes(include='number').corr(), annot=True, ax=ax)

From the above, we can see there is a bit of high correlation (>0.5) between Eyes and Face, and Occlusion and Human. Info and Collage also come close but not up to 0.5

We, therefore also need to check if there is too much multicollinearity that could degrade the performance of our models. For this we will use Variance Inflation Factor (VIF) from Statsmodel

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = train_df[predictor]

# One VIF per predictor column, listed from the most to the least inflated.
# NOTE(review): no intercept column is added to X before calling
# variance_inflation_factor — confirm against the statsmodels documentation
# whether a constant is expected here.
vif = pd.DataFrame({
    "Feature": X.columns,
    "VIF": [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
}).sort_values("VIF", ascending=False)
vif

From the table above, Face and Eyes have very high VIF values, which means the two parameters are highly correlated. Keeping both parameters in our model would therefore introduce strong multicollinearity. To avoid this, we will use only one of the two.

We remove the column(Parameter) with the highest VIF.

In [15]:
# Rebuild X from train_df instead of dropping in place: X is a column slice of
# train_df (in-place edits on it trigger SettingWithCopyWarning) and an
# in-place drop fails with KeyError when the cell is run a second time.
X = train_df[predictor].drop(columns="Face")
X.head()

**Correlations between predictor variables and Pawpularity**

We will now check whether there are strong linear correlations (Pearson) between the predictor variables and the variable to be predicted (Pawpularity).

In [16]:
# Pearson correlation of each retained predictor with the target, rounded to 4 decimals.
target = train_df["Pawpularity"]
for feature in X.columns:
    pearson_r = round(np.corrcoef(train_df[feature], target)[0, 1], 4)
    print(f"Pawpularity - {feature}: {pearson_r}")

From the above, there seems to be no meaningful linear correlation between the target variable and any of the other parameters.

**Taking a look at some sample images**

In [17]:
from PIL import Image

im = Image.open(train_df['Img'][15])
width, height = im.size
print(width,height)

In [18]:
im

In [19]:
fig, axes = plt.subplots(2,3,figsize=(15,9))
fig.patch.set_facecolor('#343434')

# Six random rows; columns are read by label because integer keys on a
# label-indexed Series are deprecated as positional access.
sample_rows = train_df[['Img', 'Pawpularity']].sample(6)
for (row_id, row), a in zip(sample_rows.iterrows(), axes.ravel()):
    a.set(xticks=[], yticks=[])
    a.imshow(plt.imread(row['Img']))
    # row_id is the DataFrame index label of the sampled row.
    a.set_title(f'Id: {row_id}, Pawpularity Score: {row["Pawpularity"]}', color="white")

fig.suptitle('Pawpularity Images', fontsize=20, fontweight='bold', color="#e7273e")
fig.tight_layout()
# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

**Let us look at 3 of the most popular and 3 of the least popular to see if there are any physical difference**

In [20]:
top = train_df[train_df['Pawpularity'] == 100]['Img']
top

In [21]:
# Paths of the images that received the maximum score of 100.
top = train_df[train_df['Pawpularity'] == 100]['Img']

fig, axes = plt.subplots(1,3)
fig.patch.set_facecolor('#343434')

# Separate names for the axes array and the per-image axes, so the loop does
# not rebind the array it is iterating over.
for img_path, panel in zip(top.sample(3), axes.ravel()):
    panel.set(xticks=[], yticks=[])
    panel.imshow(plt.imread(img_path))

fig.suptitle('Most Pawpular Images', fontsize=20, color='#7bbfc5', y=0.95)
fig.tight_layout()
# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [22]:
# Paths of the images that received the minimum score of 1.
bottom = train_df[train_df['Pawpularity'] == 1]['Img']

fig, axes = plt.subplots(1,3)
fig.patch.set_facecolor('#343434')

# Separate names for the axes array and the per-image axes, so the loop does
# not rebind the array it is iterating over.
for img_path, panel in zip(bottom.sample(3), axes.ravel()):
    panel.set(xticks=[], yticks=[])
    panel.imshow(plt.imread(img_path))

fig.suptitle('Least Pawpular Images', fontsize=20, color='#7bbfc5', y=0.95)
fig.tight_layout()
# plt.show() rather than fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

# Feature Engineering

Since we are predicting the popularity of pictures, we will therefore extract features that can help determine (increase or decrease) the likeness or popularity of the pictures such as the background or dominant color.

We will start with extracting the dominant colors of the images and store as a parameter.
We will be using clustering methods on the RGB layers of our image files to extract the dominant color in HLS (Hue Lightness Saturation) format. This format will allow us to recover in a single formula the information on the hue, saturation and luminance of the dominant color of each image.

In [23]:
from sklearn.cluster import KMeans
from collections import Counter
import cv2


def get_dominant_color(image_path, k=4, image_processing_size = None):
    """
    takes an image as input
    returns the dominant color of the image as a list
    
    dominant color is found by running k means on the 
    pixels & returning the centroid of the largest cluster

    processing time is speed up by working with a smaller image; 
    this resizing can be done with the image_processing_size param 
    which takes a tuple of image dims as input
    """
    
    image = plt.imread(image_path)
    #resize image if new dims provided
    if image_processing_size is not None:
        image = cv2.resize(image, image_processing_size, 
                            interpolation = cv2.INTER_AREA)
    
    #reshape the image to be a list of pixels
    image = image.reshape((image.shape[0] * image.shape[1], 3))

    #cluster and assign labels to the pixels 
    clt = KMeans(n_clusters = k)
    labels = clt.fit_predict(image)

    #count labels to find most popular
    label_counts = Counter(labels)

    #subset out most popular centroid
    dominant_color = clt.cluster_centers_[label_counts.most_common(1)[0][0]]
    dominant_color = list(dominant_color)
    r = int(dominant_color[0])
    g = int(dominant_color[1])
    b = int(dominant_color[2])
    
    #Convert to HLS color space
    dominant_hls = colorsys.rgb_to_hls(r, g, b)

    return list(dominant_hls)

Taking a look at sample Image

In [24]:
import colorsys
import matplotlib

sample_img = train_df['Img'][103]
sample_hls = get_dominant_color(sample_img, k=3, image_processing_size = (50, 50))
sample_dom_color = colorsys.hls_to_rgb(sample_hls[0],
                                       sample_hls[1],
                                       sample_hls[2])
sample_dom_color = "#{:02x}{:02x}{:02x}".format(int(sample_dom_color[0]),
                                                int(sample_dom_color[1]),
                                                int(sample_dom_color[2]))
print("Dominant HLS : ", sample_hls)
print("Dominant Color Hex : ", sample_dom_color)

fig = plt.figure(figsize=(12,5))
ax = fig.add_subplot(121)
ax = plt.imshow(plt.imread(sample_img))
ax2 = fig.add_subplot(122)
rect1 = matplotlib.patches.Rectangle((0,0), 10, 5,color=sample_dom_color)
ax2.add_patch(rect1)
plt.axis('off')
plt.suptitle('Dominant color of sample image', fontsize=20, fontweight='bold', y=0.98)
fig.tight_layout()
plt.show()

Applying to all images

In [25]:
from tqdm.notebook import tqdm

# Register progress_apply on pandas objects so the extraction shows a progress bar.
tqdm.pandas()
train_df['Dominant_color_hls'] = train_df['Img'].progress_apply(
    lambda img_path: get_dominant_color(
        img_path,
        k=3,
        image_processing_size = (50, 50)))

In [None]:
train_df.head()

In [None]:
# Split the [H, L, S] lists into three columns, append them to train_df and
# discard the original list column.
temp_train_df = pd.DataFrame(
    train_df["Dominant_color_hls"].tolist(),
    columns=["H", "L", "S"],
    index=train_df.index,
)
train_df = pd.concat([train_df, temp_train_df], axis=1).drop(columns="Dominant_color_hls")
train_df.head()

Looking at the distribution of H, L, S

In [None]:
# One histogram per HLS component, side by side.
fig = plt.figure(figsize=(20,6))
hls_panels = (("H", "Hue"), ("L", "Luminance"), ("S", "Saturation"))
for slot, (column, label) in enumerate(hls_panels, start=131):
    panel = fig.add_subplot(slot)
    sns.histplot(train_df[column], ax=panel)
    panel.set_title(label, fontsize=17, color="#186fb4")
plt.suptitle('Dominant HLS color of train images',
             fontsize=20, fontweight='bold', y=0.98)
fig.tight_layout()
plt.show()

In [None]:
# Same dominant-colour extraction as for the training images, applied to the test images.
test_df["Dominant_color_hls"] = test_df["Img"].progress_apply(
    lambda img_path: get_dominant_color(img_path, k=3, image_processing_size = (50, 50)))

In [None]:
# Split the [H, L, S] lists into three columns, append them to test_df and
# discard the original list column.
temp_test_df = pd.DataFrame(
    test_df["Dominant_color_hls"].tolist(),
    columns=["H", "L", "S"],
    index=test_df.index,
)
test_df = pd.concat([test_df, temp_test_df], axis=1).drop(columns="Dominant_color_hls")
test_df.head()

**Resizing Images**

We will have to perform resize to obtain input_shape conforming to what the models we will be running expect. We are therefore going to save the initial size of the image in a variable. This could also have an impact on the popularity of the photo.

In [None]:
def get_img_size(path):
    width = []
    height = []
    landscape = []
    for image_path in tqdm(os.listdir(path)):
        image = plt.imread(path+image_path)
        width.append(image.shape[1])
        height.append(image.shape[0])
        if(image.shape[1] > image.shape[0]):
            landscape_img = 1
        else:
            landscape_img = 0
        landscape.append(landscape_img)
    return width, height, landscape

In [None]:
TRAIN_PATH = "../input/petfinder-pawpularity-score/train/"
TEST_PATH = "../input/petfinder-pawpularity-score/test/"

train_df['Width'], train_df['Heihgt'], train_df['Landscape'] = get_img_size(TRAIN_PATH)

In [None]:
train_df.head()

In [None]:
test_df['Width'], test_df['Heihgt'], test_df['Landscape'] = get_img_size(TEST_PATH)

test_df.head()

**Final Training Data**

We will now define the final dataset. 
Remember we agreed to remove Face due to the high multicollinearity.
We also rescale the target variable, Pawpularity, by dividing it by 100. Note that this only changes its scale; it does not make the distribution normal.

In [None]:
# Image paths as a single-column array, and the target divided by 100 as a flat array.
img = train_df[["Img"]].values
y = (train_df["Pawpularity"] / 100).to_numpy()

# Feature matrices: identifiers, paths, 'Face' and (train only) the target are left out.
X = train_df.drop(columns=["Id","Img", "Face", "Pawpularity"])
X_test = test_df.drop(columns=["Id", "Face", "Img"])

In [None]:
# Normalization

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# The min-max ranges are learned on the training features and then reused
# unchanged for the test features.
encoder = MinMaxScaler()
X_scaled = pd.DataFrame(encoder.fit_transform(X), columns=X.columns)
X_test_scaled = pd.DataFrame(encoder.transform(X_test), columns=X_test.columns)

**Now we will use RandomForest to determine feature importance and GridSearchCV to find the best hyperparameters**

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed.
# NOTE(review): the split is taken from the unscaled X; X_scaled is not used here.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=38)

for split_name, split in (("X_train", X_train), ("X_valid", X_valid),
                          ("y_train", y_train), ("y_valid", y_valid)):
    print(f'{split_name}: {split.shape}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over the forest size, depth, feature sampling and bootstrapping.
regressor = RandomForestRegressor(random_state=8)
param_grid = dict(
    n_estimators=[10,20,50,75,100,150],
    max_features=["log2", "sqrt"],
    max_depth=[5,10,15,25,35,50],
    bootstrap=[True, False],
)

grid_reg = GridSearchCV(estimator=regressor, param_grid=param_grid,
                        cv=5, verbose=1, n_jobs=-1)

# fit() returns the fitted search object itself, so best_reg and grid_reg are the same object.
best_reg = grid_reg.fit(X_train, y_train)

In [None]:
print(f"The best parameters for the model is: \n {best_reg.best_params_}")

**Save the Best Parameter model**

In [None]:
import pickle

# Persist the fitted grid-search object in the working directory.
with open('best_randforst_param.sav', mode='wb') as best_randforst_param:
    pickle.dump(best_reg, best_randforst_param)

# To restore it later (only unpickle files you trust):
# loaded_model = pickle.load(open(filename, 'rb'))

Now, we will plot the importance of features in the modeling:

In [None]:
# Importances of the best forest found by the search, largest first.
importances = best_reg.best_estimator_.feature_importances_
feature_names = X_train.columns

forest_importances = (
    pd.DataFrame({"FI": importances}, index=feature_names)
    .sort_values("FI", ascending=False)
)

fig, ax = plt.subplots()
sns.barplot(data=forest_importances, x = "FI",
            y=forest_importances.index, ax=ax,
            palette="Blues_d")
ax.set_title("Feature importances of RandomForestRegressor", fontweight='bold')
ax.set(xlabel="Mean decrease in impurity", ylabel="Features")
fig.tight_layout()

**Now let us use this random Forest model to perform prediction on the validation set**

In [None]:
pred_reg = best_reg.predict(X_valid)

In [None]:
fig = plt.figure(figsize=(12,8))
plt.scatter(x=pred_reg, y=y_valid)
plt.ylabel("Pawpularity real values (y_valid)")
plt.xlabel("Predicted values (rfr_pred)")
plt.title("Predicted Pawpularity VS True values with RandomForest", 
          fontsize=15, fontweight='bold')
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"Mean Squared Error: {mean_squared_error(y_valid, pred_reg)}")
print(f"R Squared Score: {r2_score(y_valid, pred_reg)}")

From the above, and especially from the R-squared score, we can see that the model performed very poorly. This means the tabular features alone are not a sufficient predictor of popularity.

**We will now predict the score from the images themselves, using transfer learning with NASNetLarge**

In [None]:
# Load the Keras NASNetLarge backbone: no classification head, global average
# pooling on top, 299x299 RGB input. The architecture is built without weights;
# they are read from a local file right after.
nasnet_model = tf.keras.applications.NASNetLarge(
    input_shape=(299,299,3),
    include_top=False,
    pooling='avg',
    weights=None,
    input_tensor=None)

nasnet_model.load_weights('../input/keras-applications-models/NASNetLarge.h5')

# Freeze the backbone so that only layers added on top of it are trained
nasnet_model.trainable = False

For better use in Keras, we will create generators by slightly modifying our DataFrame Pandas. We will indeed add the name (and extension) of the image files to our y DataSets.

In [None]:
# Build the generator table in one expression: .assign returns a new frame,
# so no column is written into a slice of train_df (SettingWithCopyWarning).
# 'Image' is the file name expected by flow_from_dataframe; the target is
# divided by 100.
k_df = train_df[["Id","Pawpularity"]].assign(
    Image=lambda frame: frame["Id"] + ".jpg",
    Pawpularity=lambda frame: frame["Pawpularity"] / 100,
)
k_df.head()

In [None]:
# 80/20 hold-out split of the image file names and their scaled scores, fixed seed.
k_X_train, k_X_valid, k_y_train, k_y_valid = train_test_split(
    k_df["Image"], k_df["Pawpularity"], test_size=0.2, random_state=38)

for label, size in (("X_train", k_X_train.shape), ("X_test", k_X_valid.shape),
                    ("y_train", k_y_train.shape[0]), ("y_test", k_y_valid.shape[0])):
    print(f"{label} : {size}")

In [None]:
# Re-assemble file name and score into one frame per split
# (the scores are aligned with the file names on the index).
k_train_df = k_X_train.to_frame(name="Image").assign(Pawpularity=k_y_train)
k_valid_df = k_X_valid.to_frame(name="Image").assign(Pawpularity=k_y_valid)

In [None]:
k_train_df.head()

In [None]:
# Both generators apply the NASNet input preprocessing; only `datagen`
# reserves 20% of its rows as a validation subset.
nasnet_preprocess = tf.keras.applications.nasnet.preprocess_input

datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    validation_split=0.2,
    preprocessing_function=nasnet_preprocess)
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=nasnet_preprocess)

In [None]:
# Options shared by every generator that reads from the training image folder.
flow_kwargs = dict(
    directory=dataset_path+"train/",
    x_col="Image",
    y_col="Pawpularity",
    target_size=(299,299),
    batch_size=32,
    seed=42,
    class_mode="raw")

# Training / validation subsets of k_train_df (split by datagen's validation_split).
train_generator = datagen.flow_from_dataframe(
    dataframe=k_train_df,
    subset="training",
    shuffle=True,
    **flow_kwargs)

valid_generator = datagen.flow_from_dataframe(
    dataframe=k_train_df,
    subset="validation",
    shuffle=True,
    **flow_kwargs)

# Hold-out generator over k_valid_df. It must NOT shuffle: its predictions are
# compared row by row with k_y_valid, so they have to come out in k_valid_df
# order. test_datagen is used because no subset split is wanted here.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=k_valid_df,
    shuffle=False,
    **flow_kwargs)

In [None]:
# Regression head on top of the pooled NASNet features:
# Dense(128) + Dropout -> Dense(256) -> Dense(128) + Dropout -> Dense(1, linear)
head = nasnet_model.output
for units, drop_rate in ((128, 0.2), (256, None), (128, 0.2)):
    head = Dense(units, activation='relu')(head)
    if drop_rate is not None:
        head = Dropout(drop_rate)(head)
# Output : a single linear unit predicting the scaled score
predictions = Dense(1, activation='linear')(head)

# Define new model, trained with Adam on the mean squared error
my_nasnet_model = Model(inputs=nasnet_model.input, outputs=predictions)
my_nasnet_model.compile(optimizer="adam", loss=tf.keras.metrics.mean_squared_error)

In [None]:
# Number of whole batches per epoch (integer division drops a final partial batch)
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

# Early Stopping to prevent overfitting: stop after 15 epochs without
# improvement of val_loss and restore the best weights seen
early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    restore_best_weights=True,
    patience=15,
    verbose=3)

fit_options = dict(
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=50,
    verbose=2,
    callbacks=[early_stopper])

history_nasnet = my_nasnet_model.fit(train_generator, **fit_options)

In [None]:
# Training and validation loss per epoch. The model is compiled with the mean
# squared error as its loss, so the curves are MSE values (not RMSE).
fig = plt.figure(figsize=(12, 8))
plt.plot(history_nasnet.history["loss"],
         color="#186fb4", linestyle="-.",
         label="Train")
plt.plot(history_nasnet.history["val_loss"],
         color="#186fb4",
         label="Validation")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.title("MSE loss of NasNetLarge model for Pawpularity",
          fontsize=20, fontweight='bold')
plt.show()

From the above, we see a downward slope of the training loss but the validation barely dropped. Lets try predicting

In [None]:
nasnet_pred = my_nasnet_model.predict(test_generator)
nasnet_pred.shape

In [None]:
fig = plt.figure(figsize=(12,8))
plt.scatter(x=nasnet_pred, y=k_y_valid)
plt.ylabel("Pawpularity real values (k_y_valid)")
plt.xlabel("Predicted values (nasnet_pred)")
plt.title("Predicted Pawpularity VS True values with NasnetLarge", 
          fontsize=20, fontweight='bold')
plt.show()

In [None]:
print(f"R Squared Score: {r2_score(k_y_valid, nasnet_pred)}")

Again from the above, the model did poorly (though better than the RandomForest Model). Hence the model alone is not enough to predict the target value.

We will now try optimizing our model 

# Transfer Learning Optimization

In [None]:
# Augmenting generator for the NASNet model.
# - featurewise_std_normalization is not enabled: it needs statistics computed
#   by datagen_2.fit(...), which is never called, so Keras would only emit a
#   warning and skip the normalization.
# - The preprocessing function is the NASNet one, matching the backbone and
#   the non-augmented generators.
# NOTE: with validation_split on this same generator, the "validation" subset
# is augmented as well.
datagen_2 = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=20, # rotation
    width_shift_range=0.2, # horizontal shift
    height_shift_range=0.2, # vertical shift
    zoom_range=0.2, # zoom
    horizontal_flip=True, # horizontal flip
    preprocessing_function=tf.keras.applications.nasnet.preprocess_input,
    validation_split=0.2)

In [None]:
# Options common to both augmented generators; only the subset differs.
aug_flow_kwargs = dict(
    dataframe=k_train_df,
    directory=dataset_path+"train/",
    x_col="Image",
    y_col="Pawpularity",
    target_size=(299,299),
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="raw")

train_generator_2 = datagen_2.flow_from_dataframe(subset="training", **aug_flow_kwargs)

valid_generator_2 = datagen_2.flow_from_dataframe(subset="validation", **aug_flow_kwargs)

In [None]:
STEP_SIZE_TRAIN = train_generator_2.n//train_generator_2.batch_size
STEP_SIZE_VALID = valid_generator_2.n//valid_generator_2.batch_size

In [None]:
# NOTE(review): clear_session() is called while my_nasnet_model is still in
# use below — confirm it is needed here.
tf.keras.backend.clear_session()
# Continues training the same my_nasnet_model (already fitted once on the
# non-augmented generators) on the augmented generators, reusing early_stopper.
history_nasnet_2 = my_nasnet_model.fit(
    train_generator_2,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator_2,
    validation_steps=STEP_SIZE_VALID,
    epochs=50,
    verbose=2,
    callbacks=[early_stopper])

In [None]:
# Training and validation loss of the second (augmented) training run. The
# model is still the NASNetLarge-based one and its loss is the mean squared
# error, so the figure is labelled accordingly.
fig = plt.figure(figsize=(12, 7))
plt.plot(history_nasnet_2.history["loss"],
         color="#186fb4", linestyle="-.",
         label="Train")
plt.plot(history_nasnet_2.history["val_loss"],
         color="#186fb4",
         label="Validation")
plt.xlabel("Epoch")
plt.ylabel("MSE loss")
plt.legend()
plt.title("MSE loss of NasNetLarge augmented model for Pawpularity",
          fontsize=20, fontweight='bold')
plt.show()

In [None]:
nasnet_pred_2 = history_nasnet_2.model.predict(test_generator)
nasnet_pred_2.shape

In [None]:
# Predictions of the augmented NASNetLarge run against the true hold-out scores.
fig = plt.figure(figsize=(12,8))
plt.scatter(x=nasnet_pred_2, y=k_y_valid)
plt.ylabel("Pawpularity real values (k_y_valid)")
plt.xlabel("Predicted values (nasnet_pred_2)")
plt.title("Predicted Pawpularity VS True values with augmented NasNetLarge",
          fontsize=20, fontweight='bold')
plt.show()

In [None]:
print(f"R Squared Score: {r2_score(k_y_valid, nasnet_pred_2)}")

The augmented model looks a bit better but still fails to predict popularity scores reliably enough

# Hybrid approach with feature detection and RandomForest

We are therefore going to use a hybrid approach consisting in carrying out the feature detection with NASNetLarge, then in coupling the results with the database of image characteristics to finally predict y with a RandomForestRegressor.

In [None]:
def feature_detect_img(folder, img_size=299, ids=None, batch_size=32):
    """
    Extract one NASNet feature vector per image of a dataset folder.

    Parameters
    ----------
    folder : sub-folder of dataset_path holding the images ("train" or "test")
    img_size : images are resized to (img_size, img_size) before inference
    ids : optional iterable of image Ids (file names without ".jpg") giving the
        order of the results. When omitted, the order of the 'Id' column of
        dataset_path + folder + ".csv" is used if that file exists, so the
        vectors can be concatenated side by side with the matching DataFrame;
        otherwise the folder is listed in sorted name order (os.listdir alone
        returns the names in arbitrary order).
    batch_size : number of images sent to the network per predict call

    Returns
    -------
    list with one array per image, each holding that image's feature vector
    behind a leading axis of length 1.
    """
    img_dir = dataset_path+folder+"/"
    if ids is None:
        csv_path = dataset_path+folder+".csv"
        if os.path.exists(csv_path):
            ids = pd.read_csv(csv_path, usecols=["Id"])["Id"]
    if ids is None:
        filenames = sorted(os.listdir(img_dir))
    else:
        filenames = [str(img_id) + ".jpg" for img_id in ids]

    listVectors = []
    for start in tqdm(range(0, len(filenames), batch_size)):
        batch = []
        for img in filenames[start:start + batch_size]:
            image = plt.imread(img_dir+img)
            # every image is brought to the same square size so they can be stacked
            image = cv2.resize(image, (img_size,img_size),
                               interpolation = cv2.INTER_AREA)
            batch.append(image)
        batch = tf.keras.applications.nasnet.preprocess_input(np.stack(batch))

        # One forward pass per batch instead of one per image.
        batch_vectors = nasnet_model.predict(batch, verbose=0)
        listVectors.extend(np.array(vector)[np.newaxis, :] for vector in batch_vectors)

    return listVectors

In [None]:
train_vectors_fd = feature_detect_img("train", img_size=299)

In [None]:
# Turn the list of per-image vectors into an (n_images, n_features) table.
# An explicit reshape keeps the image axis even when there is a single image,
# which np.squeeze would have collapsed.
train_vectors_fd = np.asarray(train_vectors_fd)
train_vectors_fd = train_vectors_fd.reshape(len(train_vectors_fd), -1)
train_vectors_fd = pd.DataFrame(train_vectors_fd)
train_vectors_fd.shape

In [None]:
hy_train_df = pd.concat([train_df,train_vectors_fd], axis=1)
hy_train_df.head()

In [None]:
hy_train_df.columns

In [None]:
h_labels = hy_train_df["Id"]
h_y = hy_train_df["Pawpularity"]
h_X = hy_train_df.drop(["Id","Pawpularity", "Img"], axis=1)
# The NASNet feature columns are named with integers while the metadata
# columns have string names; recent scikit-learn versions reject mixed-type
# feature names, so every name is converted to a string.
h_X.columns = h_X.columns.astype(str)

# Normalization
encoder = MinMaxScaler()
encoder.fit(h_X)
h_X_scaled = encoder.transform(h_X)
h_X_scaled = pd.DataFrame(h_X_scaled, columns=h_X.columns)

# 70/30 hold-out split with a fixed seed
h_X_train, h_X_valid, h_y_train, h_y_valid = train_test_split(
    h_X_scaled, h_y, test_size=0.3, random_state=38)

In [None]:
print(f"X_train : {h_X_train.shape}")
print(f"X_test : {h_X_valid.shape}")
print(f"y_train : {h_y_train.shape[0]}")
print(f"y_test : {h_y_valid.shape[0]}")

In [None]:
# Same kind of 5-fold grid search as for the tabular model, on the hybrid features.
h_rfr = RandomForestRegressor(random_state=8)
param_grid = dict(
    n_estimators=[10,50, 75, 100, 150],
    max_features=["log2", "sqrt"],
    max_depth=[5,15,25, 35, 50],
    bootstrap=[True, False],
)

h_grid_rfr = GridSearchCV(estimator=h_rfr, param_grid=param_grid,
                          cv=5, verbose=2, n_jobs=-1)

# fit() returns the fitted search object itself.
h_best_rfr = h_grid_rfr.fit(h_X_train, h_y_train)

In [None]:
print(f"The best parameters for Randomforest: {h_best_rfr.best_params_}")

In [None]:
h_rfr_pred = h_best_rfr.predict(h_X_valid)

In [None]:
fig = plt.figure(figsize=(12,8))
plt.scatter(x=h_rfr_pred, y=h_y_valid)
plt.ylabel("Pawpularity real values (y_valid)")
plt.xlabel("Predicted values (rfr_pred)")
plt.title("Predicted Pawpularity VS True values with RandomForest", 
          fontsize=20, fontweight='bold')
plt.show()

In [None]:
# Submission table: the test Ids plus the matching image file names.
submission_df = pd.read_csv(dataset_path + "test.csv")
submission_df = submission_df[["Id"]]
submission_df["Image"] =  submission_df["Id"].apply(lambda image_id: image_id+".jpg")

# Unlabelled, unshuffled generator over the test images, so predictions keep
# the row order of submission_df.
submission_flow_kwargs = dict(
    directory=dataset_path+"test/",
    x_col="Image",
    y_col=None,
    target_size=(299,299),
    batch_size=32,
    seed=42,
    shuffle=False,
    class_mode=None)
submission_generator = test_datagen.flow_from_dataframe(
    dataframe=submission_df, **submission_flow_kwargs)

In [None]:
# Predictions of the fine-tuned NASNet model on the test images.
# NOTE(review): apart from the shape display below, this result is not used —
# submission_pred is reassigned from h_best_rfr.predict(...) in a later cell
# before anything else reads it.
submission_pred = my_nasnet_model.predict(submission_generator)
submission_pred.shape

In [None]:
test_vectors_fd = feature_detect_img("test", img_size=299)

In [None]:
# Turn the list of per-image vectors into an (n_images, n_features) table.
# An explicit reshape keeps the image axis even when there is a single image,
# which np.squeeze would have collapsed.
test_vectors_fd = np.asarray(test_vectors_fd)
test_vectors_fd = test_vectors_fd.reshape(len(test_vectors_fd), -1)
test_vectors_fd = pd.DataFrame(test_vectors_fd)
test_vectors_fd.shape

In [None]:
hy_test_df = pd.concat([test_df,test_vectors_fd], axis=1)
hy_test_df.head()

In [None]:
h_test_labels = hy_test_df["Id"]
# Drop the same non-feature columns as for the training matrix: 'Img' holds
# path strings and was not part of the data the scaler was fitted on.
h_X_test = hy_test_df.drop(["Id", "Img"], axis=1)
# Use string column names throughout (the NASNet columns are integers), then
# select exactly the training columns, in the training order.
h_X_test.columns = h_X_test.columns.astype(str)
h_X_test = h_X_test[h_X.columns.astype(str)]
h_X_test_scaled = encoder.transform(h_X_test)
h_X_test_scaled = pd.DataFrame(h_X_test_scaled, columns=h_X_test.columns)

In [None]:
submission_pred = h_best_rfr.predict(h_X_test_scaled)

In [None]:
fig = plt.figure(figsize=(10,7))
plt.hist((submission_pred))
plt.xlabel("Pawpularity Score")
plt.ylabel("number of individuals")
plt.title("Distribution of predicted submission results", 
          fontsize=20, fontweight='bold')
plt.show()

In [None]:
submission_df["Pawpularity"] = (submission_pred)
submission_df = submission_df[["Id","Pawpularity"]]
submission_df.head()

In [None]:
submission_df.to_csv("submission.csv", sep=",", index=False)