> In this kernel, we'll use transfer learning to achieve an accuracy of about 96% on the Dogs vs. Cats classification challenge.

**A [link](https://medium.com/@risingdeveloper/transfer-learning-on-image-classification-using-keras-and-kaggle-kernels-c76d3b030649) to my Medium post with a full explanation of this notebook and of transfer learning in general.**

In [None]:
# Show what the input directory contains before loading anything.
import os
import shutil

print(os.listdir("./input"))


The training data contains 25,000 images of dogs and cats. We are going to sample only a small portion of it for training because of the memory (RAM) limits on Kaggle kernels. Because the sample is small, we will use data augmentation to reduce overfitting.

In [None]:
import random
import gc   #Gabage collector for cleaning deleted data from memory

train_dir = './input/train'
test_dir = './input/test'

# Number of images taken from each class. Only a small sample of the
# training set is loaded to stay within the kernel's memory limits.
n_per_class = 100

# List the training directory once and split the file names by class.
# Training file names are expected to contain either 'dog' or 'cat'.
train_files = os.listdir(train_dir)
train_dogs = [os.path.join(train_dir, name) for name in train_files if 'dog' in name]  # dog images
train_cats = [os.path.join(train_dir, name) for name in train_files if 'cat' in name]  # cat images

test_imgs = [os.path.join(test_dir, name) for name in os.listdir(test_dir)]  # test images

# Keep the first n_per_class images of each class, then shuffle so that the
# two classes are interleaved.
train_imgs = train_dogs[:n_per_class] + train_cats[:n_per_class]
random.shuffle(train_imgs)

# Drop the intermediate lists that are no longer needed.
del train_files
del train_dogs
del train_cats
gc.collect()   # collect garbage to free memory

In [None]:
# Display the sampled and shuffled list of training image paths
train_imgs

In [None]:
#Import some packages to use
import cv2
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
%matplotlib inline 

In [None]:
#Image dimensions that every image is resized to by read_and_process_image
#we are using coloured images.
nrows = 150
ncolumns = 150
#NOTE(review): images are always read with cv2.IMREAD_COLOR in read_and_process_image,
#so changing `channels` alone does not produce grayscale input -- confirm before relying on it.
channels = 3  #change to 1 if you want to use grayscale image


#A function to read and process the images to an acceptable format for our model
def read_and_process_image(list_of_images):
    """
    Load, resize and label a list of image files.

    Each image is read in colour with OpenCV and resized to ``nrows`` rows by
    ``ncolumns`` columns. The label is derived from the file name: names
    containing 'dog' get 1, names containing 'cat' get 0. An image whose name
    contains neither is still added to ``X`` but gets no label, so ``y`` can
    be shorter than ``X`` (e.g. for an unlabelled test set).

    Args:
        list_of_images: iterable of image file paths.

    Returns:
        Two lists ``(X, y)``:
            X is a list of resized images
            y is a list of integer labels (1 = dog, 0 = cat)

    Raises:
        ValueError: if an image file cannot be read.
    """
    X = []  # images
    y = []  # labels

    for image_path in list_of_images:
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising
            raise ValueError("Could not read image: {}".format(image_path))

        # cv2.resize takes the target size as (width, height), i.e. (columns, rows)
        X.append(cv2.resize(image, (ncolumns, nrows), interpolation=cv2.INTER_CUBIC))

        # Look at the file name only, so a directory such as 'dogs-vs-cats'
        # in the path cannot influence the label.
        file_name = os.path.basename(image_path)
        if 'dog' in file_name:
            y.append(1)
        elif 'cat' in file_name:
            y.append(0)

    return X, y

In [None]:
#Load the sampled training images and derive their labels from the file names
X, y = read_and_process_image(train_imgs)

In [None]:
#Lets view some of the pics
plt.figure(figsize=(20,10))
columns = 5
for i in range(min(columns, len(X))):  # never index past the loaded images
    # subplot needs integer grid sizes, so use floor division
    plt.subplot(5 // columns + 1, columns, i + 1)
    # OpenCV loads images in BGR order; reverse the channel axis so that
    # matplotlib (which expects RGB) shows the true colours
    plt.imshow(X[i][..., ::-1])

In [None]:
#The path list is no longer needed once the images are loaded
del train_imgs
gc.collect()
#Convert list to numpy array
X = np.array(X)
y = np.array(y)

#Lets plot the label to be sure we just have two class
#Pass the labels by keyword: newer seaborn versions treat the first positional argument as `data`
sns.countplot(x=y)
plt.title('Labels for Cats and Dogs')

In [None]:
#Sanity check: print the shape of the image array and of the label array
print("Shape of train images is:", X.shape)
print("Shape of labels is:", y.shape)

In [None]:
#Lets split the data into a training set (80%) and a validation set (20%)
from sklearn.model_selection import train_test_split

#random_state is fixed so that the split is the same on every run
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=2)

In [None]:
#clear memory: the full arrays are no longer needed after the split
del X
del y
gc.collect()

#get the length of the train and validation data
ntrain = len(X_train)
nval = len(X_val)

#We use a batch size of 2 here. Note: batch sizes are usually chosen as a power of 2 (2, 4, 8, 16, 32, 64, ...)
batch_size = 2


Now let's create our model.

In [None]:
from keras.applications import InceptionResNetV2

# Pre-trained convolutional base without its ImageNet classification head.
# The spatial input size reuses the image dimensions declared earlier so the
# model and the image loader cannot drift apart; the channel count stays at 3
# because the ImageNet weights are defined for 3-channel input.
conv_base = InceptionResNetV2(weights='imagenet', include_top=False, input_shape=(nrows, ncolumns, 3))

In [None]:
#Print the layers of the pre-trained convolutional base
conv_base.summary()

In [None]:
from keras import layers
from keras import models

# Full classifier: the pre-trained convolutional base followed by a small
# fully connected head.
model = models.Sequential([
    conv_base,
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dense(1, activation='sigmoid'),  # one sigmoid unit is enough for two classes
])

In [None]:
#Lets see our model: convolutional base plus the new dense layers
model.summary()

In [None]:
#Freeze the convolutional base so that only the newly added dense layers are trained.
#The two printed counts show how many weight tensors are trainable before and after freezing.
print('Number of trainable weights before freezing the conv base:', len(model.trainable_weights))
conv_base.trainable = False
print('Number of trainable weights after freezing the conv base:', len(model.trainable_weights))


In [None]:
#We'll use the RMSprop optimizer with a learning rate of 2e-5 (0.00002)
#We'll use binary_crossentropy loss because its a binary classification
from keras import optimizers

#The metric is registered as 'acc'; the plotting code reads it back from the history under 'acc' / 'val_acc'
model.compile(loss='binary_crossentropy', optimizer=optimizers.RMSprop(lr=2e-5), metrics=['acc'])

In [None]:
#Augmentation setup: random transformations are applied to the training
#images only, which helps against overfitting on such a small dataset
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img

#Random transformations used for the training images
augmentation_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

#Training data: scale pixel values to the range 0..1 and augment
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

#Validation data is only rescaled, never augmented
val_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
#Create the image generators; each yields batches of batch_size images with their labels
train_generator = train_datagen.flow(X_train, y_train,batch_size=batch_size)
val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size)

In [None]:
#The training part
#We train for 20 epochs; each epoch runs ntrain // batch_size steps,
#and validation runs nval // batch_size steps
history = model.fit_generator(train_generator,
                              steps_per_epoch=ntrain // batch_size,
                              epochs=20,
                              validation_data=val_generator,
                              validation_steps=nval // batch_size)

In [None]:
#Save the model: first the weights only, then the whole model
#NOTE: 'wieghts' is misspelled in the file name; it is kept as is because anything loading the file must use the same name
model.save_weights('model_wieghts-1000.h5')
model.save('model_keras-1000.h5')

In [None]:
#Plot the training and validation curves
import matplotlib.pyplot as plt

#Pull the per-epoch metrics out of the history object
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)

epochs = range(1, len(acc) + 1)

#One figure per metric: (title, ((values, line style, legend label), ...))
curves = (
    ('Training and Validation accurarcy',
     ((acc, 'b', 'Training accurarcy'), (val_acc, 'r', 'Validation accurarcy'))),
    ('Training and Validation loss',
     ((loss, 'b', 'Training loss'), (val_loss, 'r', 'Validation loss'))),
)

for index, (title, series) in enumerate(curves):
    if index:
        #the first pair of curves goes on the current figure, later ones get a new figure
        plt.figure()
    for values, colour, label in series:
        plt.plot(epochs, values, colour, label=label)
    plt.title(title)
    plt.legend()

plt.show()


Let's create a function that makes our plots look smoother and cleaner.

In [None]:
def smooth_plot(points, factor=0.7):
    """
    Smooth a sequence of values with an exponential moving average.

    The first value is kept unchanged. Every following value is blended with
    the previous smoothed value: ``previous * factor + point * (1 - factor)``.

    Args:
        points: iterable of numbers, e.g. one metric value per epoch.
        factor: weight given to the previous smoothed value.

    Returns:
        A new list with one smoothed value per input value.
    """
    smoothed = []
    for value in points:
        if not smoothed:
            # nothing to blend with yet: the first value passes through as is
            smoothed.append(value)
            continue
        smoothed.append(smoothed[-1] * factor + value * (1 - factor))
    return smoothed

In [None]:
#Plot the smoothed training and validation accuracy curves
plt.plot(epochs, smooth_plot(acc), 'b', label='Training accurarcy')
plt.plot(epochs, smooth_plot(val_acc), 'r', label='Validation accurarcy')
plt.title('Training and Validation accurarcy')
plt.legend()
plt.show()


In [None]:
#Now lets predict on 10 randomly chosen images of the test set (the list is shuffled first)
random.shuffle(test_imgs) 
X_test, y_test = read_and_process_image(test_imgs[0:10]) #Labels come from the file names; y_test stays empty if no name contains 'dog' or 'cat'
x = np.array(X_test)
y = np.array(y_test)
test_datagen = ImageDataGenerator(rescale=1./255)  #only rescale, no augmentation

In [None]:
# Predict every image of the prepared test sample and show it together with
# the predicted class and its true label.
i = 0
columns = 5
n_images = len(x)
rows = -(-n_images // columns)  # ceiling division: subplot needs an integer row count
text_labels = []
plt.figure(figsize=(30,20))
for batch, by in test_datagen.flow(x,y ,batch_size=1):
    pred = model.predict(batch)
    print(pred)
    print(by)
    if pred[0][0] > 0.5:
        text_labels.append('dog')
    else:
        text_labels.append('cat')
    plt.subplot(rows, columns, i + 1)
    # flow() shuffles by default, so the label that belongs to this image is
    # the one delivered with the batch (by[0]), not y[i]
    plt.title('This is a ' + text_labels[i] + " " + str(by[0]))
    # images were loaded by OpenCV in BGR order; reverse the channels for matplotlib (RGB)
    imgplot = plt.imshow(batch[0][..., ::-1])
    i += 1
    # the generator yields batches forever; stop after one pass over the sample
    if i >= n_images:
        break
plt.show()

A [link](https://medium.com/@risingdeveloper/transfer-learning-on-image-classification-using-keras-and-kaggle-kernels-c76d3b030649) to my Medium post with a full explanation of the code and of transfer learning in general.

In [None]:
#Estimate the accuracy on a random sample of up to n_samples test images
n_samples = 500
random.shuffle(test_imgs) 
X_test, y_test = read_and_process_image(test_imgs[0:n_samples]) #labels are derived from the file names
x = np.array(X_test)
y = np.array(y_test)
n_eval = len(x)  #can be smaller than n_samples when the test set has fewer images
positives = 0    #number of correct predictions
cont = 0         #number of images evaluated so far
for batch, by in test_datagen.flow(x,y ,batch_size=1):
    pred = model.predict(batch)
    #Same decision rule as the prediction grid: above 0.5 is dog (1), otherwise cat (0)
    predicted_label = 1 if pred[0][0] > 0.5 else 0
    if predicted_label == by[0]:
        positives += 1
    cont += 1
    print(pred)
    print(by)
    print("------------------")
    #flow() yields batches forever; stop after exactly one pass so no image is counted twice
    if cont >= n_eval:
        break
#Divide by the number of images actually evaluated, not by the requested sample size
accuracy = positives/n_eval
print(accuracy)

In [20]:
import pandas as pd

#Load the cluster-page evaluation set and display it
df = pd.read_parquet("./evaluation_cluster_pages__NEW.parquet")
#Scratch experiments, currently disabled:
#df.drop(df.index, inplace=True)
#df = df.loc[df['cluster_label'] == "products"]["url"]
#df = df.append({'category': 1 ,'cluster_label':"product","domain":"sadasd","referring_url":"sad","text":"sada","url":"sadads"}, ignore_index=True)
#df= df.loc[73,"text"]
df

Unnamed: 0,category,cluster_label,domain,referring_url,text,url
0,1,product,https://www.helliot-bikes.eu/shop/mountain-bik...,https://www.helliot-bikes.eu/shop/mountain-bik...,\nFacebookYoutubeTwitter Svenska Svedese sv Po...,https://www.helliot-bikes.eu/shop/mountain-bik...
1,1,product,https://www.bikester.it/cube-attain-pro-disc-b...,https://www.bikester.it/cube-attain-pro-disc-b...,Eccellente4.57/5.00Certificato dal 19.05.2019×...,https://www.bikester.it/cube-attain-pro-disc-b...
2,1,product,https://www.giordanoshop.com/bicicletta-mounta...,https://www.giordanoshop.com/bicicletta-mounta...,JavaScript sembra essere disabilitato nel tuo...,https://www.giordanoshop.com/bicicletta-mounta...
3,0,product,https://www.ebay.it/i/223438119514?chn=ps,https://www.ebay.it/i/223438119514?chn=ps,Passa al contenuto principaleLogo eBayScegli ...,https://www.ebay.it/i/223438119514?chn=ps
4,1,product,https://www.giordanoshop.com/bicicletta-freest...,https://www.giordanoshop.com/bicicletta-freest...,JavaScript sembra essere disabilitato nel tuo...,https://www.giordanoshop.com/bicicletta-freest...
5,0,product,https://www.ebay.it/p/Cassetto-per-Registrator...,https://www.ebay.it/p/Cassetto-per-Registrator...,Passa al contenuto principaleLogo eBayScegli ...,https://www.ebay.it/p/Cassetto-per-Registrator...
6,0,product,https://www.ebay.it/p/Bussola-Chrome-Vanadium-...,https://www.ebay.it/p/Bussola-Chrome-Vanadium-...,Passa al contenuto principaleLogo eBayScegli ...,https://www.ebay.it/p/Bussola-Chrome-Vanadium-...
7,1,product,https://nordicgrizzly.com/product/stock-only-1...,https://nordicgrizzly.com/product/stock-only-1...,My account\nCheckout All Categories\nUncatego...,https://nordicgrizzly.com/product/stock-only-1...
8,0,product,https://www.imballaggiper.it/273-scatola-ecomm...,https://www.imballaggiper.it/273-scatola-ecomm...,Chiamaci 800910384 Scrivici info@imballaggipe...,https://www.imballaggiper.it/273-scatola-ecomm...
9,1,product,https://www.ebay.it/i/143164999853?chn=ps,https://www.ebay.it/i/143164999853?chn=ps,Passa al contenuto principaleLogo eBayScegli ...,https://www.ebay.it/i/143164999853?chn=ps


In [None]:
def get_line_generator():
    """
    Print the URL and category of every entry in the test-set file.

    Reads ``./test_set_cluster_pages.txt``, which is expected to contain one
    tab-separated ``url<TAB>category`` pair per line, and prints each pair.
    Blank lines are skipped. Despite its name, the function neither yields
    nor returns anything.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    # The context manager closes the file even when a line fails to parse
    with open("./test_set_cluster_pages.txt") as testfile:
        for line in testfile:
            record = line.strip()
            if not record:
                continue  # tolerate empty lines, e.g. at the end of the file
            url, category = record.split("\t")
            print(url, category)
    

In [24]:
from bs4 import BeautifulSoup
import urllib
import re

def get_text_from_url(url):
    """
    Download a web page and return the text of its body.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted, and every run of two or more whitespace characters is
    collapsed into a single space. If the document has no ``<body>`` element,
    the text of the whole document is used instead.

    Args:
        url: address of the page to download.

    Returns:
        The extracted text as a string.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly.
    import urllib.request

    # Close the HTTP response as soon as the document has been parsed
    with urllib.request.urlopen(url) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed
        soup = BeautifulSoup(response, "html.parser")

    root = soup.find('body')
    if root is None:
        root = soup  # fragment without <body>: fall back to the whole document

    # Drop elements whose content is code or styling rather than visible text
    for tag in root.find_all(['script', 'style']):
        tag.extract()

    return re.sub(r"\s\s+", " ", root.get_text())

#Try the extractor on a sample article
print(get_text_from_url('https://www.androidworld.it/2015/08/24/galaxy-s7-snapdragon-820-329118/'))

 smart mobile android AndroidWorld Recensioni
Schede tecniche
Smartphone
Smartwatch
Tablet
App
Giochi
Guide
Video
Forum
📸 3 Hardware Samsung starebbe già sperimentando Snapdragon 820, ovviamente per Galaxy S7 Nicola Ligas 24/08/2015 ore 16:02 - Aggiornato il 24/08/2015 ore 17:01
Il divorzio tra Samsung e Qualcomm che ha portato all’abbandono dello Snapdragon 810 in favore dell’Exynos 7420 sui top di gamma dell’azienda coreana di quest’anno, sarebbe stata solo una separazione consensuale, per poi riunirsi nel prossimo futuro.Samsung starebbe infatti già sperimentando a fondo lo Snapdragon 820 per valutarne l’impiego o meno nel Galaxy S7 ed in chissà quanti altri smartphone.Al momento non c’è niente di definitivo, ma è chiaro che per Qualcomm sarebbe come una prova di affidabilità riuscire a convincere uno dei suoi più importanti clienti ad impiegare il nuovo SoC, cosa che potrebbe dare una “botta di fiducia” anche agli altri.LEGGI ANCHE: Sensore ISOCELL per Galaxy Note 5 ed S6 edge+Lo S