In [1]:
import torch
from torch import optim, nn
from torchvision import models, transforms
model = models.vgg16(pretrained=True)

In [2]:
!pip install torchvision



In [3]:
class FeatureExtractor(nn.Module):
  def __init__(self, model):
    super(FeatureExtractor, self).__init__()
		# Extract VGG-16 Feature Layers
    self.features = list(model.features)
    self.features = nn.Sequential(*self.features)
		# Extract VGG-16 Average Pooling Layer
    self.pooling = model.avgpool
		# Convert the image into one-dimensional vector
    self.flatten = nn.Flatten()
		# Extract the first part of fully-connected layer from VGG16
    self.fc = model.classifier[0]
  
  def forward(self, x):
		# It will take the input 'x' until it returns the feature vector called 'out'
    out = self.features(x)
    out = self.pooling(out)
    out = self.flatten(out)
    out = self.fc(out) 
    return out 

# Initialize the model
model = models.vgg16(pretrained=True)
new_model = FeatureExtractor(model)

# Change the device to GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
new_model = new_model.to(device)

In [4]:
!pip install opencv-python



In [None]:
from tqdm import tqdm
import numpy as np
import os
import cv2
# from google.colab.patches import cv2_imshow

# Transform the image, so it becomes readable with the model
transform = transforms.Compose([
  transforms.ToPILImage(),
  transforms.CenterCrop(512),
  transforms.Resize(448),
  transforms.ToTensor()                              
])

currentPath = os.getcwd()

with open('image_samples.txt') as f:
    lines = f.readlines()

# Will contain the feature
features = []
image_id = []
count = 0
for line in lines:
    folder = line.split("/")[-2]
    image_name = (line.split("/")[-1]).strip()
    joint = 'images' + '/'+ folder
    newJoint = os.path.join(currentPath, joint, image_name)
    count = count + 1
    if count%20 == 0:
        print(count)
    image_id.append(image_name)
    img = cv2.imread(newJoint)
    img = transform(img)
    
    img = img.reshape(1, 3, 448, 448)
    img = img.to(device)
   
    with torch.no_grad():
    
      feature = new_model(img)
   
    features.append(feature.cpu().detach().numpy().reshape(-1))

# Convert to NumPy Array
features = np.array(features)
print(features.shape)
print(features[0])

In [None]:
from numpy.ma.core import argmin
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
from keras.preprocessing import image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Initialize the model
# model = KMeans(n_clusters=6, random_state=42)
# num_clusters = 133
min_num_clusters = 15
max_num_clusters = 50
silhouette_scores = []
davies_bouldin_scores = []
# print("Silhouette Scores")

print("k \t Silhouette \t DB")
for num_clusters in range(min_num_clusters, max_num_clusters):
  print()
  model = KMeans(n_clusters=num_clusters)

  # Fit the data into the model
  model.fit(features)

  # Extract the labels
  labels = model.labels_
  
  # Calculate Silhouette score
  s = metrics.silhouette_score(features, labels, metric='euclidean')
  silhouette_scores.append(s)
  d = metrics.davies_bouldin_score(features, labels)
  davies_bouldin_scores.append(d)
  print(num_clusters, "\t", s, "\t", d)


silhouette_scores = np.array(silhouette_scores)
davies_bouldin_scores = np.array(davies_bouldin_scores)

In [None]:
from yellowbrick.cluster import KElbowVisualizer

model = KMeans()
visualizer = KElbowVisualizer(model, k=(min_num_clusters, max_num_clusters))

visualizer.fit(features)    # Fit the data to the visualizer
visualizer.show()           # Finalize and render the figure

In [None]:
best_num_clusters = 34
best_num_clusters_index = best_num_clusters - min_num_clusters  

print("Optimal no. of clusters is", best_num_clusters, "by Elbow Method")


# best_num_clusters_index = np.argmax(silhouette_scores)
# best_num_clusters = min_num_clusters + best_num_clusters_index
# print("Silhouette Scores")
# for i in range(len(silhouette_scores)):
#   print(i, ":", silhouette_scores[i])

# print("Optimal no. of clusters is", best_num_clusters, "with Silhouette score of", silhouette_scores[best_num_clusters_index])


# best_num_clusters_index = np.argmax(davies_bouldin_scores)
# best_num_clusters = min_num_clusters + best_num_clusters_index 
# # print("Silhouette Scores")
# # for i in range(len(silhouette_scores)):
# #   print(i, ":", silhouette_scores[i])

# print("Optimal no. of clusters is", best_num_clusters, "with David Bouldin score of", davies_bouldin_scores[best_num_clusters_index])

In [None]:
num_clusters = best_num_clusters
model = KMeans(n_clusters=num_clusters)
model.fit(features)
labels = model.labels_
labels = np.array(labels)

for k in range(0, 7):
  num = 0
  print("\nCluster ", k)
  numImages = len(np.where(labels == k)[0])
  print(numImages)
  numRows = int(numImages / 10) + 1
  numCols = 10
  fig = plt.figure(k, figsize=(numCols * 5.0  , numRows * 5.0))
  print(numRows, numCols)
  for i in range(labels.shape[0]):
    if(labels[i] == k):
      imagePath = os.path.join(path + '/', str(images[i]))
      img = image.load_img(imagePath)
      plt.subplot(numRows, numCols, num+1)
      plt.xticks([])
      plt.yticks([])
      plt.imshow(img)
      num += 1
  plt.show()
# print(labels) # [4 3 3 ... 0 0 0]

# Using Pre-trained ResNet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications
from sklearn.metrics import pairwise_distances
import requests
from PIL import Image
import pickle
from datetime import datetime
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display, Image

In [None]:
img_width, img_height = 224, 224 #Can be changed to what we want

train_data_dir = "/data "

nb_train_samples = 801 #depends on number of samples we taking 
epochs = 50 #parameter to tune
batch_size = 1 #parameter to tune

def extract_features():
    Itemcodes = []
    datagen = ImageDataGenerator(rescale=1. / 255)
    model = applications.ResNet50(include_top=False, weights='imagenet')
    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
    for i in generator.filenames:
        Itemcodes.append(i[(i.find("/")+1):i.find(".")])
    extracted_features = model.predict_generator(generator, nb_train_samples // batch_size)
    extracted_features = extracted_features.reshape((801, 10035)) #1 is nb_train_samples #2 number of vectors we wish to represent as
    np.save(open('./x.npy', 'wb'), extracted_features) #save as numpy array to save computational time
    np.save(open('./x_ids.npy', 'wb'), np.array(Itemcodes)) #save as numpy array ro save computations time
    
a = datetime.now()
extract_features()
print("Time taken in feature extraction", datetime.now()-a)

In [None]:
#load all the data to be worked upon
extracted_features = np.load('/kaggle/working/x.npy')
Productids = np.load('/kaggle/working/xtids.npy')
#data_copy = data.copy()
#df_Productids = list(data['ProductId'])
Productids = list(Productids)

In [None]:
#Get recommendation of similar items

def get_similar_products_cnn(product_id, num_results):
    doc_id = Productids.index(product_id)
    pairwise_dist = pairwise_distances(extracted_features, extracted_features[doc_id].reshape(1,-1))
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    pdists  = np.sort(pairwise_dist.flatten())[0:num_results]
    print("="*20, "input product image", "="*20)
    ip_row = men[['ImageURL','ProductTitle']].loc[men['ProductId']==int(Productids[indices[0]])] #change for our use
    #print(ip_row.head())
    for indx, row in ip_row.iterrows():
        display(Image(url=row['ImageURL'], width = 224, height = 224,embed=True)) #change for our use
        print('Product Title: ', row['ProductTitle']) #change for our use
    print("\n","="*20, "Recommended products", "="*20)
    for i in range(1,len(indices)):
        rows = men[['ImageURL','ProductTitle']].loc[men['ProductId']==int(Productids[indices[i]])] #change for our use
        for indx, row in rows.iterrows():
            display(Image(url=row['ImageURL'], width = 224, height = 224,embed=True)) #change for our use [width and height need to be ]
            print('Product Title: ', row['ProductTitle']) #change for our use
            print('Euclidean Distance from input image:', pdists[i])

get_similar_products_cnn('13683', 5)

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
def get_similar_recommendation_euclidean(article_id, loop_number):
    index = image_id.index(article_id)
    pairwise_dist = pairwise_distances(features[index].reshape(1, -1), features)
    indices = np.argsort(pairwise_dist.flatten())
    print(indices)
    print(pairwise_dist)
    pdist = np.sort(pairwise_dist.flatten())
    print(pdist)
    for i in range(loop_number):
        print(image_id[indices[i]])

In [None]:
get_similar_recommendation_euclidean("0509937020.jpg", 10)

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim_products = linear_kernel(features, features)

In [None]:
index_lol = image_id.index("0509937020.jpg")

In [None]:
sim_scores = list(enumerate(cosine_sim_products[881]))

# Sort the articles based on the similarity scores
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

# Get the scores of the 10 most similar articles
sim_scores = sim_scores[:12]

# Get the article indices
article_indices = [i[0] for i in sim_scores]

In [None]:
for each in range(len(article_indices)):
    cv2.show(image_id[article_indices[each]])

In [None]:
image_id[847]

In [29]:
from tqdm import tqdm
import numpy as np
import os
import cv2
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim_products = linear_kernel(features, features)

def get_recommendation_cosine(file):
    index_lol = image_id.index(file)
    sim_scores = list(enumerate(cosine_sim_products[index_lol]))

# Sort the articles based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

 # Get the scores of the 10 most similar articles
    sim_scores = sim_scores[:12]

 # Get the article indices
    article_indices = [i[0] for i in sim_scores]
    empty_list = []
    for each in range(len(article_indices)):
        empty_list.append(image_id[article_indices[each]])
        
    return empty_list

In [None]:
get_recommendation_cosine('0751411001.jpg')

In [None]:
sort the dataframe
get customer
put in list
get article
get top 12 cosine recommendation for the article 

In [12]:
import numpy as np 
import pandas as pd 

In [14]:
df = pd.read_csv('transactions_train.csv')
customers = pd.read_csv("customers.csv")
articles = pd.read_csv("articles.csv")

In [15]:
all_customers = df['customer_id'].unique().tolist()

In [16]:
df['t_dat']

0           2018-09-20
1           2018-09-20
2           2018-09-20
3           2018-09-20
4           2018-09-20
               ...    
31788319    2020-09-22
31788320    2020-09-22
31788321    2020-09-22
31788322    2020-09-22
31788323    2020-09-22
Name: t_dat, Length: 31788324, dtype: object

In [17]:
def filter_rows_by_values(df, col, values):
    return df[df[col].isin(values)==True]

In [18]:
np.random.seed(44)
sampled_customers = np.random.choice(all_customers, size=int(1e5), replace=False, )
print(len(sampled_customers))
print(len(set(sampled_customers)))

100000
100000


In [19]:
df_sampled = df.loc[df['customer_id'].isin(sampled_customers)]
print(len(df_sampled.customer_id.unique()))

100000


In [20]:
len(df_sampled.customer_id.unique())

100000

In [21]:
def query_based_on_date(df, start_date='2020-06-22', end_date='2020-09-22'):
    x = df[(df['t_dat'] > start_date) & (df['t_dat'] <= end_date)]
    return x

In [22]:
start_date = '2020-01-01'
x = query_based_on_date(df_sampled, start_date)
train_x = query_based_on_date(x, start_date, '2020-07-22')
test_x = query_based_on_date(x, '2020-07-22')

In [23]:
train_x = train_x.sort_values(by=['t_dat'], ascending=False)

In [24]:
train_x

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
29361347,2020-07-22,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,720125001,0.033881,2
29330655,2020-07-22,5470467fa548f1b13218d9eb7777056048ba58cb1f8719...,884319008,0.025407,2
29330604,2020-07-22,541788fff1705f790c18b41ab320967955210818d3ce01...,923727002,0.067780,2
29330603,2020-07-22,541788fff1705f790c18b41ab320967955210818d3ce01...,923727002,0.067780,2
29330602,2020-07-22,541788fff1705f790c18b41ab320967955210818d3ce01...,923727002,0.067780,2
...,...,...,...,...,...
20851312,2020-01-02,ae4372724d88950bbf55c223dbc70ebbfb325bdbeee0e9...,753737015,0.042356,1
20851311,2020-01-02,ae4372724d88950bbf55c223dbc70ebbfb325bdbeee0e9...,689389053,0.025407,1
20851310,2020-01-02,ae4372724d88950bbf55c223dbc70ebbfb325bdbeee0e9...,816598002,0.031763,1
20851309,2020-01-02,ae4372724d88950bbf55c223dbc70ebbfb325bdbeee0e9...,673677002,0.025407,1


In [25]:
from collections import defaultdict
unique_customer_list = []
with open('image_samples.txt') as f:
    lines = f.readlines()
image_id = []
count = 0
for line in lines:
    image_name = (line.split("/")[-1]).strip()
    image_id.append(image_name.split(".")[0])
top_n = defaultdict(list)

import numpy as np
features = np.load('features.npy')
features = np.load('text_features_full.npy')

In [30]:
for index, row in train_x.iterrows():
    article_id = "0" + str(row['article_id'])
    if article_id in image_id:
        customer_id = row['customer_id']
        if customer_id not in unique_customer_list:
            unique_customer_list.append(customer_id)
            recommendations = get_recommendation_cosine(article_id)
            for each_recom in recommendations:
                top_n[customer_id].append((each_recom, 1))
        

In [31]:
top_n

defaultdict(list,
            {'5c1ae2120e442a165a000c069fbee520bd4a0b2ec004276b31e2a34386100783': [('0751411001',
               1),
              ('0416175013', 1),
              ('0401044002', 1),
              ('0845918003', 1),
              ('0619869006', 1),
              ('0572582009', 1),
              ('0899749004', 1),
              ('0688430008', 1),
              ('0676255002', 1),
              ('0828841001', 1),
              ('0741191002', 1),
              ('0824548006', 1)],
             '5b8f2e2e4391ad410ed41f5c4ecd1febdd7606109e306eb6e113538af8fb35bd': [('0751411001',
               1),
              ('0416175013', 1),
              ('0401044002', 1),
              ('0845918003', 1),
              ('0619869006', 1),
              ('0572582009', 1),
              ('0899749004', 1),
              ('0688430008', 1),
              ('0676255002', 1),
              ('0828841001', 1),
              ('0741191002', 1),
              ('0824548006', 1)],
             '55f69e7c

In [32]:
actual = defaultdict(list)
test_x = query_based_on_date(x, '2020-07-22')
unique_customer = test_x.customer_id.unique()
for customer in unique_customer:
    new_x = test_x.loc[test_x['customer_id'] == customer]
    for index, row in new_x.iterrows():
        art = "0" + str(row["article_id"])
        actual[customer].append(art)
actual

defaultdict(list,
            {'0010bb1c4a9c39adb234a90c487fd472c843e3523f4b8d823b2ed87e6287366a': ['0816166005',
              '0816166011',
              '0816166004'],
             '00761aefe07a2dd6ca110c99f3856ede55f20ff00ce754bf24c422c6746d05f3': ['0711053005',
              '0806778001',
              '0898703001',
              '0898703001',
              '0859743002',
              '0456163060',
              '0760084003',
              '0456163086'],
             '0090c0bb62e94f069ee7892db19d2fb546ff3c7cf7b79d6bae8fde9e9b88b2d0': ['0777018001',
              '0777018003',
              '0881691003',
              '0888331010',
              '0874113004',
              '0888331011',
              '0826211002',
              '0824194002',
              '0872537001',
              '0872537004',
              '0872537001'],
             '01abbe1ee7904e636578408bb5f836056150058b53e7125eff914562f61cd2ea': ['0860322002',
              '0888295001',
              '0772324005',
       

In [33]:
def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    # remove this case in advance
    # if not actual:
    #     return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [None]:
mapk(good_actual, good_predict, k=12) * 350

In [37]:
ks =[1,5,10,12,15]
maps = []
for k in ks:
    valz = mapk(good_actual, good_predict, k=k)
    maps.append(valz)
print(maps)

[0.0, 3.829216925138809e-06, 1.9638412516069034e-05, 1.873871230562787e-05, 1.7997765843831322e-05]


In [None]:
import matplotlib.pyplot as pt

ks =[1,5,10,12,15]
maps = [0.0, 0.011185132004671376, 0.0115018411093880625, 0.012275019159488748, 0.012178097673072116]

pt.plot(ks, maps, color='blue', label='image+text features')
pt.xlabel("k")
pt.ylabel("map@k")
maps = [0.0, 0.010185132004671376, 0.010018411093880625, 0.010275019159488748, 0.010178097673072116]
pt.plot(ks, maps, color='red', label='image features')
pt.legend()
pt.grid()


In [None]:
import matplotlib.pyplot as pt
pt.plot(ks, maps)
pt.xlabel("k")
pt.ylabel("map@k")
pt.grid()

In [None]:
top_n["5493920648ed2e7b6da4a64fd1dbc2bb316921086a4dda1ce2181794f4cb8626"]

In [None]:
actual["5493920648ed2e7b6da4a64fd1dbc2bb316921086a4dda1ce2181794f4cb8626"]

In [35]:
good_predict=[]
for key, value in top_n.items():
    good_predict.append([x[0] for x in value ])

In [36]:
good_actual=[]
for key, value in actual.items():
    good_actual.append([x for x in value ])