In [25]:
import torch
from torch import optim, nn
from torchvision import models, transforms
model = models.vgg16(pretrained=True)

In [26]:
!pip install torchvision

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
class FeatureExtractor(nn.Module):
  def __init__(self, model):
    super(FeatureExtractor, self).__init__()
		# Extract VGG-16 Feature Layers
    self.features = list(model.features)
    self.features = nn.Sequential(*self.features)
		# Extract VGG-16 Average Pooling Layer
    self.pooling = model.avgpool
		# Convert the image into one-dimensional vector
    self.flatten = nn.Flatten()
		# Extract the first part of fully-connected layer from VGG16
    self.fc = model.classifier[0]
  
  def forward(self, x):
		# It will take the input 'x' until it returns the feature vector called 'out'
    out = self.features(x)
    out = self.pooling(out)
    out = self.flatten(out)
    out = self.fc(out) 
    return out 

# Initialize the model
model = models.vgg16(pretrained=True)
new_model = FeatureExtractor(model)

# Change the device to GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
new_model = new_model.to(device)

In [28]:
!pip install opencv-python

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
from os.path import join
import os

root_dir = '/nethome/sannavajjala6/projects/ml_project/images'
image_paths = []
for root, dirs, files in os.walk(root_dir):
    files = [join(root, x) for x in files]
    image_paths.extend(files)

print(len(image_paths))

105101


In [30]:
import pandas as pd
import numpy as np

df = pd.read_csv('transactions_train.csv')
customers = pd.read_csv("customers.csv")
articles = pd.read_csv("articles.csv")

In [31]:
all_customers = df['customer_id'].unique().tolist()

In [32]:
def filter_rows_by_values(df, col, values):
    return df[df[col].isin(values)==True]

In [33]:
np.random.seed(44)
sampled_customers = np.random.choice(all_customers, size=int(1e5), replace=False, )
print(len(sampled_customers))
print(len(set(sampled_customers)))

100000
100000


In [34]:
df_sampled = df.loc[df['customer_id'].isin(sampled_customers)]
print(len(df_sampled.customer_id.unique()))

100000


In [35]:
def query_based_on_date(df, start_date='2020-06-22', end_date='2020-09-22'):
    x = df[(df['t_dat'] > start_date) & (df['t_dat'] <= end_date)]
    return x

In [36]:
start_date = '2020-01-01'
x = query_based_on_date(df_sampled, start_date)
train_x = query_based_on_date(x, start_date, '2020-07-22')
test_x = query_based_on_date(x, '2020-07-22')

In [37]:
train_x = train_x.sort_values(by=['t_dat'], ascending=False)

In [38]:
product_ids = []
product_ids += list(train_x.article_id.unique())
product_ids += list(test_x.article_id.unique())

print(len(product_ids))

53767


In [44]:
from os.path import join, exists
product_paths = []

for product_id in product_ids:
    product_id = str(product_id)
    path = join('./images', '0' + product_id[:2], '0' + product_id + '.jpg')
    if exists(path):
        product_paths.append(path)

print(len(product_paths))
    
    # break

53683


In [6]:
import random
random.seed(0)

random.shuffle(image_paths)
sample_image_paths = image_paths[:5000]
print(len(sample_image_paths))

5000


In [7]:
# with open('image_samples.txt', 'w') as f:
#     f.writelines([x + '\n' for x in sample_image_paths])
!pwd

/nethome/sannavajjala6/projects/ml_project


In [46]:
sample_image_paths = product_paths

# from os.path import join
# with open('final_image_paths.txt', 'r') as f:
#     sample_image_paths = f.readlines()
#     sample_image_paths = [join('/nethome/sannavajjala6/projects/ml_project/images/', x.rstrip()) for x in sample_image_paths]
    
    
print(len(sample_image_paths))

53683


In [47]:
sample_product_ids = []
for image_path in sample_image_paths:
    sample_product_ids.append(int(image_path.split('/')[-1].split('.')[0]))

print(len(sample_product_ids))
print(len(sample_image_paths))
print(sample_product_ids[:5])
print(sample_image_paths[:5])

53683
53683
[720125001, 884319008, 923727002, 685811007, 685816004]
['./images/072/0720125001.jpg', './images/088/0884319008.jpg', './images/092/0923727002.jpg', './images/068/0685811007.jpg', './images/068/0685816004.jpg']


In [45]:
import pandas as pd
articles = pd.read_csv('articles.csv')
print(articles.columns)

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')


In [48]:
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,5pk regular Placement1,302,Socks,Socks & Tights,1010014,Placement print,9,Black,...,Socks Bin,F,Menswear,3,Menswear,26,Men Underwear,1021,Socks and Tights,Socks in a fine-knit cotton blend with a small...
105538,953763001,953763,SPORT Malaga tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,Loose-fitting sports vest top in ribbed fast-d...
105539,956217002,956217,Cartwheel dress,265,Dress,Garment Full body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"Short, A-line dress in jersey with a round nec..."
105540,957375001,957375,CLAIRE HAIR CLAW,72,Hair clip,Accessories,1010016,Solid,9,Black,...,Small Accessories,D,Divided,2,Divided,52,Divided Accessories,1019,Accessories,Large plastic hair claw.


In [12]:
articles['article_id']

0         108775015
1         108775044
2         108775051
3         110065001
4         110065002
            ...    
105537    953450001
105538    953763001
105539    956217002
105540    957375001
105541    959461001
Name: article_id, Length: 105542, dtype: int64

In [49]:
sample_article_ids = []

articles.loc[articles['article_id'] == sample_product_ids[0], 'detail_desc'].iloc[0]

'Sports tights in fast-drying functional fabric with a wide waistband to hold in and shape the waist. Regular waist with a concealed key pocket in the waistband.'

In [50]:
sample_product_descs = []
for product_id in sample_product_ids:
    sample_product_descs.append(articles.loc[articles['article_id'] == sample_product_ids[0], 'detail_desc'].iloc[0])
print(len(sample_product_descs))

53683


In [51]:
import torch
from transformers import AutoTokenizer, AutoModel

from typing import List, Optional

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased', output_hidden_states=True)

def gen_embeds(sents: List[str], average: bool = True) -> torch.Tensor:
    """
    sents:  List of sentences to generate embeddings for
    average: Return average over sentences

    Returns:
    torch.Tensor of size num_sents x emb_dim if average = True else
                    size num_sents x max_len, emb_dim
    """
    with torch.no_grad():
        inputs = tokenizer(sents, padding=True, return_tensors="pt")
        logits = model(**inputs).last_hidden_state

        if average:
            # Average across the entire sentence
            return torch.mean(logits, dim=1)
        else:
            return logits


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [52]:
sample_embeds = gen_embeds(sample_product_descs)

In [53]:
sample_embeds.shape

torch.Size([53683, 768])

In [54]:
import numpy as np
np.save('text_features_full.npy', sample_embeds.cpu().numpy())

In [None]:

def load_image_and_transform(image_path):
    img = cv2.imread(image_path)
    img = transform(img)
    img = img.reshape(1, 3, 448, 448)
    img = img.to(device)
    return img

results = Parallel(n_jobs=20)(delayed(load_image_and_transform)(image_path) for image_path in tqdm(sample_image_paths))

print(len(results))

In [None]:
from joblib import Parallel, delayed

from tqdm import tqdm
import numpy as np
import os
import cv2
# from google.colab.patches import cv2_imshow

# Transform the image, so it becomes readable with the model
transform = transforms.Compose([
  transforms.ToPILImage(),
  transforms.CenterCrop(512),
  transforms.Resize(448),
  transforms.ToTensor()                              
])



image_list = []
for image_path in tqdm(sample_image_paths):
    img = cv2.imread(image_path)
    img = transform(img)
    img = img.reshape(1, 3, 448, 448)
    image_list.append(img.cpu())

print(len(image_list))

  5%|███▏                                                          | 2793/53683 [05:55<1:27:20,  9.71it/s]

In [None]:
image_tensor = torch.cat(image_list)
print(image_tensor.shape)

In [None]:
# from tqdm import tqdm
# import numpy as np
# import os
# import cv2
# # from google.colab.patches import cv2_imshow

# # Transform the image, so it becomes readable with the model
# transform = transforms.Compose([
#   transforms.ToPILImage(),
#   transforms.CenterCrop(512),
#   transforms.Resize(448),
#   transforms.ToTensor()                              
# ])

# Will contain the feature
features = []
image_id = []
# path = 'images'
# images = os.listdir(path)
# print("Total images:", len(images))
# for image in images:
  # print(image)

# images = sample_image_paths
# Iterate each image

for start_idx in tqdm(range(0, 4900, 100)):
    curr_batch = image_tensor[start_idx:start_idx+100]
    
    with torch.no_grad():
      # Extract the feature from the image
      feature = new_model(curr_batch)

    # Convert to NumPy Array, Reshape it, and save it to features variable
    features.append(feature.cpu().detach())

    
# for i in images:
#     # print (i)
#     image_id.append(i.split('/')[-1])
#     # Set the image path
#     # print("image", i)
#     # print('path:', path)
#     imagePath = i
#     # imagePath = os.path.join(path + '/', str(i))
#     # print(imagePath)
#     # Read the file
#     img = cv2.imread(imagePath)
#     # cv2_imshow(img)
#     # Transform the image
#     img = transform(img)
#     # Reshape the image. PyTorch model reads 4-dimensional tensor
#     # [batch_size, channels, width, height]
#     img = img.reshape(1, 3, 448, 448)
#     img = img.to(device)
#     # We only extract features, so we don't need gradient
#     with torch.no_grad():
#       # Extract the feature from the image
#       feature = new_model(img)
#     # Convert to NumPy Array, Reshape it, and save it to features variable
#     features.append(feature.cpu().detach().numpy().reshape(-1))

# # Convert to NumPy Array
# features = np.array(features)
# print(features.shape)
# print(features[0])

In [None]:
all_features = torch.cat(features)

In [None]:
print(all_features.shape)

In [None]:
import numpy as np
np.save('features_2.npy', all_features.cpu().numpy())

In [None]:
from numpy.ma.core import argmin
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
from keras.preprocessing import image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Initialize the model
# model = KMeans(n_clusters=6, random_state=42)
# num_clusters = 133
min_num_clusters = 15
max_num_clusters = 50
silhouette_scores = []
davies_bouldin_scores = []
# print("Silhouette Scores")

print("k \t Silhouette \t DB")
for num_clusters in range(min_num_clusters, max_num_clusters):
  print()
  model = KMeans(n_clusters=num_clusters)

  # Fit the data into the model
  model.fit(features)

  # Extract the labels
  labels = model.labels_
  
  # Calculate Silhouette score
  s = metrics.silhouette_score(features, labels, metric='euclidean')
  silhouette_scores.append(s)
  d = metrics.davies_bouldin_score(features, labels)
  davies_bouldin_scores.append(d)
  print(num_clusters, "\t", s, "\t", d)


silhouette_scores = np.array(silhouette_scores)
davies_bouldin_scores = np.array(davies_bouldin_scores)

In [None]:
from yellowbrick.cluster import KElbowVisualizer

model = KMeans()
visualizer = KElbowVisualizer(model, k=(min_num_clusters, max_num_clusters))

visualizer.fit(features)    # Fit the data to the visualizer
visualizer.show()           # Finalize and render the figure

In [None]:
best_num_clusters = 34
best_num_clusters_index = best_num_clusters - min_num_clusters  

print("Optimal no. of clusters is", best_num_clusters, "by Elbow Method")


# best_num_clusters_index = np.argmax(silhouette_scores)
# best_num_clusters = min_num_clusters + best_num_clusters_index
# print("Silhouette Scores")
# for i in range(len(silhouette_scores)):
#   print(i, ":", silhouette_scores[i])

# print("Optimal no. of clusters is", best_num_clusters, "with Silhouette score of", silhouette_scores[best_num_clusters_index])


# best_num_clusters_index = np.argmax(davies_bouldin_scores)
# best_num_clusters = min_num_clusters + best_num_clusters_index 
# # print("Silhouette Scores")
# # for i in range(len(silhouette_scores)):
# #   print(i, ":", silhouette_scores[i])

# print("Optimal no. of clusters is", best_num_clusters, "with David Bouldin score of", davies_bouldin_scores[best_num_clusters_index])

In [None]:
num_clusters = best_num_clusters
model = KMeans(n_clusters=num_clusters)
model.fit(features)
labels = model.labels_
labels = np.array(labels)

for k in range(0, 7):
  num = 0
  print("\nCluster ", k)
  numImages = len(np.where(labels == k)[0])
  print(numImages)
  numRows = int(numImages / 10) + 1
  numCols = 10
  fig = plt.figure(k, figsize=(numCols * 5.0  , numRows * 5.0))
  print(numRows, numCols)
  for i in range(labels.shape[0]):
    if(labels[i] == k):
      imagePath = os.path.join(path + '/', str(images[i]))
      img = image.load_img(imagePath)
      plt.subplot(numRows, numCols, num+1)
      plt.xticks([])
      plt.yticks([])
      plt.imshow(img)
      num += 1
  plt.show()
# print(labels) # [4 3 3 ... 0 0 0]

# Using Pre-trained ResNet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications
from sklearn.metrics import pairwise_distances
import requests
from PIL import Image
import pickle
from datetime import datetime
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display, Image

In [None]:
img_width, img_height = 224, 224 #Can be changed to what we want

train_data_dir = "/data "

nb_train_samples = 801 #depends on number of samples we taking 
epochs = 50 #parameter to tune
batch_size = 1 #parameter to tune

def extract_features():
    Itemcodes = []
    datagen = ImageDataGenerator(rescale=1. / 255)
    model = applications.ResNet50(include_top=False, weights='imagenet')
    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
    for i in generator.filenames:
        Itemcodes.append(i[(i.find("/")+1):i.find(".")])
    extracted_features = model.predict_generator(generator, nb_train_samples // batch_size)
    extracted_features = extracted_features.reshape((801, 10035)) #1 is nb_train_samples #2 number of vectors we wish to represent as
    np.save(open('./x.npy', 'wb'), extracted_features) #save as numpy array to save computational time
    np.save(open('./x_ids.npy', 'wb'), np.array(Itemcodes)) #save as numpy array ro save computations time
    
a = datetime.now()
extract_features()
print("Time taken in feature extraction", datetime.now()-a)

In [None]:
articleIds = []
for image_path in sample_image_paths:
    articleIds.append(image_path.split('/')[-1].split('.')[0])
    
extracted_features = all_features.cpu().numpy()
print(len(articleIds), extracted_features.shape)

In [None]:
articleIds = np.array(articleIds)
print(articleIds.shape)

In [None]:
# #load all the data to be worked upon
# extracted_features = np.load('/kaggle/working/x.npy')
# Productids = np.load('/kaggle/working/xtids.npy')
# #data_copy = data.copy()
# #df_Productids = list(data['ProductId'])
# Productids = list(Productids)

In [None]:
Productids[0:10]

In [None]:
import pandas as pd
men = pd.read_csv('articles.csv')
print(men.columns)
print(men.shape)

In [None]:
men['ImageURL']

In [None]:
Productids = list(articleIds)

In [None]:
#Get recommendation of similar items
from sklearn.metrics import pairwise_distances

def get_similar_products_cnn(product_id, num_results):
    doc_id = Productids.index(product_id)
    pairwise_dist = pairwise_distances(extracted_features, extracted_features[doc_id].reshape(1,-1))
    indices = np.argsort(pairwise_dist.flatten())[0:num_results]
    pdists  = np.sort(pairwise_dist.flatten())[0:num_results]
    print("="*20, "input product image", "="*20)
    ip_row = men[['image_url','prod_name']].loc[men['article_id']==int(Productids[indices[0]])] #change for our use
    #print(ip_row.head())
    for indx, row in ip_row.iterrows():
        display(Image(url=row['image_url'], width = 224, height = 224,embed=True)) #change for our use
        print('Product Title: ', row['prod_name']) #change for our use
    print("\n","="*20, "Recommended products", "="*20)
    for i in range(1,len(indices)):
        rows = men[['image_url','prod_name']].loc[men['article_id']==int(Productids[indices[i]])] #change for our use
        for indx, row in rows.iterrows():
            display(Image(url=row['image_url'], width = 224, height = 224,embed=True)) #change for our use [width and height need to be ]
            print('Product Title: ', row['prod_name']) #change for our use
            print('Euclidean Distance from input image:', pdists[i])

get_similar_products_cnn('0676255002', 5)

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
def get_similar_recommendation(article_id, loop_number):
    index = image_id.index(article_id)
    pairwise_dist = pairwise_distances(features[index].reshape(1, -1), features)
    indices = np.argsort(pairwise_dist.flatten())
    print(indices)
    print(pairwise_dist)
    pdist = np.sort(pairwise_dist.flatten())
    print(pdist)
    for i in range(loop_number):
        print(image_id[indices[i]])

In [None]:
get_similar_recommendation("0509937020.jpg", 2)

In [None]:
pairwise_dist = pairwise_distances(features[0].reshape(1, -1), features)

In [None]:
indices = np.argsort(pairwise_dist.flatten())
print(indices)

In [None]:
pdist = np.sort(pairwise_dist.flatten())
print(pdist)