In [435]:
import os
import sys
import pandas as pd
import numpy as np
import math
import random
from gensim import corpora
from gensim.similarities import SparseMatrixSimilarity
from src.preprocessing import remove_stop_words

## Loading/Preprocessing Data

In [22]:
def chunk_data():
    chunk_size = 100000
    batch = 1
    for chunk in pd.read_json('data/base/offers_corpus_english_v2.json', lines=True, nrows= 100000000000000, chunksize=chunk_size):
        chunk.to_json('data/base/product_corpus/chunk' + str(batch) + '.json')
        batch += 1

In [23]:
def generate_computer_data():
    chunk_size = 100000
    computer_df = pd.DataFrame()
    for chunk in pd.read_json('data/base/offers_corpus_english_v2.json', lines=True, nrows= 100000000000000, chunksize=chunk_size):
        computer_df = computer_df.append(chunk[chunk['category'].values == 'Computers_and_Accessories'])
    return computer_df

In [134]:
def get_pos_clusters(df):
    MAX_CLUSTER_SIZE = 80
    valid_clusters = (((df['cluster_id'].value_counts() > 1) & 
                        (df['cluster_id'].value_counts() <= MAX_CLUSTER_SIZE)))

    valid_clusters = list(valid_clusters[valid_clusters == True].index)
    all_clusters = df[df['cluster_id'].isin(valid_clusters)]['cluster_id'].values
    return set(all_clusters)

In [178]:
def extract_key_features_OLD(computer_df):
    left = computer_df[['id_left', 'title_left',
                        'description_left', 'cluster_id_left']]
    right = computer_df[['id_right', 'title_right',
                         'description_right', 'cluster_id_right']]

    left = left.rename(columns={'id_left': 'id', 
                        'title_left': 'title',
                        'description_left': 'description',
                        'cluster_id_left': 'cluster_id'
                       })
    
    right = right.rename(columns={'id_right': 'id',
                          'title_right': 'title',
                          'description_right': 'description',
                          'cluster_id_right': 'cluster_id'
                         })
    
    ret = left.append(right)
    ret = ret.drop_duplicates(subset=['id'])
    
    return ret

In [15]:
computer_df = pd.read_csv('data/base/computer_wdc_whole_no_duplicates.csv')

In [10]:
computer_df = computer_df.drop_duplicates('title')

In [4]:
computer_df

Unnamed: 0,brand,category,cluster_id,description,id,identifiers,keyValuePairs,price,specTableContent,title
0,,Computers_and_Accessories,1554982,,41,"[{'/mpn': '[nxm81eh034]'}, {'/gtin13': '[47131...","{'categorie': 'laptops', 'merk': 'acer', 'prod...",,categorie laptops merk acer productserie aspir...,acer aspire e1 522 65208g1tmnkk specificaties ...
1,,Computers_and_Accessories,15189423,description,55,[{'/productID': '[k1009900]'}],,,,kohler lavatory bonnet 1009900 bn ferguson
2,hp enterprise,Computers_and_Accessories,14583973,description hp third party rackmount option ki...,72,"[{'/sku': '[231122b21]'}, {'/mpn': '[231122b21...","{'category': 'hp option', 'sub category': 'rac...",,specifications category hp option sub category...,"null , 231122 b21 hp 3rd party rail kit ml370 ..."
3,hp enterprise,Computers_and_Accessories,3859891,description hp proliant dl380 g6 rack mountabl...,75,[{'/sku': '[491505001]'}],,,,"null , 491505 001 hp dl380 g6 e5504 2 00ghz 4g..."
4,,Computers_and_Accessories,10106149,,91,[{'/mpn': '[ds1010c101]'}],,,,ds1010c 101
...,...,...,...,...,...,...,...,...,...,...
432616,,Computers_and_Accessories,16080620,""" ...",17557404,[{'/productID': '[5403739000]'}],,,,"""Acer Swift 1 SF113-31-P4A2 Pink ""@de"
432617,,Computers_and_Accessories,4750388,,17557407,[{'/gtin12': '[888462794961]'}],,,,"""Apple MJYR2LL/A Smart Keyboard for 12.9-inch ..."
432618,,Computers_and_Accessories,518797,"""Unfold the full-size Smart Keyboard when you ...",17557412,[{'/mpn': '[mptl2lla]'}],,,,"""Apple - Smart Keyboard for 10.5-inch iPad Pro..."
432619,"""Apple""",Computers_and_Accessories,13868227,"""11.6-inch LED-backlit glossy widescreen TFT d...",17557460,[{'/mpn': '[mjvm2lla]'}],,,,"""Macbook Air 11.6-inch (Glossy) 1.6GHZ Dual Co..."


In [16]:
all_clusters = set(computer_df['cluster_id'].values)

In [127]:
num = 0
for cluster in all_clusters:
    if len(computer_df.loc[computer_df['cluster_id'].values == cluster]) >= 2 and len(computer_df.loc[computer_df['cluster_id'].values == cluster]) < 80:
        num += 1

print(num)

52530


In [252]:
len(all_clusters)

295932

## Building Dictionary and Similarity

In [415]:
def extract_key_features(cluster):
    new_cluster = cluster.loc[:, ("id", "description", "title")]
    new_cluster["title"] = new_cluster["title"].map(lambda x: remove_stop_words(x))
    new_cluster["description"] = new_cluster["description"].map(lambda x: remove_stop_words(x))
    new_cluster["titleDesc"] = new_cluster["title"].map(lambda x: x.split(" ")) + new_cluster["description"].map(lambda x: x.split(" ")).map(lambda x: x[0:6])
    return new_cluster

In [416]:
pos_clusters = list(get_pos_clusters(computer_df))

In [417]:
cluster = computer_df.loc[computer_df["cluster_id"].values == pos_clusters[0]].copy()

In [418]:
cluster = extract_key_features(cluster)

In [419]:
dictionary = corpora.Dictionary(cluster["titleDesc"])

In [420]:
cluster_dict = [dictionary.doc2bow(title) for title in cluster["title"].map(lambda x: x.split(" "))]

In [421]:
index = SparseMatrixSimilarity(cluster_dict, num_features=len(dictionary))

In [422]:
index[cluster_dict]

array([[1.        , 0.87705797, 0.7526177 , 0.76271284],
       [0.87705797, 1.0000001 , 0.8581164 , 0.71151245],
       [0.7526177 , 0.8581164 , 1.0000001 , 0.8291561 ],
       [0.76271284, 0.71151245, 0.8291561 , 1.        ]], dtype=float32)

In [355]:
def combinations(total, choose):
    return int(math.factorial(total) / (math.factorial(choose) * math.factorial(total - choose)))

In [487]:
def create_pos_data(data, cluster_id):
    MAX_PAIRS = 16
    cluster = data.loc[data["cluster_id"].values == cluster_id]
    cluster = extract_key_features(cluster)
    max_combos = combinations(len(cluster), 2)
    
    dictionary = corpora.Dictionary(cluster["titleDesc"])
    cluster_dict = [dictionary.doc2bow(title) for title in cluster["title"].map(lambda x: x.split(" "))]
    sim_matrix = np.array(SparseMatrixSimilarity(cluster_dict, num_features=len(dictionary)))
    
    for row in range(sim_matrix.shape[0]):
        for column in range(sim_matrix.shape[1]):
            if (row >= column):
                sim_matrix[row][column] = 100
        
    if max_combos < MAX_PAIRS:
        MAX_PAIRS = max_combos
    
    hard_pos = MAX_PAIRS // 2
    random_pos = MAX_PAIRS - hard_pos
    
    pairs = []
    for x in range(hard_pos):
        min_sim = np.unravel_index(sim_matrix.argmin(), sim_matrix.shape)
        pair = [cluster["title"].iloc[min_sim[0]], cluster["title"].iloc[min_sim[1]], 1]
        pairs.append(pair)
        sim_matrix[min_sim[0]][min_sim[1]] = 100
    
    avail_indices = np.argwhere(sim_matrix != 100)
    for x in range(random_pos):
        print(avail_indices)
        ran_idx = random.sample(list(range(len(avail_indices))), 1)
        choice = avail_indices[ran_idx][0]
        pair = [cluster["title"].iloc[choice[0]],
                cluster["title"].iloc[choice[1]], 1]
        pairs.append(pair)
        avail_indices = np.delete(avail_indices, ran_idx, 0)
    
    return pairs

In [488]:
create_pos_data(computer_df, 131074)

[[0 1]
 [1 2]
 [2 3]]
[[1 2]
 [2 3]]
[[2 3]]


[['cx 2g10 300 emc gb 10k 3 5 fc al',
  'cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack wholesale price 2pack',
  1],
 ['cx 2g10 300 emc gb 10k 3 5 fc al new wholesale price',
  'cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd',
  1],
 ['cx 2g10 300 emc gb 10k 3 5 fc al new wholesale price',
  'cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack wholesale price 2pack',
  1],
 ['cx 2g10 300 emc gb 10k 3 5 fc al new wholesale price',
  'cx 2g10 300 emc gb 10k 3 5 fc al',
  1],
 ['cx 2g10 300 emc gb 10k 3 5 fc al',
  'cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd',
  1],
 ['cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd',
  'cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack wholesale price 2pack',
  1]]

In [408]:
cluster["title"]

11736     cx 2g10 300 emc gb 10k 3 5 fc al , null new wh...
71690               null , cx 2g10 300 emc gb 10k 3 5 fc al
190454         cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd , null
224854    null , cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2...
Name: title, dtype: object