In [9]:
import argparse
import csv
import gzip
from typing import List
from ast import literal_eval
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from pprint import pprint
from tqdm import tqdm

In [10]:
import networkx
import scipy as sc

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
read_path = '../data/interim/meta_Electronics_edges.csv'
df = pd.read_csv(read_path)

In [13]:
df.rename(columns = {'product1.1' : 'product2'}, inplace = True)

In [14]:
df.head()

Unnamed: 0,product1,product2,weight,product_pair
0,60009810,039914157x,1.0,0060009810|039914157x
1,60009810,0425167798,1.0,0060009810|0425167798
2,60219602,0060219394,1.5,0060219602|0060219394
3,60219602,0060219475,0.5,0060219602|0060219475
4,60219602,0060219521,1.5,0060219602|0060219521


In [15]:
val_prop = 0.25

In [16]:
n_val_samples = int(val_prop*df.shape[0])
print(f'No. of samples in validation = {n_val_samples}')

No. of samples in validation = 720499


In [17]:
train, val = train_test_split(df, test_size=int(n_val_samples), random_state=42)

In [18]:
valid_product_pairs = set(df['product_pair'])
print(f'No. of valid product pair - {len(valid_product_pairs)}')

No. of valid product pair - 2881996


In [19]:
val_product_arr = np.array(list(set(val['product1']).union(set(val['product2']))))

In [20]:
def get_sample(item_array, n_iter = None, sample_size = 2):
    n = len(item_array)
    start_idx = (n_iter * sample_size) % n
    if (start_idx + sample_size >= n) or (start_idx <= sample_size):
        np.random.shuffle(item_array)
    
    return item_array[start_idx:start_idx + sample_size]
    
    
def collect_samples(item_array, sample_size, n_samples):
    samples = []
    for i in range(0, n_samples):
        if i % 1000000 == 0:
            print(f'Neg Sample : {i}')
        sample = get_sample(item_array, n_iter = i, sample_size = sample_size)
        samples.append(sample)
    return samples

def create_product_df(df, col_list):
    pairs = df[col_list].values
    pairs.sort(axis = 1)
    df['product_pair'] =['|'.join(arr) for arr in pairs]
    
    return df

def combine_val_and_neg_edges(val, neg_samples):
    neg_samples['edge'] = 0
    val['edge'] = 1
    
    VAL_COLS = ['product1', 'product2', 'edge']
    neg_samples = neg_samples[VAL_COLS]
    val = val[VAL_COLS]
    
    val = pd.concat([val, neg_samples])
    
    print(f'Final val df size = {val.shape[0]}')
    
    return val

In [21]:
neg_samples = collect_samples(val_product_arr, 
                              sample_size=2, 
                              n_samples=int(1.1 * n_val_samples))
neg_samples_df = pd.DataFrame(neg_samples, columns=['product1', 'product2'])
neg_samples_df.dropna(inplace=True)

Neg Sample : 0


In [22]:
neg_samples_df = create_product_df(neg_samples_df, 
                                   col_list = ['product1', 'product2'])

In [23]:
print(f'Size of the negative sample df = {neg_samples_df.shape[0]}')

Size of the negative sample df = 792545


In [24]:
neg_samples_df = neg_samples_df[~neg_samples_df['product_pair'].isin(valid_product_pairs)]
print(f'Size of negative samples df after removing valid pairs = {neg_samples_df.shape[0]}')

Size of negative samples df after removing valid pairs = 792508


In [25]:
neg_samples_df = neg_samples_df.iloc[:n_val_samples].copy()

In [26]:
val = combine_val_and_neg_edges(val, neg_samples_df)

Final val df size = 1440998


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val['edge'] = 1


In [27]:
val.head()

Unnamed: 0,product1,product2,edge
1414411,b00mmzfrhw,b01mzyoj76,1
2080999,b014lrkdwm,b071hb4bpr,1
560068,b005l8vf3w,b00rkbb94s,1
2684943,b01es8uwfm,b00bd8i3ei,1
1918376,b00zw80dt8,b01eaiv5h4,1


In [28]:
train.head()

Unnamed: 0,product1,product2,weight,product_pair
1033747,b00f37z8q6,b00priwjay,1.0,b00f37z8q6|b00priwjay
573036,b005p9g7dk,b00fzpdg1k,1.0,b005p9g7dk|b00fzpdg1k
396472,b003sw13wq,b004yi8eso,1.0,b003sw13wq|b004yi8eso
473085,b004rym2he,b00v7cbh6g,1.0,b004rym2he|b00v7cbh6g
1186362,b00idan8h6,b009tnagda,1.0,b00idan8h6|b009tnagda


In [29]:
train = train[['product1', 'product2', 'weight']].copy()

In [30]:
print(f'Train size = {train.shape[0]}')
print(f'Val Size = {val.shape[0]}')

Train size = 2161497
Val Size = 1440998


In [31]:
input_filename = Path(read_path).resolve().stem

In [32]:
train.to_csv(f'../data/interim/{input_filename}_train.csv', index = False)
val.to_csv(f'../data/interim/{input_filename}_val.csv', index = False)

train.to_csv(f'../data/interim/{input_filename}_train.edgelist', 
             sep = ' ',
             index = False,
             header = False)