In [30]:
import argparse
import csv
import gzip
from typing import List
from ast import literal_eval

import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype
from pprint import pprint
from tqdm import tqdm

## Transform Categories 

In [31]:
read_path = '../data/interim/meta_Electronics.csv'
META_COLS = ['asin', 
             'category', 
             'title', 
             'description', 
             'price', 
             'brand', 
             'also_buy', 
             'also_view',
             'similar_item']
df = pd.read_csv(read_path, usecols = META_COLS, converters = {'also_buy' : literal_eval,
                                                              'also_view' : literal_eval, 
                                                              'category' : literal_eval,
                                                              'description' : literal_eval,})

  df = pd.read_csv(read_path, usecols = META_COLS, converters = {'also_buy' : literal_eval,


In [32]:
df.head()

Unnamed: 0,category,description,title,also_buy,brand,also_view,similar_item,price,asin
0,"[electronics, camera &amp; photo, video survei...",[the following camera brands and models have b...,genuine geovision 1 channel 3rd party nvr ip s...,[],geovision,[],,$65.00,11300000
1,"[electronics, camera &amp; photo]",[this second edition of the handbook of astron...,"books ""handbook of astronomical image processi...",[0999470906],33 books co.,"[0943396670, 1138055360, 0999470906]",,,43396828
2,"[electronics, ebook readers &amp; accessories,...",[a zesty tale. (publishers weekly)<br /><br />...,one hot summer,"[0425167798, 039914157x]",visit amazon's carolina garcia aguilera page,[],,$11.49,60009810
3,"[electronics, ebook readers & accessories, ebo...",[],hurray for hattie rabbit: story and pictures (...,"[0060219521, 0060219580, 0060219394]",visit amazon's dick gackenbach page,"[0060219521, 0060219475, 0060219394]",,.a-section.a-spacing-mini{margin-bottom:6px!im...,60219602
4,"[electronics, ebook readers & accessories, ebo...",[&#8220;sex.lies.murder.fame. is brillllli&#82...,sex.lies.murder.fame.: a novel,[],visit amazon's lolita files page,[],,$13.95,60786817


In [33]:
def get_category_lvl(category_list: list, lvl = 0) -> str:
    try:
        return category_list[lvl]
    except IndexError:
        return 'NA_VALUE'

In [34]:
df['category_lvl_1'] = df['category'].apply(get_category_lvl, args=(0,))
df['category_lvl_2'] = df['category'].apply(get_category_lvl, args=(1,))
df['category_lvl_3'] = df['category'].apply(get_category_lvl, args=(2,))
df['category_lvl_4'] = df['category'].apply(get_category_lvl, args=(3,))

In [35]:
df.head()

Unnamed: 0,category,description,title,also_buy,brand,also_view,similar_item,price,asin,category_lvl_1,category_lvl_2,category_lvl_3,category_lvl_4
0,"[electronics, camera &amp; photo, video survei...",[the following camera brands and models have b...,genuine geovision 1 channel 3rd party nvr ip s...,[],geovision,[],,$65.00,11300000,electronics,camera &amp; photo,video surveillance,surveillance systems
1,"[electronics, camera &amp; photo]",[this second edition of the handbook of astron...,"books ""handbook of astronomical image processi...",[0999470906],33 books co.,"[0943396670, 1138055360, 0999470906]",,,43396828,electronics,camera &amp; photo,NA_VALUE,NA_VALUE
2,"[electronics, ebook readers &amp; accessories,...",[a zesty tale. (publishers weekly)<br /><br />...,one hot summer,"[0425167798, 039914157x]",visit amazon's carolina garcia aguilera page,[],,$11.49,60009810,electronics,ebook readers &amp; accessories,ebook readers,NA_VALUE
3,"[electronics, ebook readers & accessories, ebo...",[],hurray for hattie rabbit: story and pictures (...,"[0060219521, 0060219580, 0060219394]",visit amazon's dick gackenbach page,"[0060219521, 0060219475, 0060219394]",,.a-section.a-spacing-mini{margin-bottom:6px!im...,60219602,electronics,ebook readers & accessories,ebook readers,NA_VALUE
4,"[electronics, ebook readers & accessories, ebo...",[&#8220;sex.lies.murder.fame. is brillllli&#82...,sex.lies.murder.fame.: a novel,[],visit amazon's lolita files page,[],,$13.95,60786817,electronics,ebook readers & accessories,ebook readers,NA_VALUE


In [36]:
df['similar_item'].unique()[2]

' class="a-bordered a-horizontal-stripes  a-spacing-extra-large a-size-base comparison_table">\n\n\n\n            \n            \n            \n            \n            \n            <tbody><tr class="comparison_table_image_row">\n                <td class="comparison_table_first_col"></td>\n\n\n                <th class="comparison_image_title_cell" role="columnheader">\n                    <div class="a-row a-spacing-top-micro">\n                        <center>\n                             <img alt="boox max carta ereader,13.3&quot; flexible screen 16 gb with built-in wi-fi,bluetooth" src="https://images-na.ssl-images-amazon.com/images/i/5127u2ftmdl._sl500_ac_ss350_.jpg" id="comparison_image" />\n                        </center>\n                    </div>\n                    <div class="a-row a-spacing-top-small">\n                        <div id="comparison_title" class="a-section a-spacing-none">\n                            <span aria-hidden="true" class="a-size-base a-color

In [37]:
df['similar_item'] = np.where(df['similar_item'].isnull(), 0, 1)

In [38]:
write_path = '../data/interim/meta_Electronics_transformed.csv'
df.to_csv(write_path, index = False)

## Node Relationships

In [71]:
read_path = '../data/interim/meta_Electronics_transformed.csv'
df = pd.read_csv(read_path, on_bad_lines = 'warn',
                 dtype = {'asin' : 'str', 
                          'title' : 'str',
                          'brand' : 'str'},
                 converters = {'also_buy' : literal_eval,
                               'also_view' : literal_eval, 
                               'category' : literal_eval,
                               'description' : literal_eval})

  df = pd.read_csv(read_path, on_bad_lines = 'warn',


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786445 entries, 0 to 786444
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   category        786445 non-null  object
 1   description     786445 non-null  object
 2   title           786426 non-null  object
 3   also_buy        786445 non-null  object
 4   brand           781009 non-null  object
 5   also_view       786445 non-null  object
 6   similar_item    786445 non-null  int64 
 7   price           310336 non-null  object
 8   asin            786445 non-null  object
 9   category_lvl_1  786445 non-null  object
 10  category_lvl_2  786445 non-null  object
 11  category_lvl_3  786445 non-null  object
 12  category_lvl_4  786445 non-null  object
dtypes: int64(1), object(12)
memory usage: 78.0+ MB


In [75]:
RELATED_FIELDS = ['also_buy', 'also_view']
df['is_na'] = df.apply(lambda x : not bool(sum([len(x[col]) for col in RELATED_FIELDS])), axis = 1)

In [76]:
df.head()

Unnamed: 0,category,description,title,also_buy,brand,also_view,similar_item,price,asin,category_lvl_1,category_lvl_2,category_lvl_3,category_lvl_4,is_na
0,"[electronics, camera &amp; photo, video survei...",[the following camera brands and models have b...,genuine geovision 1 channel 3rd party nvr ip s...,[],geovision,[],0,$65.00,11300000,electronics,camera &amp; photo,video surveillance,surveillance systems,True
1,"[electronics, camera &amp; photo]",[this second edition of the handbook of astron...,"books ""handbook of astronomical image processi...",[0999470906],33 books co.,"[0943396670, 1138055360, 0999470906]",0,,43396828,electronics,camera &amp; photo,NA_VALUE,NA_VALUE,False
2,"[electronics, ebook readers &amp; accessories,...",[a zesty tale. (publishers weekly)<br /><br />...,one hot summer,"[0425167798, 039914157x]",visit amazon's carolina garcia aguilera page,[],0,$11.49,60009810,electronics,ebook readers &amp; accessories,ebook readers,NA_VALUE,False
3,"[electronics, ebook readers & accessories, ebo...",[],hurray for hattie rabbit: story and pictures (...,"[0060219521, 0060219580, 0060219394]",visit amazon's dick gackenbach page,"[0060219521, 0060219475, 0060219394]",0,.a-section.a-spacing-mini{margin-bottom:6px!im...,60219602,electronics,ebook readers & accessories,ebook readers,NA_VALUE,False
4,"[electronics, ebook readers & accessories, ebo...",[&#8220;sex.lies.murder.fame. is brillllli&#82...,sex.lies.murder.fame.: a novel,[],visit amazon's lolita files page,[],0,$13.95,60786817,electronics,ebook readers & accessories,ebook readers,NA_VALUE,True


In [77]:
df = df[~df['is_na']]
df = df[~df['title'].isnull()]
df = df[['asin'] + RELATED_FIELDS]

In [78]:
df

Unnamed: 0,asin,also_buy,also_view
1,0043396828,[0999470906],"[0943396670, 1138055360, 0999470906]"
2,0060009810,"[0425167798, 039914157x]",[]
3,0060219602,"[0060219521, 0060219580, 0060219394]","[0060219521, 0060219475, 0060219394]"
5,0070524076,"[0073049557, 0134454170, 1118142063, 007733968...","[0073512141, 0077339681, 0073049557, 007304956..."
6,0091912407,[0330509691],[b0719ldqr1]
...,...,...,...
786424,b01hjcn5gc,"[b01hjcn55i, b01d6ieg6q]",[]
786431,b01hjffhtc,[],"[b07bc99b8l, b06x9k9x1g, b07gdc29sr, b07cwng32..."
786433,b01hjfrhya,[b01m096i7q],[]
786437,b01hjf704m,[],"[b01ccmun8c, b003mttjoy, b008ifxqfu, b00jdvrci..."


In [80]:
df['also_bought_count'] = df['also_buy'].apply(len)

In [83]:
df = df[df['also_bought_count'] >= 2]
df

Unnamed: 0,asin,also_buy,also_view,also_bought_count
2,0060009810,"[0425167798, 039914157x]",[],2
3,0060219602,"[0060219521, 0060219580, 0060219394]","[0060219521, 0060219475, 0060219394]",3
5,0070524076,"[0073049557, 0134454170, 1118142063, 007733968...","[0073512141, 0077339681, 0073049557, 007304956...",9
7,0101635370,"[b01naj3kqb, b00wyspt0c, b00af40u5g, b00ofvnm4...","[b01naj3kqb, b00ofvnm4g, b00l41wy8k, b07f34pnp...",5
10,0151004714,"[0307596907, 030726419x, 0140140999]","[0393356051, 0307743764, 0393355098, 030727666...",3
...,...,...,...,...
786418,b01hjcn5to,"[b01hjcn55i, b01d6ieg6q, b008plzaba]","[b01hrva9b4, b01hrvabd0, b01hjcn55i, b01f9rh5m4]",3
786420,b01hjdr9dq,"[b01czelia4, b06xccxmtb, b00ix1kebs, b01ey0x6f...","[b004wi1so8, b077ly5rqh, b00hkvqujw, b07548g6z...",11
786421,b01hjdxfqq,"[b002q907ew, b00hsf65mc, b009vcz4v8]",[],3
786423,b01hjdnl60,"[b00ivpu786, b01dob6y5q, b010q57t02, b01m592j9...","[b07dvf9lkg, b07cpsbxwl, b07d7xbk4k, b07cyw6jx...",10


In [84]:
df.drop(columns = ['also_bought_count'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns = ['also_bought_count'], inplace=True)


In [87]:
def explode_on_related(df: pd.DataFrame, relationship : str) -> pd.DataFrame:
    vals = df[relationship].values.tolist()
    lens = [len(val_list) for val_list in vals]
    vals_array = np.repeat(df['asin'], lens)
    exploded_df = pd.DataFrame(np.column_stack((vals_array, np.concatenate(vals))),
                               columns=['asin', 'related'])
    exploded_df['relationship'] = relationship
    return exploded_df

In [92]:
also_buy_df = explode_on_related(df, 'also_buy')
also_view_df = explode_on_related(df, 'also_view')

In [93]:
also_buy_df

Unnamed: 0,asin,related,relationship
0,0060009810,0425167798,also_buy
1,0060009810,039914157x,also_buy
2,0060219602,0060219521,also_buy
3,0060219602,0060219580,also_buy
4,0060219602,0060219394,also_buy
...,...,...,...
1919007,b01hjdnl60,b01mqmjfdk,also_buy
1919008,b01hjdnl60,b06xx29s9q,also_buy
1919009,b01hjdnl60,b00j4eqvpg,also_buy
1919010,b01hjcn5gc,b01hjcn55i,also_buy


In [94]:
also_view_df

Unnamed: 0,asin,related,relationship
0,0060219602,0060219521,also_view
1,0060219602,0060219475,also_view
2,0060219602,0060219394,also_view
3,0070524076,0073512141,also_view
4,0070524076,0077339681,also_view
...,...,...,...
1208868,b01hjdnl60,b01cw4bi6g,also_view
1208869,b01hjdnl60,b06xfqtdql,also_view
1208870,b01hjdnl60,b0773fxvkl,also_view
1208871,b01hjdnl60,b07gv3nb69,also_view


In [95]:
combined_df = pd.concat([also_buy_df, also_view_df], axis=0)

In [96]:
write_path = '../data/interim/meta_Electronics_relationship.csv'
combined_df.to_csv(write_path)