The [dataset](https://jmcauley.ucsd.edu/data/amazon_v2/index.html) is the metadata for the Magazine Subscriptions category.

In [1]:
import sys

# Running inside Google Colab is detected by the presence of its package in
# the already-imported modules.
IN_COLAB = "google.colab" in sys.modules
DIR_PATH = ""

if not IN_COLAB:
    # Local Windows machine: data directory plus two extra import locations.
    DIR_PATH = r"D:\\論文實驗\\data\\dblp\\"
    sys.path.extend([
        'D:\\論文實驗\\package',
        "D:\\論文實驗\\env\\Lib\\site-packages",
    ])
else:
    from google.colab import drive

    # The mount point is spelled "dirve" (sic); the paths below use the same
    # spelling, so they resolve against this mount.
    drive.mount("/content/dirve")
    DIR_PATH = r"/content/dirve/MyDrive/研究所/Data/dblp/"
    sys.path.append('/content/dirve/MyDrive/Colab Notebooks/package')

In [None]:
import pandas as pd
from utils import preprocessingText

# Directory holding the Amazon data files; this reassigns the DIR_PATH that
# the first cell pointed at the dblp data.
# NOTE: this is a raw string, so every `\\` stays as two backslash characters.
DIR_PATH = r"D:\\論文實驗\\data\\amazon\\"
# Metadata file under DIR_PATH; checkAttr/reduceAttr read it as line-delimited JSON.
DATASET = r"meta_Software.json"

def checkAttr():
    '''
        Print every record of the dataset that lacks a required attribute.

        Reads DIR_PATH + DATASET as line-delimited JSON and checks each record
        for the keys asin, also_view, also_buy, category and price. A record
        missing at least one of them is printed. Nothing is returned.

        Result recorded by the original author:
            all of the items have these properties.
    '''
    required_keys = ("asin", "also_view", "also_buy", "category", "price")

    # Decode explicitly as UTF-8: without `encoding`, open() uses the platform
    # locale encoding, which can fail or mis-decode non-ASCII JSON text.
    with open(DIR_PATH + DATASET, "r", encoding="utf-8") as f:
        data = pd.read_json(f, orient="records", typ="series", lines=True)

    for item in data:
        if not all(key in item for key in required_keys):
            print(item)

def reduceAttr(output_file):
    '''
        Keep only the needed product attributes and write them as CSV rows.

        The kept attributes are asin, also_view, also_buy, category and price,
        written one record per line as "asin,also_view,also_buy,category,price".
        When also_view or also_buy holds several ASINs they are separated by
        single spaces. Every category string goes through preprocessingText and
        the results are joined with spaces. The price loses its leading "$" and
        any "," characters; a price that does not start with "$" counts as empty.

        A record is skipped when also_view, also_buy, category or price ends up
        empty. No header row is written.

        Args:
            output_file: name of the file to write, relative to DIR_PATH.
    '''
    # Decode explicitly as UTF-8: without `encoding`, open() uses the platform
    # locale encoding, which can fail or mis-decode non-ASCII JSON text.
    with open(DIR_PATH + DATASET, "r", encoding="utf-8") as f:
        data = pd.read_json(f, orient="records", typ="series", lines=True, dtype={"category":list})

    # `with` closes the output file even when a record raises midway.
    with open(DIR_PATH + output_file, "w") as output:
        for item in data:
            also_view = " ".join(item["also_view"])
            also_buy = " ".join(item["also_buy"])
            category = " ".join([preprocessingText(string) for string in item["category"]])
            raw_price = item["price"]
            price = raw_price[1:] if raw_price.startswith("$") else ""

            if not (also_view and also_buy and category and price):
                continue

            row = "{asin},{also_view},{also_buy},{category},{price}".format(
                asin = item["asin"],
                also_view = also_view,
                also_buy = also_buy,
                category = category,
                # Thousands separators would otherwise add extra CSV columns.
                price = price.replace(",", ""))
            output.write(row + "\n")
# Print any record that is missing one of the required keys.
checkAttr()
# Write the reduced attribute rows to "extraction.csv" under DIR_PATH.
reduceAttr("extraction.csv")

### 需要保留的商品屬性
- asin
- also_view
- also_buy
- category
- price

In [None]:
# Debug cell: print the reduced CSV row of every record whose ASIN is
# "0078605407" and count how many such records exist.
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform
# locale encoding, which can fail or mis-decode non-ASCII JSON text.
with open(DIR_PATH + DATASET, "r", encoding="utf-8") as f:
    data = pd.read_json(f, orient="records", typ="series", lines=True)
    count = 0
    for item in data:
        if item["asin"] != "0078605407":
            continue

        count += 1
        also_view = " ".join(item["also_view"])
        also_buy = " ".join(item["also_buy"])
        category = " ".join([preprocessingText(string) for string in item["category"]])
        # Drop the leading "$"; a price without it counts as empty.
        price = item["price"][1:] if item["price"] != "" and item["price"][0] == "$" else ""

        print("{asin},{also_view},{also_buy},{category},{price}".format(
                asin = item["asin"],
                also_view = also_view,
                also_buy = also_buy,
                category = category,
                price = str(price)))
#print("{0}/{1}".format(count, data.size))

In [None]:
# Print the number of lines in extraction.csv.
with open(DIR_PATH + "extraction.csv", "r") as f:
    count = sum(1 for _ in f)
    print(count)

In [4]:
def read_items(filename):
    """Load the item CSV into a dict keyed by ASIN.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line must have exactly five comma-separated fields:
    asin, also_view, also_buy, category, price. also_view, also_buy and
    category are split on single spaces; price is converted to float.

    After loading, each item's also_view and also_buy lists are reduced to the
    ASINs that are themselves present in the file, and every such reference
    adds 1 to the referenced item's "freq" (which starts at 1).

    Args:
        filename: path of the CSV file to read.

    Returns:
        dict mapping asin -> {"also_view", "also_buy", "category", "price",
        "freq"}. An empty or header-only file gives an empty dict.

    Raises:
        ValueError: if a non-blank line does not have five fields or its
            price is not a number.
    """
    dataset = dict()
    with open(filename) as file:
        # The default keeps an empty file from raising StopIteration.
        next(file, None)
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            asin, also_view, also_buy, category, price = line.split(",")
            dataset[asin] = {
                "also_view": also_view.split(" "),
                "also_buy": also_buy.split(" "),
                "category": category.split(" "),
                # float() ignores the trailing newline still attached to price.
                "price": float(price),
                "freq": 1,
            }

    for attr in dataset.values():
        for relation in ("also_view", "also_buy"):
            known = [other for other in attr[relation] if other in dataset]
            for other in known:
                dataset[other]["freq"] += 1
            attr[relation] = known

    return dataset

def maxFrequency(filename):
    """Return the ASIN that is referenced most often in the item CSV.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line must have exactly five comma-separated fields:
    asin, also_view, also_buy, category, price. Each item starts with a
    frequency of 1 and gains 1 for every also_view / also_buy entry (of any
    item in the file) that names it. References to ASINs that are not in the
    file are ignored.

    The maximum is taken only after all references have been counted; comparing
    while still counting would judge each item on a partial total.

    Args:
        filename: path of the CSV file to read.

    Returns:
        The ASIN with the highest frequency; on a tie, the one that appears
        first in the file. "" when the file holds no items.

    Raises:
        ValueError: if a non-blank line does not have five fields.
    """
    dataset = dict()

    with open(filename) as file:
        # The default keeps an empty file from raising StopIteration.
        next(file, None)
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            asin, also_view, also_buy, _category, _price = line.split(",")
            dataset[asin] = {
                "also_view": also_view.split(" "),
                "also_buy": also_buy.split(" "),
                "freq": 1,
            }

    # First pass: count every reference to an item that exists in the file.
    for attr in dataset.values():
        for relation in ("also_view", "also_buy"):
            known = [other for other in attr[relation] if other in dataset]
            for other in known:
                dataset[other]["freq"] += 1
            attr[relation] = known

    # Second pass: pick the maximum from the completed counts.
    max_asin = ""
    max_freq = 0
    for asin, attr in dataset.items():
        if attr["freq"] > max_freq:
            max_asin = asin
            max_freq = attr["freq"]

    return max_asin


def sample_items(dataset:dict, root, num_substituion=0, num_complements=0, picked=None):
    """Collect items around `root` by breadth-first search over two relations.

    Runs one breadth-first search from `root` along "also_view" that adds up to
    `num_substituion` not-yet-picked items, then a second one from `root` along
    "also_buy" that adds up to `num_complements` more. `root` itself is added
    first (if absent) and does not count towards either limit. A search stops
    early when no unvisited item is reachable.

    Args:
        dataset: mapping asin -> dict holding "also_view" and "also_buy" lists
            of ASINs.
        root: ASIN at which both searches start.
        num_substituion: maximum number of new items taken via "also_view".
        num_complements: maximum number of new items taken via "also_buy".
        picked: list to extend in place; items already in it are never added
            again. A new list is created when omitted.

    Returns:
        The `picked` list, in the order the items were added.

    Raises:
        KeyError: if an item that has to be expanded is missing from `dataset`.
    """
    from collections import deque

    # A fresh list per call; a mutable default would be shared between calls.
    if picked is None:
        picked = []
    # Mirror of `picked` for O(1) membership tests.
    picked_set = set(picked)

    def _bfs_sample(num, relation:str):
        if relation not in ['also_buy', 'also_view']:
            raise ValueError("Parameter of relation is an error")

        if root not in picked_set:
            picked.append(root)
            picked_set.add(root)

        queue = deque([root])
        # Items already enqueued in this search; without it a cycle in the
        # relation graph keeps the queue from ever draining.
        seen = {root}

        while queue and num > 0:
            node = queue.popleft()

            if node not in picked_set:
                picked.append(node)
                picked_set.add(node)
                num -= 1
                if num == 0:
                    break

            # Items picked earlier are still expanded so the search can reach
            # through them.
            for neighbour in dataset[node][relation]:
                if neighbour not in seen:
                    seen.add(neighbour)
                    queue.append(neighbour)

    _bfs_sample(num_substituion, "also_view")
    _bfs_sample(num_complements, "also_buy")
    return picked
    
    
# print("asin: " + max_freq_asin)
# print("also_view: {0}".format(dataset[max_freq_asin]["also_view"]))
# print("also_buy: {0}".format(dataset[max_freq_asin]["also_buy"]))
# print("category: {0}".format(dataset[max_freq_asin]["category"]))
# print("price: {0}".format(dataset[max_freq_asin]["price"]))
# print("freq: {0}".format(dataset[max_freq_asin]["freq"]))

In [6]:
DIR_PATH = r"D:\\論文實驗\\data\\amazon\\"
FILE = r"preprocessed_Software.csv"

# Load the items and pick the search root; maxFrequency returns only the ASIN,
# not the frequency value itself.
dataset = read_items(DIR_PATH + FILE)
max_freq_asin = maxFrequency(DIR_PATH + FILE)

# One item via also_view and one via also_buy, starting from the root.
result = sample_items(dataset, max_freq_asin, 1, 1, [])
print(len(result))

# Write the sampled items in the same five-column layout, with a header row.
with open(DIR_PATH + "/sample_items.csv", "w") as file:
    file.write("asin,also_view,also_buy,category,price\n")
    for asin in result:
        attrs = dataset[asin]
        columns = (
            asin,
            " ".join(attrs["also_view"]),
            " ".join(attrs["also_buy"]),
            " ".join(attrs["category"]),
            attrs["price"],
        )
        file.write("{0},{1},{2},{3},{4}\n".format(*columns))

3
