# Amazon Data Manipulation
#### Data from Hugging Face: [Amazon Products Dataset 2023](https://huggingface.co/datasets/Studeni/AMAZON-Products-2023)

## TABLE OF CONTENTS
### $~~~$ - 1. Preview
### $~~~$ - 2. Drop NaN
### $~~~$ - 3. Check PRODUCT_ID Duplication
### $~~~$ - 4. Save to CSV
### $~~~$ - 5. Visualization

---
## 1. Preview

In [None]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

In [None]:
# Load the 'Studeni/AMAZON-Products-2023' dataset by name through the
# Hugging Face `datasets` library.
amazon_products_dataset = load_dataset('Studeni/AMAZON-Products-2023')
# Bare expression: the notebook shows the loaded object as this cell's output.
amazon_products_dataset

In [None]:
# Build a flat DataFrame from the 'train' split: dataset columns are mapped to
# upper-case headers and newlines in the free-text fields become spaces.
train_split = amazon_products_dataset['train']


def _single_line(values):
    """Return a list of *values* with each newline replaced by one space."""
    return [value.replace('\n', ' ') for value in values]


amazon_products_df = pd.DataFrame(
    {
        'PRODUCT_ID': train_split['parent_asin'],
        'TITLE': _single_line(train_split['title']),
        'DESCRIPTION': _single_line(train_split['description']),
        'MAIN_CATEGORY': train_split['main_category'],
        'CATEGORIES': train_split['categories'],
        'AVERAGE_RATING': train_split['average_rating'],
        'RATING_NUMBER': train_split['rating_number'],
        'PRICE': train_split['price'],
        'DETAILS': _single_line(train_split['details']),
    }
)
# Print column dtypes and non-null counts for a quick sanity check.
amazon_products_df.info()

In [None]:
# Bare expression: display the assembled DataFrame as the cell output.
amazon_products_df

---
## 2. Drop NaN

In [None]:
# Discard rows that have no MAIN_CATEGORY, then renumber the index from 0 so
# it is contiguous again (the old index is not kept as a column).
amazon_products_df = (
    amazon_products_df
    .dropna(subset=['MAIN_CATEGORY'])
    .reset_index(drop=True)
)

In [None]:
# Re-print the frame summary to see the row count after the NaN drop.
amazon_products_df.info()

---
## 3. Check PRODUCT_ID Duplication

In [None]:
# PRODUCT_ID is duplicate-free when the number of distinct values equals the
# number of rows; the resulting string is shown as the cell output.
distinct_product_ids = amazon_products_df['PRODUCT_ID'].nunique()
f'[*] PRODUCT_ID none duplicate: {distinct_product_ids == len(amazon_products_df)}'

---
## 4. Save to CSV

In [None]:
import os

In [None]:
def dir_check(dir_path):
    """Create the directory *dir_path* if it does not already exist.

    Missing parent directories are created as well. A message is printed only
    when the directory has to be created; nothing happens if the path exists.

    Args:
        dir_path: Path of the directory to ensure.
    """
    if not os.path.exists(dir_path):
        print(f'[*] Creating directory - "{dir_path}"...')
        # makedirs also builds intermediate directories, and exist_ok tolerates
        # the directory appearing between the check above and this call.
        os.makedirs(dir_path, exist_ok=True)

In [None]:
# Output directory, given as a path relative to the current working directory.
base_dir = "../../trainData"
# Create it first if it is missing so the CSV export has somewhere to write.
dir_check(base_dir)

In [None]:
# Write the cleaned table to a CSV file under base_dir, without the index column.
train_csv_path = os.path.join(base_dir, 'amazon_products.train.csv')
amazon_products_df.to_csv(train_csv_path, index=False)

---
## 5. Visualization

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Categories

In [None]:
# Tally how many rows carry each MAIN_CATEGORY value.
categories = Counter(amazon_products_df['MAIN_CATEGORY'])

In [None]:
# Category -> count mapping ordered from the most to the least frequent
# category; the sort is stable, so tied categories keep their tally order.
descending_categories = dict(
    sorted(categories.items(), key=lambda pair: pair[1], reverse=True)
)

In [None]:
# Bar chart of row counts per main category, in the order stored in
# descending_categories.
category_names = list(descending_categories)
category_counts = list(descending_categories.values())

plt.figure(figsize=(10, 8))
sns.barplot(x=category_names, y=category_counts)
plt.title('Amazon Products Categories')
plt.ylabel(f'Number of Products (Total {amazon_products_df.shape[0]})')
plt.xticks(rotation=90)  # draw the category labels vertically
plt.show()

### Description Tokens

In [None]:
# Histogram of description sizes: token count -> number of rows, where a
# "token" is a piece obtained by splitting the description on single spaces.
description_length = {}
for description in tqdm(amazon_products_df['DESCRIPTION']):
    token_count = len(description.split(' '))
    description_length[token_count] = description_length.get(token_count, 0) + 1

In [None]:
# Same histogram re-keyed in ascending token-count order. NOTE: despite the
# "descending" in the variable name, the smallest token count comes first.
descending_description_length = {
    token_count: description_length[token_count]
    for token_count in sorted(description_length)
}

In [None]:
# Bar chart of how many descriptions have each token count. Individual x tick
# labels are hidden; the observed range is reported in the x-axis label.
token_counts = list(descending_description_length)
row_counts = list(descending_description_length.values())
smallest_token_count = np.min(token_counts)
largest_token_count = np.max(token_counts)

plt.figure(figsize=(10, 8))
sns.barplot(x=token_counts, y=row_counts)
plt.title('Amazon Products Description Tokens Statistic')
plt.xlabel(f'From Token Number {smallest_token_count} - {largest_token_count}')
plt.ylabel('Description Token(1 word) Frequency')
plt.xticks([])
plt.show()

In [None]:
# Percentage of rows whose description has at most 512 space-separated tokens.
# NOTE: the filter selects token counts <= 512, so the label says "<=" to
# describe the value that is actually computed.
rows_within_limit = np.sum(
    [count for token_count, count in descending_description_length.items() if token_count <= 512]
)
f'Description tokens <= 512 ratio: {round((rows_within_limit / amazon_products_df.shape[0]) * 100, 2)}%'