In [1]:
import pandas as pd

In [2]:
# Load the raw product metadata; lines=True reads the file as JSON Lines
# (one JSON object per line).
raw_metadata_path = '../data/meta_Amazon_Fashion.jsonl'
df = pd.read_json(raw_metadata_path, lines=True)

In [3]:
# Number of rows (first entry of the frame's shape)
df.shape[0]

826108

In [4]:
# Preview the first rows to see which columns the metadata provides.
df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,[],{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,B08BHN9PK5,
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"[Drawstring closure, Machine Wash]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,[],{'Package Dimensions': '15 x 10.2 x 0.4 inches...,B08R39MRDW,
2,AMAZON FASHION,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"[Zipper closure, Hand Wash Only]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Pastel by Vivienne,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B077KJHCJ4,
3,AMAZON FASHION,Mento Streamtail,2.0,1,"[Thermoplastic Rubber sole, High Density Premi...",[Slip on the Women's Mento and you're ready to...,29.81,[{'thumb': 'https://m.media-amazon.com/images/...,[],Guy Harvey,[],{'Package Dimensions': '11.22 x 4.72 x 4.33 in...,B0811M2JG9,
4,AMAZON FASHION,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"[Pull On closure, Size Guide: ""S"" fits calf 10...",[Ronnox Calf Sleeves - Allowing Your Body to P...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'HONEST Review: RONNOX Women's 3-Pa...,RONNOX,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07SB2892S,


Trying to understand the data. I want to reduce the dataset as much as possible and focus on high-value data.

In [5]:
# Remove every column in which all values are missing, then list what remains.
df = df.dropna(axis='columns', how='all')
df.columns

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin'],
      dtype='object')

In [6]:
# List the distinct values of main_category to see whether the column
# carries any information.
main_categories = df['main_category'].unique()
print("Unique main categories:", main_categories, sep="\n")


Unique main categories:
['AMAZON FASHION']


In [7]:
# Count the rows whose `categories` entry has at least one element.
has_categories = df['categories'].apply(len) > 0
print("Number of rows with non-empty categories:", int(has_categories.sum()))



Number of rows with non-empty categories: 0


In [8]:
# Count how often each parent_asin occurs, most frequent first, to check
# whether any identifier is shared between rows.
parent_asin_counts = df['parent_asin'].value_counts(sort=True, ascending=False)
print("\nParent ASIN counts (descending):", parent_asin_counts, sep="\n")



Parent ASIN counts (descending):
parent_asin
B08BHN9PK5    1
B00TT1OOB2    1
B08JTB3V2P    1
B01HI6GS2M    1
B01IU8B5Y8    1
             ..
B07HR134XV    1
B0823QNLLC    1
B07DSC93F8    1
B0176WL1FC    1
B0895H6NWS    1
Name: count, Length: 826108, dtype: int64


In [9]:
# Drop columns that add nothing for this analysis: main_category has a single
# value, categories is empty for every row and parent_asin is unique per row
# (see the outputs above). videos is dropped along with them.
uninformative_columns = ['main_category', 'videos', 'categories', 'parent_asin']
df = df.drop(columns=uninformative_columns)
df.columns

Index(['title', 'average_rating', 'rating_number', 'features', 'description',
       'price', 'images', 'store', 'details'],
      dtype='object')

In [10]:
# Gather every distinct key used in any row's `details` entry. Each entry is
# iterated directly, which yields its keys when it is a dict (as in the
# preview above).
details = df['details'].tolist()

detail_keys = {key for detail in details for key in detail}

print(detail_keys)

{'', 'Top Style', 'Base Material', 'Item Dimensions LxWxH', 'Pile Height', 'Color Code', 'Surface Recommendation nail', 'Ruling Type', 'Total Diamond Weight', 'Wheel Size', 'Minimum Weight Recommendation', 'Movement Detection Technology', 'Surface Recommendation Nail', 'Package Weight', 'Item model number', 'Shape', 'Case Material', 'Lens Material', 'Frame Material Metal', 'Shaft Height', 'Filter Class', 'Eye Relief', 'Night vision', 'OEM Part Number', 'Plant or Animal Product Type', 'Fabric Type', 'Brake Width', 'Point Type', 'Blade Length', 'Insole Type', 'Material other', 'Thickness', 'Seat Height', 'Thread Type', 'Watch Movement', 'National Stock Number', 'Occasion Travel', 'Year', 'Cable Feature', 'Date First Available', 'Included Components', 'Item Volume', 'Material Acrylic', 'Puzzle type', 'Minimum Age Recomendation', 'Are Batteries Included', 'Screen Size', 'Locking', 'Drain Type', 'Bike Type', 'Capacity 11 Fluid Ounces', 'Liquid Contents Description', 'Suggested Users', 'Wire

In [11]:
print("Percentage of items that have some features")
# Note: the value shown is a fraction between 0 and 1, not a percentage.
has_features = df['features'].apply(len) > 0
int(has_features.sum()) / len(df)

Percentage of items that have some features


0.5605489839100941

In [12]:
print("Percentage of items that have a description")
# Note: the value shown is a fraction between 0 and 1, not a percentage.
has_description = df['description'].apply(len) > 0
int(has_description.sum()) / len(df)

Percentage of items that have a description


0.07176906651430563

In [13]:
# Number of rows with a non-missing price.
df['price'].notna().sum()

np.int64(50249)

In [14]:
# Number of rows with a non-missing rating_number.
df['rating_number'].notna().sum()

np.int64(826108)

In [15]:
def first_thumbnail(images):
    """Return the 'thumb' value of the first image entry, or None.

    None is returned when `images` is empty/falsy or when its first entry has
    no 'thumb' key.
    """
    if not images:
        return None
    first_image = images[0]
    if 'thumb' not in first_image:
        return None
    return first_image['thumb']


# Keep only the first image's thumbnail URL per product.
df['thumbnail'] = df['images'].apply(first_thumbnail)

# Show a sample of the data with the new thumbnail column.
df.head()


Unnamed: 0,title,average_rating,rating_number,features,description,price,images,store,details,thumbnail
0,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,GiveGift,{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,https://m.media-amazon.com/images/I/41+cCfaVOF...
1,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"[Drawstring closure, Machine Wash]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,DouBCQ,{'Package Dimensions': '15 x 10.2 x 0.4 inches...,https://m.media-amazon.com/images/I/515cR-ta1E...
2,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"[Zipper closure, Hand Wash Only]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,Pastel by Vivienne,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",https://m.media-amazon.com/images/I/31GwmwNCdA...
3,Mento Streamtail,2.0,1,"[Thermoplastic Rubber sole, High Density Premi...",[Slip on the Women's Mento and you're ready to...,29.81,[{'thumb': 'https://m.media-amazon.com/images/...,Guy Harvey,{'Package Dimensions': '11.22 x 4.72 x 4.33 in...,https://m.media-amazon.com/images/I/31P-uHUUIX...
4,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"[Pull On closure, Size Guide: ""S"" fits calf 10...",[Ronnox Calf Sleeves - Allowing Your Body to P...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,RONNOX,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",https://m.media-amazon.com/images/I/51CqMDJOOD...


In [16]:
# details and images are no longer needed: the thumbnail URL has already been
# extracted into its own column.
df = df.drop(columns=['details', 'images'])

In [17]:
# Reduce the dataset to rows that have a price (50,249 of 826,108 rows per the
# counts above). notna() replaces the `isna() == False` comparison, and the
# explicit copy makes the result an independent frame so that columns can be
# added to it later without pandas warning about writing to a slice.
df = df[df['price'].notna()].copy()

In [18]:
# Number of rows left after the price filter.
df.shape[0]

50249

In [None]:
# Recomputed on the price-filtered data.
print("Percentage of items that have some features")
# Note: the value shown is a fraction between 0 and 1, not a percentage.
has_features = df['features'].apply(len) > 0
int(has_features.sum()) / len(df)

In [None]:
# Recomputed on the price-filtered data.
print("Percentage of items that have a description")
# Note: the value shown is a fraction between 0 and 1, not a percentage.
has_description = df['description'].apply(len) > 0
int(has_description.sum()) / len(df)

In [45]:
def generate_product_spec_string(df_row):
    """Combine a product's title, features and description into one string.

    Args:
        df_row: Row-like object indexable by the keys 'title', 'features' and
            'description' (for example the Series passed by
            ``df.apply(..., axis=1)``). 'features' and 'description' are
            expected to be lists of text entries.

    Returns:
        str: The title followed by every feature entry and then every
        description entry, separated by single spaces. A title that is not a
        string (e.g. None/NaN) and a 'features'/'description' value that is
        not a list or tuple are skipped rather than raising, so a single
        incomplete row cannot abort a whole ``apply``.
    """
    title = df_row['title']
    # A missing title is not a str; leave it out instead of failing on
    # string concatenation.
    parts = [title] if isinstance(title, str) else []

    for column in ('features', 'description'):
        entries = df_row[column]
        # Only real sequences contribute; anything else is treated as empty.
        if isinstance(entries, (list, tuple)):
            parts.extend(str(entry) for entry in entries if entry is not None)

    # Joining all parts with one space matches the original
    # title + " " + " ".join(...) layout for well-formed rows.
    return " ".join(parts)


# Build the combined text for every row, then store it as a new column.
product_specs = df.apply(generate_product_spec_string, axis=1)
df['product_spec'] = product_specs

In [None]:
# Preview the result, including the newly added product_spec column.
df.head()

In [None]:
# Persist the reduced dataset as JSON Lines: one record (row) per line.
processed_path = '../data/processed_data.jsonl'
df.to_json(processed_path, orient='records', lines=True)