# Importing Libraries

In [10]:
import json
import random
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
import streamlit as st
from bs4 import BeautifulSoup
import gc
import swifter


# Creating Sample File


In [11]:
input_file_path = r"D:\Study Material\semester 6\Big Data\Assignments\Ass3\All_Amazon_Meta.json\All_Amazon_Meta.json"
output_file_path = r"D:\Study Material\semester 6\Big Data\Assignments\Ass3\All_Amazon_Meta_Sampled.json"

def sample_json(input_file, output_file, target_size_gb, filter_key='also_buy'):
    # Convert the target size from gigabytes to bytes
    target_size_bytes = target_size_gb * 1024**3

    # Initialize the current size of the output file in bytes
    current_size_bytes = 0

    # Open the input file in read mode and the output file in write mode
    with open(input_file, 'r', encoding='utf-8') as infile, open (output_file, 'w', encoding='utf-8') as outfile:
        # Wrap the input file with tqdm for a single progress bar
        for line in tqdm(infile, desc="Sampling", unit=" lines", leave=True): 
            # Load the JSON data from the current line
            record = json.loads(line)
            # Check if the filter key exists and is not empty in the current record
            if record.get(filter_key):
                # If it exists, write the record to the output file and add a newline
                outfile.write(json.dumps(record) + '\n')
                # Add the size of the current line to the current size of the output file 
                current_size_bytes += len(line.encode('utf-8'))
                
                # If the current size of the output file is greater than or equal to the target size
                if current_size_bytes >= target_size_bytes:
                    # Stop writing to the output file
                    break
                
    print(f"Finished sampling. Output size: {current_size_bytes / 1024**3:.2f} GB")


In [12]:
sample_json(input_file_path, output_file_path, 20)


Sampling: 0 lines [00:00, ? lines/s][A
Sampling: 618 lines [00:00, 5809.65 lines/s][A
Sampling: 1711 lines [00:00, 8575.54 lines/s][A
Sampling: 3389 lines [00:00, 11829.72 lines/s][A
Sampling: 5142 lines [00:00, 13654.82 lines/s][A
Sampling: 7011 lines [00:00, 14609.50 lines/s][A
Sampling: 8491 lines [00:00, 14275.60 lines/s][A
Sampling: 10033 lines [00:00, 14173.12 lines/s][A
Sampling: 11462 lines [00:00, 13614.32 lines/s][A
Sampling: 13149 lines [00:01, 13248.09 lines/s][A
Sampling: 14478 lines [00:01, 11450.40 lines/s][A
Sampling: 15657 lines [00:01, 11388.35 lines/s][A
Sampling: 16819 lines [00:01, 10542.61 lines/s][A
Sampling: 17894 lines [00:01, 10146.54 lines/s][A
Sampling: 18921 lines [00:01, 10124.61 lines/s][A
Sampling: 19942 lines [00:01, 9524.50 lines/s] [A
Sampling: 20903 lines [00:01, 8678.22 lines/s][A
Sampling: 21892 lines [00:01, 8833.57 lines/s][A
Sampling: 22951 lines [00:02, 9216.13 lines/s][A
Sampling: 24033 lines [00:02, 9484.63 lines/s][A
Sam

Finished sampling. Output size: 20.00 GB





# Pre Processing


### Read File In A Data Frame 

In [13]:
# Define the input file path
input_file_path = r"D:\Study Material\semester 6\Big Data\Assignments\Ass3\All_Amazon_Meta_Sampled.json"

# Initialize an empty
data_list = []

# Define the chunk in MB 
chunk_size = 100 * 1024 * 1024  

# Start time
start_time = time.time()

# tqdm bar 
progress_bar = tqdm(unit='B', unit_scale=True, desc='Reading')
    
# Open the input file and read it in chunks
with open(input_file_path, 'r') as input_file:
    while True:
        # Read a chunk of data from the input file
        chunk = input_file.read(chunk_size)
        # Check if the chunk is empty (end of file)
        if not chunk:
            break
        # Split the chunk by newline to get individual JSON objects
        json_objects = chunk.strip().split('\n')
        # Parse each JSON object and append to the data list
        for json_object in json_objects:
            try:
                data_list.append(json.loads(json_object))
            except json.JSONDecodeError as e:
                #print(f"Skipping malformed JSON: {e}")
                continue
        # Delete the chunk to free up memory
        del chunk 
        # Update tqdm progress bar
        progress_bar.update(len(json_objects))

# Close tqdm progress bar
progress_bar.close()

# End time
end_time = time.time()
elapsed_time = end_time - start_time
elapsed_time_minutes = elapsed_time / 60

# Convert the list to pandas DataFrame
df = pd.DataFrame(data_list)

print(f"Data has been read and stored in a DataFrame.")
print(f"Total number of records: {len(df)}")
print(f"Elapsed time: {elapsed_time:.2f} seconds ({elapsed_time_minutes:.2f} minutes)")



Reading: 0.00B [30:54, ?B/s][A

Reading: 10.5kB [00:02, 4.28kB/s][A
Reading: 21.1kB [00:05, 4.00kB/s][A
Reading: 31.6kB [00:07, 4.01kB/s][A
Reading: 42.4kB [00:10, 3.98kB/s][A
Reading: 52.8kB [00:13, 4.02kB/s][A
Reading: 63.6kB [00:15, 3.99kB/s][A
Reading: 74.5kB [00:18, 4.18kB/s][A
Reading: 85.4kB [00:21, 4.05kB/s][A
Reading: 96.6kB [00:23, 4.30kB/s][A
Reading: 108kB [00:25, 4.35kB/s] [A
Reading: 119kB [00:28, 4.12kB/s][A
Reading: 130kB [00:31, 4.21kB/s][A
Reading: 141kB [00:33, 4.26kB/s][A
Reading: 152kB [00:36, 4.00kB/s][A
Reading: 163kB [00:39, 4.20kB/s][A
Reading: 174kB [00:41, 4.33kB/s][A
Reading: 185kB [00:44, 4.42kB/s][A
Reading: 196kB [00:46, 4.54kB/s][A
Reading: 207kB [00:50, 3.92kB/s][A
Reading: 218kB [00:52, 4.08kB/s][A
Reading: 229kB [00:54, 4.26kB/s][A
Reading: 240kB [00:57, 4.42kB/s][A
Reading: 251kB [00:59, 4.57kB/s][A
Reading: 262kB [01:03, 3.84kB/s][A
Reading: 273kB [01:05, 4.02kB/s][A
Reading: 284kB [01:08, 4.16kB/s][A
Reading: 295kB [01:1

Data has been read and stored in a DataFrame.
Total number of records: 2939250
Elapsed time: 1692.67 seconds (28.21 minutes)


In [14]:
df

Unnamed: 0,category,tech1,description,fit,title,also_buy,image,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin
0,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea...",,[<b>pant size(Unit:inch)</b><br> W30(tag30) Wa...,"class=""a-normal a-align-center a-spacing-smal...",QIBOE Men's Baggy Jeans Denim Sweatpants Loose...,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...",[https://images-na.ssl-images-amazon.com/image...,,QIBOE,"[Denim, Zipper closure, Material: cotton, Styl...","1,506,383 in Clothing, Shoes & Jewelry (","[B07H2Z6S9J, B077GQQKRV, B072XTTTK9, B002DMJOC...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$38.99,6342509379
1,"[Clothing, Shoes & Jewelry, Women, Accessories...",,[Feature <br> -Great quality winter scarf. <br...,,Crazy Women's Voile Crinkle Scarf Shawl,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...",[https://images-na.ssl-images-amazon.com/image...,,Crazy,"[Material:voile, Size: 180cm x 95cm, A scarf t...","273,519 in Clothing, Shoes & Jewelry (","[B017M5BVXA, B00NSF70KM, B01LYRMI0Q, B017M55DI...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$0.50,6342502315
2,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",,[Material : Core-spun fabric silk <br> feature...,"class=""a-normal a-align-center a-spacing-smal...",FQQ Women Sexy Lingerie Lace Dress Sheer Babyd...,[B00VBVXVPI],[https://images-na.ssl-images-amazon.com/image...,,FQQ,"[100% Polyester, We use comfortable tissue to ...","3,266,227 in Clothing, Shoes & Jewelry (",[],{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$2.80,6342522545
3,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",,[Material : Core-spun fabric silk <br> feature...,"class=""a-normal a-align-center a-spacing-smal...",Crazy Women's Sexy Leather Backless Bodycon Cl...,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...",[https://images-na.ssl-images-amazon.com/image...,,Crazy,"[100% Polyester, We use comfortable tissue to ...","641,576 in Clothing, Shoes & Jewelry (","[B074XSR8LC, B01MZITI8H, B07JMZDG8C, B01LY4VKT...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$8.50,6342522898
4,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",,[Material : Core-spun fabric silk <br> feature...,,FQQ Women's Sexy Lingerie Babydoll Dress Sleep...,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...",[https://images-na.ssl-images-amazon.com/image...,,FQQ,"[100% Polyester, We use comfortable tissue to ...","1,761,440 in Clothing, Shoes & Jewelry (","[B00UHFS00K, B01EKRMG8C, B01AHZSZ9A, B06XY5N95...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$4.50,6342523002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939245,"[Tools & Home Improvement, Hardware, Door Hard...","class=""a-keyvalue prodDetTable"" role=""present...","[Whether for home, farm, builder or industrial...",,National Hardware N195-784 3211BC Swivel Singl...,"[B000BD8N0A, B000FPDGA6, B000BD6DNO, B000BD5HK...",[],,National Hardware,"[Steel body and axel, Zinc die-cast sheave, Li...","[>#327,106 in Tools & Home Improvement (See to...",[],{},Tools & Home Improvement,,"February 17, 2007",,B000BD8N0K
2939246,"[Tools & Home Improvement, Hardware, Furniture...","class=""a-keyvalue prodDetTable"" role=""present...","[Magic Sliders 08200 Screw-On Floor Slide 3/4""...",,Magic Sliders 08200 Screw-On Floor Slide 3/4&q...,"[B07BYC5NTG, B000KL0C2K, B000LNPER0, B000BD8LO...",[],,Magic Sliders,"[3/4"" round.Slide everything as if it had whee...","[>#32,155 in Tools & Home Improvement (See top...",[],{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","February 17, 2007",$7.29,B000BD8LU2
2939247,"[Tools & Home Improvement, Hardware, Cabinet H...","class=""a-keyvalue prodDetTable"" role=""present...",[This screen door pull is constructed from bla...,,Slide-Co 121087 Mortise Type Screen Door Latch...,"[B000BPFWC0, B00DUQAA88, B000I1RR5C, B00DUQAAV...",[https://images-na.ssl-images-amazon.com/image...,,Slide-Co,"[Sliding Screen Door Latch & Pull, Mortise Ins...","[>#234,641 in Tools & Home Improvement (See to...",[],{},Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","June 15, 2006",$5.25,B000BD8L16
2939248,"[Tools & Home Improvement, Hardware, Window Ha...","class=""a-keyvalue prodDetTable"" role=""present...",[This slide bolt is used to replace damaged or...,,20PK Nyl RH Slide Bolt,"[B000BD8O2M, B000BD8NJQ, B010NB7FXK, B0044UP7Y...",[],,Slide-Co,[],"[>#177,817 in Tools & Home Improvement (See to...",[],{},Tools & Home Improvement,,"August 17, 2005",.a-box-inner{background-color:#fff}#alohaBuyBo...,B000BD8O22


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2939250 entries, 0 to 2939249
Data columns (total 18 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   category      object
 1   tech1         object
 2   description   object
 3   fit           object
 4   title         object
 5   also_buy      object
 6   image         object
 7   tech2         object
 8   brand         object
 9   feature       object
 10  rank          object
 11  also_view     object
 12  details       object
 13  main_cat      object
 14  similar_item  object
 15  date          object
 16  price         object
 17  asin          object
dtypes: object(18)
memory usage: 403.6+ MB


In [16]:
# Print the complete first row of the DataFrame as a JSON object
print(df.iloc[0])


category        [Clothing, Shoes & Jewelry, Men, Clothing, Jea...
tech1                                                            
description     [<b>pant size(Unit:inch)</b><br> W30(tag30) Wa...
fit              class="a-normal a-align-center a-spacing-smal...
title           QIBOE Men's Baggy Jeans Denim Sweatpants Loose...
also_buy        [B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...
image           [https://images-na.ssl-images-amazon.com/image...
tech2                                                            
brand                                                       QIBOE
feature         [Denim, Zipper closure, Material: cotton, Styl...
rank                     1,506,383 in Clothing, Shoes & Jewelry (
also_view       [B07H2Z6S9J, B077GQQKRV, B072XTTTK9, B002DMJOC...
details                                                        {}
main_cat        <img src="https://images-na.ssl-images-amazon....
similar_item                                                     
date      

### Remove Unnecessary Columns

In [17]:
# Columns to keep
columns_to_keep = ['asin', 'title' , 'feature' , 'category' , 'also_buy' , 'also_view']

# Drop the columns not in the list
columns_to_remove = [col for col in df.columns if col not in columns_to_keep]

# Drop the specified columns
df.drop(columns=columns_to_remove, inplace=True)


In [18]:
df

Unnamed: 0,category,title,also_buy,feature,also_view,asin
0,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea...",QIBOE Men's Baggy Jeans Denim Sweatpants Loose...,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...","[Denim, Zipper closure, Material: cotton, Styl...","[B07H2Z6S9J, B077GQQKRV, B072XTTTK9, B002DMJOC...",6342509379
1,"[Clothing, Shoes & Jewelry, Women, Accessories...",Crazy Women's Voile Crinkle Scarf Shawl,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...","[Material:voile, Size: 180cm x 95cm, A scarf t...","[B017M5BVXA, B00NSF70KM, B01LYRMI0Q, B017M55DI...",6342502315
2,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",FQQ Women Sexy Lingerie Lace Dress Sheer Babyd...,[B00VBVXVPI],"[100% Polyester, We use comfortable tissue to ...",[],6342522545
3,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",Crazy Women's Sexy Leather Backless Bodycon Cl...,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...","[100% Polyester, We use comfortable tissue to ...","[B074XSR8LC, B01MZITI8H, B07JMZDG8C, B01LY4VKT...",6342522898
4,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",FQQ Women's Sexy Lingerie Babydoll Dress Sleep...,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...","[100% Polyester, We use comfortable tissue to ...","[B00UHFS00K, B01EKRMG8C, B01AHZSZ9A, B06XY5N95...",6342523002
...,...,...,...,...,...,...
2939245,"[Tools & Home Improvement, Hardware, Door Hard...",National Hardware N195-784 3211BC Swivel Singl...,"[B000BD8N0A, B000FPDGA6, B000BD6DNO, B000BD5HK...","[Steel body and axel, Zinc die-cast sheave, Li...",[],B000BD8N0K
2939246,"[Tools & Home Improvement, Hardware, Furniture...",Magic Sliders 08200 Screw-On Floor Slide 3/4&q...,"[B07BYC5NTG, B000KL0C2K, B000LNPER0, B000BD8LO...","[3/4"" round.Slide everything as if it had whee...",[],B000BD8LU2
2939247,"[Tools & Home Improvement, Hardware, Cabinet H...",Slide-Co 121087 Mortise Type Screen Door Latch...,"[B000BPFWC0, B00DUQAA88, B000I1RR5C, B00DUQAAV...","[Sliding Screen Door Latch & Pull, Mortise Ins...",[],B000BD8L16
2939248,"[Tools & Home Improvement, Hardware, Window Ha...",20PK Nyl RH Slide Bolt,"[B000BD8O2M, B000BD8NJQ, B010NB7FXK, B0044UP7Y...",[],[],B000BD8O22


### Check  Null Values 

In [19]:
# Check the data type of each column
data_types = df.dtypes

# Display the data type of each column
print("Data type of each column:")
print(data_types)


Data type of each column:
category     object
title        object
also_buy     object
feature      object
also_view    object
asin         object
dtype: object


In [20]:
# Check for empty objects in each column
empty_object_counts = (df.applymap(lambda x: len(x) if isinstance(x, (str, list)) else 0) == 0).sum()

# Display the count of empty objects in each column
print("Number of empty objects in each column:")
print(empty_object_counts)


Number of empty objects in each column:
category      123563
title           1239
also_buy           0
feature      1810618
also_view     924398
asin               0
dtype: int64


In [21]:
# Identify rows where any of the specified columns have empty objects
mask = df.apply(lambda row: any(len(x) == 0 if isinstance(x, list) else x == '' for x in row), axis=1)

# Remove the identified rows
df_cleaned = df[~mask]

In [22]:
# To clean up space in memory
del df

In [23]:
df_cleaned

Unnamed: 0,category,title,also_buy,feature,also_view,asin
0,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea...",QIBOE Men's Baggy Jeans Denim Sweatpants Loose...,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...","[Denim, Zipper closure, Material: cotton, Styl...","[B07H2Z6S9J, B077GQQKRV, B072XTTTK9, B002DMJOC...",6342509379
1,"[Clothing, Shoes & Jewelry, Women, Accessories...",Crazy Women's Voile Crinkle Scarf Shawl,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...","[Material:voile, Size: 180cm x 95cm, A scarf t...","[B017M5BVXA, B00NSF70KM, B01LYRMI0Q, B017M55DI...",6342502315
3,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",Crazy Women's Sexy Leather Backless Bodycon Cl...,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...","[100% Polyester, We use comfortable tissue to ...","[B074XSR8LC, B01MZITI8H, B07JMZDG8C, B01LY4VKT...",6342522898
4,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",FQQ Women's Sexy Lingerie Babydoll Dress Sleep...,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...","[100% Polyester, We use comfortable tissue to ...","[B00UHFS00K, B01EKRMG8C, B01AHZSZ9A, B06XY5N95...",6342523002
5,"[Clothing, Shoes & Jewelry, Men, Clothing, Pan...",Congs Men's Winter Fleece Lined Military Cargo...,"[B077M99VNM, B07HVRPXYH, B07CP7DLL5, B07FDF7NZ...","[100% Cotton, Zipper closure, Material: cotton...","[B077M99VNM, B00XKXI3WG, B07CP7DLL5, B07FDF7NZ...",6342503508
...,...,...,...,...,...,...
2939144,"[Tools & Home Improvement, Hardware, Door Hard...",Barn Door or Gate Latch - Stable Supplies,[B002HIZ75Q],"[Heavy Duty Steel, 10 Inch Latch Complete with...","[B0009SS140, B002HIZ75Q, B000SKV4IY, B008OB0YL...",B000B9SGXS
2939163,"[Tools & Home Improvement, Safety & Security, ...",MSA 495855 V-Gard Polyethylene Protective Cap ...,"[B00LCD3YPC, B01BLXRKEI, B00864W3CK, B007NXQJ7...",[Polyethylene shell provides superior impact p...,"[B000RMF7UE, B011VW9G5K, B009NRZ4PM, B078B9QD5...",B000BC5MQ4
2939184,"[Tools & Home Improvement, Hardware, Door Hard...",Schlage FB50N V GEO 619 B60 Single Cylinder De...,"[B0002YQZ5S, B0002YR17E, B0002YR188, B00H7HSSJ...","[The product is NI Key Entry Deadbolt, Easy an...","[B0002YR17E, B0030ZWEME, B0002YQZ6M, B0042U9N2...",B000BD5E52
2939190,"[Tools & Home Improvement, Hardware, Door Hard...",Slide-Co 13177-S Sliding Glass Door Roller Ass...,[B01AOHYC3O],"[Sliding glass door roller assembly, 1-1/4 inc...","[B000BD6CD0, B006P1LIFE, B002DVN9FC, B000I19BM...",B000BD5G1O


In [24]:
# Check for empty objects in each column
empty_object_counts = (df_cleaned.applymap(lambda x: len(x) if isinstance(x, (str, list)) else 0) == 0).sum()

# Display the count of empty objects in each column
print("Number of empty objects in each column:")
print(empty_object_counts)


Number of empty objects in each column:
category     0
title        0
also_buy     0
feature      0
also_view    0
asin         0
dtype: int64


# Clean Data

In [25]:
def clean_text(text):
    # Convert arrays or lists to strings
    if isinstance(text, (list, np.ndarray)):
        text = ' '.join(map(str, text))
    # Ensure text is a string
    if not isinstance(text, str):
        text = str(text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    text = text.lower()
    
    return text

In [26]:
df_cleaned['cleaned_category'] = df_cleaned['category'].apply(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['cleaned_category'] = df_cleaned['category'].apply(clean_text)


In [27]:
df_cleaned['cleaned_title'] = df_cleaned['title'].apply(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['cleaned_title'] = df_cleaned['title'].apply(clean_text)


In [28]:
df_cleaned['cleaned_feature'] = df_cleaned['feature'].apply(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['cleaned_feature	'] = df_cleaned['feature'].apply(clean_text)


In [29]:
df_cleaned

Unnamed: 0,category,title,also_buy,feature,also_view,asin,cleaned_category,cleaned_title,cleaned_feature\t
0,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea...",QIBOE Men's Baggy Jeans Denim Sweatpants Loose...,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...","[Denim, Zipper closure, Material: cotton, Styl...","[B07H2Z6S9J, B077GQQKRV, B072XTTTK9, B002DMJOC...",6342509379,clothing shoes jewelry men clothing jeans den...,qiboe mens baggy jeans denim sweatpants loose ...,denim zipper closure material cotton style hip...
1,"[Clothing, Shoes & Jewelry, Women, Accessories...",Crazy Women's Voile Crinkle Scarf Shawl,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...","[Material:voile, Size: 180cm x 95cm, A scarf t...","[B017M5BVXA, B00NSF70KM, B01LYRMI0Q, B017M55DI...",6342502315,clothing shoes jewelry women accessories scar...,crazy womens voile crinkle scarf shawl,materialvoile size cm x cm a scarf that looks ...
3,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",Crazy Women's Sexy Leather Backless Bodycon Cl...,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...","[100% Polyester, We use comfortable tissue to ...","[B074XSR8LC, B01MZITI8H, B07JMZDG8C, B01LY4VKT...",6342522898,clothing shoes jewelry women clothing dresses...,crazy womens sexy leather backless bodycon clu...,polyester we use comfortable tissue to make t...
4,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",FQQ Women's Sexy Lingerie Babydoll Dress Sleep...,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...","[100% Polyester, We use comfortable tissue to ...","[B00UHFS00K, B01EKRMG8C, B01AHZSZ9A, B06XY5N95...",6342523002,clothing shoes jewelry women clothing lingeri...,fqq womens sexy lingerie babydoll dress sleepw...,polyester we use comfortable tissue to make t...
5,"[Clothing, Shoes & Jewelry, Men, Clothing, Pan...",Congs Men's Winter Fleece Lined Military Cargo...,"[B077M99VNM, B07HVRPXYH, B07CP7DLL5, B07FDF7NZ...","[100% Cotton, Zipper closure, Material: cotton...","[B077M99VNM, B00XKXI3WG, B07CP7DLL5, B07FDF7NZ...",6342503508,clothing shoes jewelry men clothing pants cas...,congs mens winter fleece lined military cargo ...,cotton zipper closure material cotton multi p...
...,...,...,...,...,...,...,...,...,...
2939144,"[Tools & Home Improvement, Hardware, Door Hard...",Barn Door or Gate Latch - Stable Supplies,[B002HIZ75Q],"[Heavy Duty Steel, 10 Inch Latch Complete with...","[B0009SS140, B002HIZ75Q, B000SKV4IY, B008OB0YL...",B000B9SGXS,tools home improvement hardware door hardware...,barn door or gate latch stable supplies,heavy duty steel inch latch complete with all...
2939163,"[Tools & Home Improvement, Safety & Security, ...",MSA 495855 V-Gard Polyethylene Protective Cap ...,"[B00LCD3YPC, B01BLXRKEI, B00864W3CK, B007NXQJ7...",[Polyethylene shell provides superior impact p...,"[B000RMF7UE, B011VW9G5K, B009NRZ4PM, B078B9QD5...",B000BC5MQ4,tools home improvement safety security perso...,msa vgard polyethylene protective cap with fa...,polyethylene shell provides superior impact pr...
2939184,"[Tools & Home Improvement, Hardware, Door Hard...",Schlage FB50N V GEO 619 B60 Single Cylinder De...,"[B0002YQZ5S, B0002YR17E, B0002YR188, B00H7HSSJ...","[The product is NI Key Entry Deadbolt, Easy an...","[B0002YR17E, B0030ZWEME, B0002YQZ6M, B0042U9N2...",B000BD5E52,tools home improvement hardware door hardware...,schlage fbn v geo b single cylinder deadbolt ...,the product is ni key entry deadbolt easy and ...
2939190,"[Tools & Home Improvement, Hardware, Door Hard...",Slide-Co 13177-S Sliding Glass Door Roller Ass...,[B01AOHYC3O],"[Sliding glass door roller assembly, 1-1/4 inc...","[B000BD6CD0, B006P1LIFE, B002DVN9FC, B000I19BM...",B000BD5G1O,tools home improvement hardware door hardware...,slideco s sliding glass door roller assembly inch,sliding glass door roller assembly inch steel...


### Remove Used Colmuns (category , description , title, feature) 

In [30]:
# Columns to remove
columns_to_remove = ['category','title','feature']

# Drop the specified columns
df_cleaned.drop(columns=columns_to_remove, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop(columns=columns_to_remove, inplace=True)


In [34]:
df_cleaned

Unnamed: 0,also_buy,also_view,asin,cleaned_category,cleaned_title,cleaned_feature
0,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...","[B07H2Z6S9J, B077GQQKRV, B072XTTTK9, B002DMJOC...",6342509379,clothing shoes jewelry men clothing jeans den...,qiboe mens baggy jeans denim sweatpants loose ...,denim zipper closure material cotton style hip...
1,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...","[B017M5BVXA, B00NSF70KM, B01LYRMI0Q, B017M55DI...",6342502315,clothing shoes jewelry women accessories scar...,crazy womens voile crinkle scarf shawl,materialvoile size cm x cm a scarf that looks ...
3,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...","[B074XSR8LC, B01MZITI8H, B07JMZDG8C, B01LY4VKT...",6342522898,clothing shoes jewelry women clothing dresses...,crazy womens sexy leather backless bodycon clu...,polyester we use comfortable tissue to make t...
4,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...","[B00UHFS00K, B01EKRMG8C, B01AHZSZ9A, B06XY5N95...",6342523002,clothing shoes jewelry women clothing lingeri...,fqq womens sexy lingerie babydoll dress sleepw...,polyester we use comfortable tissue to make t...
5,"[B077M99VNM, B07HVRPXYH, B07CP7DLL5, B07FDF7NZ...","[B077M99VNM, B00XKXI3WG, B07CP7DLL5, B07FDF7NZ...",6342503508,clothing shoes jewelry men clothing pants cas...,congs mens winter fleece lined military cargo ...,cotton zipper closure material cotton multi p...
...,...,...,...,...,...,...
2939144,[B002HIZ75Q],"[B0009SS140, B002HIZ75Q, B000SKV4IY, B008OB0YL...",B000B9SGXS,tools home improvement hardware door hardware...,barn door or gate latch stable supplies,heavy duty steel inch latch complete with all...
2939163,"[B00LCD3YPC, B01BLXRKEI, B00864W3CK, B007NXQJ7...","[B000RMF7UE, B011VW9G5K, B009NRZ4PM, B078B9QD5...",B000BC5MQ4,tools home improvement safety security perso...,msa vgard polyethylene protective cap with fa...,polyethylene shell provides superior impact pr...
2939184,"[B0002YQZ5S, B0002YR17E, B0002YR188, B00H7HSSJ...","[B0002YR17E, B0030ZWEME, B0002YQZ6M, B0042U9N2...",B000BD5E52,tools home improvement hardware door hardware...,schlage fbn v geo b single cylinder deadbolt ...,the product is ni key entry deadbolt easy and ...
2939190,[B01AOHYC3O],"[B000BD6CD0, B006P1LIFE, B002DVN9FC, B000I19BM...",B000BD5G1O,tools home improvement hardware door hardware...,slideco s sliding glass door roller assembly inch,sliding glass door roller assembly inch steel...


In [35]:
# Reset the index
df_cleaned.reset_index(drop=True, inplace=True)

In [36]:
df_cleaned

Unnamed: 0,also_buy,also_view,asin,cleaned_category,cleaned_title,cleaned_feature
0,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5...","[B07H2Z6S9J, B077GQQKRV, B072XTTTK9, B002DMJOC...",6342509379,clothing shoes jewelry men clothing jeans den...,qiboe mens baggy jeans denim sweatpants loose ...,denim zipper closure material cotton style hip...
1,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW...","[B017M5BVXA, B00NSF70KM, B01LYRMI0Q, B017M55DI...",6342502315,clothing shoes jewelry women accessories scar...,crazy womens voile crinkle scarf shawl,materialvoile size cm x cm a scarf that looks ...
2,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT...","[B074XSR8LC, B01MZITI8H, B07JMZDG8C, B01LY4VKT...",6342522898,clothing shoes jewelry women clothing dresses...,crazy womens sexy leather backless bodycon clu...,polyester we use comfortable tissue to make t...
3,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY...","[B00UHFS00K, B01EKRMG8C, B01AHZSZ9A, B06XY5N95...",6342523002,clothing shoes jewelry women clothing lingeri...,fqq womens sexy lingerie babydoll dress sleepw...,polyester we use comfortable tissue to make t...
4,"[B077M99VNM, B07HVRPXYH, B07CP7DLL5, B07FDF7NZ...","[B077M99VNM, B00XKXI3WG, B07CP7DLL5, B07FDF7NZ...",6342503508,clothing shoes jewelry men clothing pants cas...,congs mens winter fleece lined military cargo ...,cotton zipper closure material cotton multi p...
...,...,...,...,...,...,...
706492,[B002HIZ75Q],"[B0009SS140, B002HIZ75Q, B000SKV4IY, B008OB0YL...",B000B9SGXS,tools home improvement hardware door hardware...,barn door or gate latch stable supplies,heavy duty steel inch latch complete with all...
706493,"[B00LCD3YPC, B01BLXRKEI, B00864W3CK, B007NXQJ7...","[B000RMF7UE, B011VW9G5K, B009NRZ4PM, B078B9QD5...",B000BC5MQ4,tools home improvement safety security perso...,msa vgard polyethylene protective cap with fa...,polyethylene shell provides superior impact pr...
706494,"[B0002YQZ5S, B0002YR17E, B0002YR188, B00H7HSSJ...","[B0002YR17E, B0030ZWEME, B0002YQZ6M, B0042U9N2...",B000BD5E52,tools home improvement hardware door hardware...,schlage fbn v geo b single cylinder deadbolt ...,the product is ni key entry deadbolt easy and ...
706495,[B01AOHYC3O],"[B000BD6CD0, B006P1LIFE, B002DVN9FC, B000I19BM...",B000BD5G1O,tools home improvement hardware door hardware...,slideco s sliding glass door roller assembly inch,sliding glass door roller assembly inch steel...


In [37]:
# Define the output file path
output_path = r"D:\Study Material\semester 6\Big Data\Assignments\Ass3\f_processed_data.json"

# Save the DataFrame to JSON file
df_cleaned.to_json(output_path, orient='records', lines=True)

print(f"DataFrame saved to {output_path}")


DataFrame saved to D:\Study Material\semester 6\Big Data\Assignments\Ass3\f_processed_data.json
