In [1]:
# This code imports the warnings module and suppresses the display of warnings generated during script execution
import warnings
warnings.simplefilter(action='ignore', category=Warning)

import pandas as pd

# Load the dataset from a data.world URL into a pandas DataFrame
amazon_ecommerce_sample_db = pd.read_csv('https://query.data.world/s/kpd37xmqr3xrxjaodwjbo5deefywvo?dws=00000')

# Listing the columns in the DataFrame
list(amazon_ecommerce_sample_db.columns)

['uniq_id',
 'product_name',
 'manufacturer',
 'price',
 'number_available_in_stock',
 'number_of_reviews',
 'number_of_answered_questions',
 'average_review_rating',
 'amazon_category_and_sub_category',
 'customers_who_bought_this_item_also_bought',
 'description',
 'product_information',
 'product_description',
 'items_customers_buy_after_viewing_this_item',
 'customer_questions_and_answers',
 'customer_reviews',
 'sellers']

In [2]:
# Dropping unnecessary columns from the DataFrame
amazon_ecommerce_sample_db.drop([
  'uniq_id',
  'number_available_in_stock',
  'number_of_reviews',
  'number_of_answered_questions',
  'average_review_rating',
  'customers_who_bought_this_item_also_bought',
  'description',
  'product_information',
  'product_description',
  'items_customers_buy_after_viewing_this_item',
  'customer_questions_and_answers'
], axis=1, inplace=True)

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,amazon_category_and_sub_category,customer_reviews,sellers
0,Hornby 2014 Catalogue,Hornby,£3.42,Hobbies > Model Trains & Railway Sets > Rail V...,Worth Buying For The Pictures Alone (As Ever) ...,"{""seller""=>[{""Seller_name_1""=>""Amazon.co.uk"", ..."
1,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,£16.99,Hobbies > Model Trains & Railway Sets > Rail V...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,"{""seller""=>{""Seller_name_1""=>""UHD WHOLESALE"", ..."
2,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,£9.99,Hobbies > Model Trains & Railway Sets > Rail V...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,"{""seller""=>[{""Seller_name_1""=>""DEAL-BOX"", ""Sel..."
3,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hornby,£39.99,Hobbies > Model Trains & Railway Sets > Rail V...,I love it // 5.0 // 22 July 2013 // By\n \n...,
4,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,£32.19,Hobbies > Model Trains & Railway Sets > Rail V...,Birthday present // 5.0 // 14 April 2014 // By...,
...,...,...,...,...,...,...
9995,Batman 1966 TV Series Action Figures - The Rid...,Mattel,£22.95,Hobbies > Collectible Figures & Memorabilia > ...,Realistic // 5.0 // 31 Mar. 2014 // By\n \n...,"{""seller""=>[{""Seller_name_1""=>""Star Action Fig..."
9996,"Star Wars Costume, Kids Stormtrooper Costume S...",Star Wars,£39.99,Characters & Brands > Star Wars > Toys,... what I see my grandson us going to have fu...,
9997,Defiance Lawkeeper Metal Badge Prop Replica,Olde Scotland Yard Ltd.,£43.99,Novelty & Special Use > Novelty > Accessories ...,Five Stars // 5.0 // 18 Dec. 2015 // By\n \...,"{""seller""=>[{""Seller_name_1""=>""YUK"", ""Seller_p..."
9998,Justice League of America Series 3 Green Lante...,DC Comics,£49.81,Hobbies > Collectible Figures & Memorabilia > ...,The best sculpt in a while // 5.0 // 13 May 20...,"{""seller""=>[{""Seller_name_1""=>""Smaller World F..."


In [3]:
# Checking for missing values in each column
for column_name in amazon_ecommerce_sample_db.columns.tolist():
  print(amazon_ecommerce_sample_db[column_name].isnull().any())

False
True
True
True
True
True


In [4]:
# Counting null values in specific columns
[
    amazon_ecommerce_sample_db['manufacturer'].isnull().sum(),
    amazon_ecommerce_sample_db['price'].isnull().sum(),
    amazon_ecommerce_sample_db['amazon_category_and_sub_category'].isnull().sum(),
    amazon_ecommerce_sample_db['customer_reviews'].isnull().sum(),
    amazon_ecommerce_sample_db['sellers'].isnull().sum()
]

[7, 1435, 690, 21, 3082]

In [5]:
# Dropping rows with missing values in critical columns
amazon_ecommerce_sample_db.dropna(subset=['manufacturer'], inplace=True)
amazon_ecommerce_sample_db.dropna(subset=['amazon_category_and_sub_category'], inplace=True)
amazon_ecommerce_sample_db.dropna(subset=['customer_reviews'], inplace=True)

# Converting the 'sellers' column to binary format, 1 if data is present and 0 for NaN
amazon_ecommerce_sample_db['sellers'] = amazon_ecommerce_sample_db['sellers'].apply(lambda x: 0 if pd.isna(x) else 1)

In [6]:
# Re-checking for null values after cleanup
[
    amazon_ecommerce_sample_db['manufacturer'].isnull().sum(),
    amazon_ecommerce_sample_db['price'].isnull().sum(),
    amazon_ecommerce_sample_db['amazon_category_and_sub_category'].isnull().sum(),
    amazon_ecommerce_sample_db['customer_reviews'].isnull().sum(),
    amazon_ecommerce_sample_db['sellers'].isnull().sum()
]

[0, 1293, 0, 0, 0]

In [7]:
amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,amazon_category_and_sub_category,customer_reviews,sellers
0,Hornby 2014 Catalogue,Hornby,£3.42,Hobbies > Model Trains & Railway Sets > Rail V...,Worth Buying For The Pictures Alone (As Ever) ...,1
1,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,£16.99,Hobbies > Model Trains & Railway Sets > Rail V...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,1
2,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,£9.99,Hobbies > Model Trains & Railway Sets > Rail V...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,1
3,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hornby,£39.99,Hobbies > Model Trains & Railway Sets > Rail V...,I love it // 5.0 // 22 July 2013 // By\n \n...,0
4,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,£32.19,Hobbies > Model Trains & Railway Sets > Rail V...,Birthday present // 5.0 // 14 April 2014 // By...,0
...,...,...,...,...,...,...
9995,Batman 1966 TV Series Action Figures - The Rid...,Mattel,£22.95,Hobbies > Collectible Figures & Memorabilia > ...,Realistic // 5.0 // 31 Mar. 2014 // By\n \n...,1
9996,"Star Wars Costume, Kids Stormtrooper Costume S...",Star Wars,£39.99,Characters & Brands > Star Wars > Toys,... what I see my grandson us going to have fu...,0
9997,Defiance Lawkeeper Metal Badge Prop Replica,Olde Scotland Yard Ltd.,£43.99,Novelty & Special Use > Novelty > Accessories ...,Five Stars // 5.0 // 18 Dec. 2015 // By\n \...,1
9998,Justice League of America Series 3 Green Lante...,DC Comics,£49.81,Hobbies > Collectible Figures & Memorabilia > ...,The best sculpt in a while // 5.0 // 13 May 20...,1


In [8]:
# Cleans and standardizes the 'product_name' column in the amazon_ecommerce_sample_db DataFrame by trimming whitespace and converting text to lowercase, applying these transformations only to non-null entries
amazon_ecommerce_sample_db['product_name'] = amazon_ecommerce_sample_db['product_name'].apply(lambda x: x.strip().lower() if x is not None else x)

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,amazon_category_and_sub_category,customer_reviews,sellers
0,hornby 2014 catalogue,Hornby,£3.42,Hobbies > Model Trains & Railway Sets > Rail V...,Worth Buying For The Pictures Alone (As Ever) ...,1
1,funkybuys® large christmas holiday express fes...,FunkyBuys,£16.99,Hobbies > Model Trains & Railway Sets > Rail V...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,1
2,classic toy train set track carriages light en...,ccf,£9.99,Hobbies > Model Trains & Railway Sets > Rail V...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,1
3,hornby coach r4410a br hawksworth corridor 3rd,Hornby,£39.99,Hobbies > Model Trains & Railway Sets > Rail V...,I love it // 5.0 // 22 July 2013 // By\n \n...,0
4,hornby 00 gauge 0-4-0 gildenlow salt co. steam...,Hornby,£32.19,Hobbies > Model Trains & Railway Sets > Rail V...,Birthday present // 5.0 // 14 April 2014 // By...,0
...,...,...,...,...,...,...
9995,batman 1966 tv series action figures - the rid...,Mattel,£22.95,Hobbies > Collectible Figures & Memorabilia > ...,Realistic // 5.0 // 31 Mar. 2014 // By\n \n...,1
9996,"star wars costume, kids stormtrooper costume s...",Star Wars,£39.99,Characters & Brands > Star Wars > Toys,... what I see my grandson us going to have fu...,0
9997,defiance lawkeeper metal badge prop replica,Olde Scotland Yard Ltd.,£43.99,Novelty & Special Use > Novelty > Accessories ...,Five Stars // 5.0 // 18 Dec. 2015 // By\n \...,1
9998,justice league of america series 3 green lante...,DC Comics,£49.81,Hobbies > Collectible Figures & Memorabilia > ...,The best sculpt in a while // 5.0 // 13 May 20...,1


In [9]:
# Cleans and standardizes the 'manufacturer' column of the amazon_ecommerce_sample_db DataFrame by applying two transformations: removing leading and trailing spaces and converting text to lowercase
amazon_ecommerce_sample_db['manufacturer'] = amazon_ecommerce_sample_db['manufacturer'].apply(lambda x: x.strip().lower())

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,amazon_category_and_sub_category,customer_reviews,sellers
0,hornby 2014 catalogue,hornby,£3.42,Hobbies > Model Trains & Railway Sets > Rail V...,Worth Buying For The Pictures Alone (As Ever) ...,1
1,funkybuys® large christmas holiday express fes...,funkybuys,£16.99,Hobbies > Model Trains & Railway Sets > Rail V...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,1
2,classic toy train set track carriages light en...,ccf,£9.99,Hobbies > Model Trains & Railway Sets > Rail V...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,1
3,hornby coach r4410a br hawksworth corridor 3rd,hornby,£39.99,Hobbies > Model Trains & Railway Sets > Rail V...,I love it // 5.0 // 22 July 2013 // By\n \n...,0
4,hornby 00 gauge 0-4-0 gildenlow salt co. steam...,hornby,£32.19,Hobbies > Model Trains & Railway Sets > Rail V...,Birthday present // 5.0 // 14 April 2014 // By...,0
...,...,...,...,...,...,...
9995,batman 1966 tv series action figures - the rid...,mattel,£22.95,Hobbies > Collectible Figures & Memorabilia > ...,Realistic // 5.0 // 31 Mar. 2014 // By\n \n...,1
9996,"star wars costume, kids stormtrooper costume s...",star wars,£39.99,Characters & Brands > Star Wars > Toys,... what I see my grandson us going to have fu...,0
9997,defiance lawkeeper metal badge prop replica,olde scotland yard ltd.,£43.99,Novelty & Special Use > Novelty > Accessories ...,Five Stars // 5.0 // 18 Dec. 2015 // By\n \...,1
9998,justice league of america series 3 green lante...,dc comics,£49.81,Hobbies > Collectible Figures & Memorabilia > ...,The best sculpt in a while // 5.0 // 13 May 20...,1


In [10]:
# The first line converts the 'price' column to string type to ensure that the replace operation can be applied universally (including on non-string data types). It then removes the British Pound symbol ('£') from the price values
amazon_ecommerce_sample_db['price'] = amazon_ecommerce_sample_db['price'].astype(str).str.replace('£', '')

# This line attempts to convert the cleaned 'price' column values into numeric data type
amazon_ecommerce_sample_db['price'] = pd.to_numeric(amazon_ecommerce_sample_db['price'], errors='coerce')

# The final line explicitly converts the 'price' column to float data type
amazon_ecommerce_sample_db['price'] = amazon_ecommerce_sample_db['price'].astype(float)

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,amazon_category_and_sub_category,customer_reviews,sellers
0,hornby 2014 catalogue,hornby,3.42,Hobbies > Model Trains & Railway Sets > Rail V...,Worth Buying For The Pictures Alone (As Ever) ...,1
1,funkybuys® large christmas holiday express fes...,funkybuys,16.99,Hobbies > Model Trains & Railway Sets > Rail V...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,1
2,classic toy train set track carriages light en...,ccf,9.99,Hobbies > Model Trains & Railway Sets > Rail V...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,1
3,hornby coach r4410a br hawksworth corridor 3rd,hornby,39.99,Hobbies > Model Trains & Railway Sets > Rail V...,I love it // 5.0 // 22 July 2013 // By\n \n...,0
4,hornby 00 gauge 0-4-0 gildenlow salt co. steam...,hornby,32.19,Hobbies > Model Trains & Railway Sets > Rail V...,Birthday present // 5.0 // 14 April 2014 // By...,0
...,...,...,...,...,...,...
9995,batman 1966 tv series action figures - the rid...,mattel,22.95,Hobbies > Collectible Figures & Memorabilia > ...,Realistic // 5.0 // 31 Mar. 2014 // By\n \n...,1
9996,"star wars costume, kids stormtrooper costume s...",star wars,39.99,Characters & Brands > Star Wars > Toys,... what I see my grandson us going to have fu...,0
9997,defiance lawkeeper metal badge prop replica,olde scotland yard ltd.,43.99,Novelty & Special Use > Novelty > Accessories ...,Five Stars // 5.0 // 18 Dec. 2015 // By\n \...,1
9998,justice league of america series 3 green lante...,dc comics,49.81,Hobbies > Collectible Figures & Memorabilia > ...,The best sculpt in a while // 5.0 // 13 May 20...,1


In [11]:
# Initializes two empty lists, category and sub_category, which will be used to store the main category and sub-category extracted from the amazon_category_and_sub_category column of each row in the DataFrame
category, sub_category = [], []

# Iterates over the amazon_category_and_sub_category column in the DataFrame
for i in range(len(amazon_ecommerce_sample_db['amazon_category_and_sub_category'])):

  # Attempts to split each entry in the amazon_category_and_sub_category column by the '>' character, which separates the main category from the sub-category.
  # Each split component is then cleaned (converted to lowercase and stripped of leading/trailing whitespace) and added to the corresponding list. If any error
  # occurs during this process (e.g., the entry does not contain a '>' character, or there's an index error due to missing data), the except block catches the
  # exception and appends 'other' to both lists as a fallback
  try:
    split_elem = amazon_ecommerce_sample_db['amazon_category_and_sub_category'][i].split('>')
    category.append(split_elem[0].lower().strip())
    sub_category.append(split_elem[1].lower().strip())
  except:
    category.append('other')
    sub_category.append('other')

# After processing all entries, these lines add the category and sub_category lists as new columns in the amazon_ecommerce_sample_db DataFrame, effectively storing the extracted category and sub-category information in the DataFrame
amazon_ecommerce_sample_db['category'] = category
amazon_ecommerce_sample_db['sub_category'] = sub_category

# Removes the original amazon_category_and_sub_category column from the DataFrame since its data has been split into the new category and sub_category columns. This cleans up the DataFrame by removing redundant information
amazon_ecommerce_sample_db.drop('amazon_category_and_sub_category', axis=1, inplace=True)

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,customer_reviews,sellers,category,sub_category
0,hornby 2014 catalogue,hornby,3.42,Worth Buying For The Pictures Alone (As Ever) ...,1,hobbies,model trains & railway sets
1,funkybuys® large christmas holiday express fes...,funkybuys,16.99,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,1,hobbies,model trains & railway sets
2,classic toy train set track carriages light en...,ccf,9.99,**Highly Recommended!** // 5.0 // 26 May 2015 ...,1,hobbies,model trains & railway sets
3,hornby coach r4410a br hawksworth corridor 3rd,hornby,39.99,I love it // 5.0 // 22 July 2013 // By\n \n...,0,hobbies,model trains & railway sets
4,hornby 00 gauge 0-4-0 gildenlow salt co. steam...,hornby,32.19,Birthday present // 5.0 // 14 April 2014 // By...,0,hobbies,model trains & railway sets
...,...,...,...,...,...,...,...
9995,batman 1966 tv series action figures - the rid...,mattel,22.95,Realistic // 5.0 // 31 Mar. 2014 // By\n \n...,1,games,dominoes & tile games
9996,"star wars costume, kids stormtrooper costume s...",star wars,39.99,... what I see my grandson us going to have fu...,0,games,educational games
9997,defiance lawkeeper metal badge prop replica,olde scotland yard ltd.,43.99,Five Stars // 5.0 // 18 Dec. 2015 // By\n \...,1,games,board games
9998,justice league of america series 3 green lante...,dc comics,49.81,The best sculpt in a while // 5.0 // 13 May 20...,1,characters & brands,vtech


In [12]:
# This code imports the warnings module and suppresses the display of warnings generated during script execution
import warnings
warnings.simplefilter(action='ignore')

# Initializes a new, empty DataFrame new_df with specified columns
columns = ['product_name', 'manufacturer', 'price', 'sellers', 'category', 'sub_category', 'new_review', 'new_rate', 'new_date']
new_df = pd.DataFrame(columns=columns)

# This loop iterates through each row of the original amazon_ecommerce_sample_db DataFrame to process and extract customer reviews
for index, row in amazon_ecommerce_sample_db.iterrows():

    # Each customer review string is split by '|' to separate individual reviews. Then, each review is further split by '//' to separate components of the review, such as the title, body, rating, and date
    customer_reviews = row['customer_reviews'].split('|')

    product_name = row['product_name']
    manufacturer = row['manufacturer']
    price = row['price']
    sellers = row['sellers']
    category = row['category']
    sub_category = row['sub_category']

    for i in range(len(customer_reviews)):

      # For each processed review, a new row is created with the relevant data fields and then added to new_df
      review = customer_reviews[i].strip().split('//')

      if len(review) >= 5:
        new_review_title = review[0].strip()
        new_review_body = review[-1].strip()
        new_review = f'<t>{new_review_title}</t><b>{new_review_body}</b>'
        new_rate = review[1].strip()
        new_date = review[2].strip()

        new_row = {
            'product_name': product_name,
            'manufacturer': manufacturer,
            'price': price,
            'sellers': sellers,
            'category': category,
            'sub_category': sub_category,
            'new_review': new_review,
            'new_rate': new_rate,
            'new_date': new_date,
        }

        new_row = pd.DataFrame([new_row])
        new_df = pd.concat([new_df, new_row], ignore_index=True)

# Updates the amazon_ecommerce_sample_db variable to reference the newly created and populated DataFrame new_df, which now contains the processed and structured review data along with the other product information
amazon_ecommerce_sample_db = new_df

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,sellers,category,sub_category,new_review,new_rate,new_date
0,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Worth Buying For The Pictures Alone (As Eve...,4.0,6 April 2014
1,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Amazing detail fabulous photography.</t><b>...,5.0,11 April 2015
2,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>'Great Purchase'</t><b>This was purchased o...,5.0,23 April 2014
3,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Great Catalogue</t><b>Everything I really n...,5.0,11 Jun. 2014
4,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>I collect them all as the glossy pictures a...,5.0,7 Dec. 2014
...,...,...,...,...,...,...,...,...,...
27208,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,"<t>Fantastic kit!</t><b>Fantastic kit, well fi...",5.0,13 Oct. 2015
27209,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>NOT AS DESCRIBED</t><b>Ok so if you don't r...,2.0,5 Mar. 2016
27210,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>Worth waiting for.</t><b>I bought this mode...,3.0,13 Jan. 2016
27211,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>Y-Wing Bandai - don't miss out</t><b>This k...,5.0,21 Mar. 2016


In [13]:
# This line attempts to convert the values in the 'new_rate' column of the DataFrame amazon_ecommerce_sample_db to a numeric data type
amazon_ecommerce_sample_db['new_rate'] = pd.to_numeric(amazon_ecommerce_sample_db['new_rate'], errors='coerce')

# This line removes any rows in the DataFrame amazon_ecommerce_sample_db where the 'new_rate' column contains NaN values
amazon_ecommerce_sample_db.dropna(subset=['new_rate'], inplace=True)

# This line converts the data type of the 'new_rate' column to float, ensuring that it is explicitly recognized as containing floating-point numbers
amazon_ecommerce_sample_db['new_rate'].astype(float)

amazon_ecommerce_sample_db.dtypes

product_name     object
manufacturer     object
price           float64
sellers          object
category         object
sub_category     object
new_review       object
new_rate        float64
new_date         object
dtype: object

In [14]:
# Converting 'new_date' to datetime format
amazon_ecommerce_sample_db['new_date'] = amazon_ecommerce_sample_db['new_date'].apply(lambda x: pd.to_datetime(x))

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,sellers,category,sub_category,new_review,new_rate,new_date
0,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Worth Buying For The Pictures Alone (As Eve...,4.0,2014-04-06
1,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Amazing detail fabulous photography.</t><b>...,5.0,2015-04-11
2,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>'Great Purchase'</t><b>This was purchased o...,5.0,2014-04-23
3,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Great Catalogue</t><b>Everything I really n...,5.0,2014-06-11
4,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>I collect them all as the glossy pictures a...,5.0,2014-12-07
...,...,...,...,...,...,...,...,...,...
27208,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,"<t>Fantastic kit!</t><b>Fantastic kit, well fi...",5.0,2015-10-13
27209,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>NOT AS DESCRIBED</t><b>Ok so if you don't r...,2.0,2016-03-05
27210,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>Worth waiting for.</t><b>I bought this mode...,3.0,2016-01-13
27211,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>Y-Wing Bandai - don't miss out</t><b>This k...,5.0,2016-03-21


In [15]:
amazon_ecommerce_sample_db.dtypes

product_name            object
manufacturer            object
price                  float64
sellers                 object
category                object
sub_category            object
new_review              object
new_rate               float64
new_date        datetime64[ns]
dtype: object

In [16]:
from sklearn.impute import SimpleImputer
import numpy as np

# Creating a SimpleImputer object to fill missing values in 'price' column with the mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Applying the imputer to the 'price' column
amazon_ecommerce_sample_db['price'] = imputer.fit_transform(amazon_ecommerce_sample_db['price'].values.reshape(-1, 1))

amazon_ecommerce_sample_db

Unnamed: 0,product_name,manufacturer,price,sellers,category,sub_category,new_review,new_rate,new_date
0,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Worth Buying For The Pictures Alone (As Eve...,4.0,2014-04-06
1,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Amazing detail fabulous photography.</t><b>...,5.0,2015-04-11
2,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>'Great Purchase'</t><b>This was purchased o...,5.0,2014-04-23
3,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>Great Catalogue</t><b>Everything I really n...,5.0,2014-06-11
4,hornby 2014 catalogue,hornby,3.42,1,hobbies,model trains & railway sets,<t>I collect them all as the glossy pictures a...,5.0,2014-12-07
...,...,...,...,...,...,...,...,...,...
27208,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,"<t>Fantastic kit!</t><b>Fantastic kit, well fi...",5.0,2015-10-13
27209,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>NOT AS DESCRIBED</t><b>Ok so if you don't r...,2.0,2016-03-05
27210,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>Worth waiting for.</t><b>I bought this mode...,3.0,2016-01-13
27211,star wars 1/72 y-wing starfighter,bandai,21.20,0,characters & brands,disney,<t>Y-Wing Bandai - don't miss out</t><b>This k...,5.0,2016-03-21


In [17]:
# Checking again for missing values after imputation
for column_name in amazon_ecommerce_sample_db.columns.tolist():
  print(amazon_ecommerce_sample_db[column_name].isnull().any())

False
False
False
False
False
False
False
False
False


In [18]:
# Renaming columns
amazon_ecommerce_sample_db.columns = ['product_name', 'product_company', 'product_price', 'is_verfied_purchase', 'product_category', 'product_sub_category', 'review_descripiton', 'review_rating', 'review_date']

# Reordering the columns
new_order = ['product_name', 'product_company', 'review_descripiton', 'review_rating', 'review_date', 'is_verfied_purchase', 'product_category', 'product_sub_category', 'product_price']
amazon_ecommerce_sample_db = amazon_ecommerce_sample_db[new_order]

# Adding unique IDs
amazon_ecommerce_sample_db.insert(0, 'row_id', range(30613, 30613 + len(amazon_ecommerce_sample_db)))
amazon_ecommerce_sample_db

Unnamed: 0,row_id,product_name,product_company,review_descripiton,review_rating,review_date,is_verfied_purchase,product_category,product_sub_category,product_price
0,30613,hornby 2014 catalogue,hornby,<t>Worth Buying For The Pictures Alone (As Eve...,4.0,2014-04-06,1,hobbies,model trains & railway sets,3.42
1,30614,hornby 2014 catalogue,hornby,<t>Amazing detail fabulous photography.</t><b>...,5.0,2015-04-11,1,hobbies,model trains & railway sets,3.42
2,30615,hornby 2014 catalogue,hornby,<t>'Great Purchase'</t><b>This was purchased o...,5.0,2014-04-23,1,hobbies,model trains & railway sets,3.42
3,30616,hornby 2014 catalogue,hornby,<t>Great Catalogue</t><b>Everything I really n...,5.0,2014-06-11,1,hobbies,model trains & railway sets,3.42
4,30617,hornby 2014 catalogue,hornby,<t>I collect them all as the glossy pictures a...,5.0,2014-12-07,1,hobbies,model trains & railway sets,3.42
...,...,...,...,...,...,...,...,...,...,...
27208,57820,star wars 1/72 y-wing starfighter,bandai,"<t>Fantastic kit!</t><b>Fantastic kit, well fi...",5.0,2015-10-13,0,characters & brands,disney,21.20
27209,57821,star wars 1/72 y-wing starfighter,bandai,<t>NOT AS DESCRIBED</t><b>Ok so if you don't r...,2.0,2016-03-05,0,characters & brands,disney,21.20
27210,57822,star wars 1/72 y-wing starfighter,bandai,<t>Worth waiting for.</t><b>I bought this mode...,3.0,2016-01-13,0,characters & brands,disney,21.20
27211,57823,star wars 1/72 y-wing starfighter,bandai,<t>Y-Wing Bandai - don't miss out</t><b>This k...,5.0,2016-03-21,0,characters & brands,disney,21.20


`amazon_ecommerce_sample_db` is ready to be loaded.

In [19]:
# Create product dimension
product_dimension = amazon_ecommerce_sample_db[['row_id', 'product_name', 'product_category', 'product_sub_category', 'product_company', 'product_price']]

# Renaming the row_id column
product_dimension.rename(columns={'row_id': 'product_id'}, inplace=True)
product_dimension

Unnamed: 0,product_id,product_name,product_category,product_sub_category,product_company,product_price
0,30613,hornby 2014 catalogue,hobbies,model trains & railway sets,hornby,3.42
1,30614,hornby 2014 catalogue,hobbies,model trains & railway sets,hornby,3.42
2,30615,hornby 2014 catalogue,hobbies,model trains & railway sets,hornby,3.42
3,30616,hornby 2014 catalogue,hobbies,model trains & railway sets,hornby,3.42
4,30617,hornby 2014 catalogue,hobbies,model trains & railway sets,hornby,3.42
...,...,...,...,...,...,...
27208,57820,star wars 1/72 y-wing starfighter,characters & brands,disney,bandai,21.20
27209,57821,star wars 1/72 y-wing starfighter,characters & brands,disney,bandai,21.20
27210,57822,star wars 1/72 y-wing starfighter,characters & brands,disney,bandai,21.20
27211,57823,star wars 1/72 y-wing starfighter,characters & brands,disney,bandai,21.20


In [20]:
# Create review dimension
review_dimension = amazon_ecommerce_sample_db[['row_id', 'review_rating', 'review_descripiton']]
review_dimension['review_thumbsup_count'] = pd.Series()
review_dimension['review_country'] = pd.Series(dtype='string')

# Renaming the row_id column
review_dimension.rename(columns={'row_id': 'review_id'}, inplace=True)
review_dimension

Unnamed: 0,review_id,review_rating,review_descripiton,review_thumbsup_count,review_country
0,30613,4.0,<t>Worth Buying For The Pictures Alone (As Eve...,,
1,30614,5.0,<t>Amazing detail fabulous photography.</t><b>...,,
2,30615,5.0,<t>'Great Purchase'</t><b>This was purchased o...,,
3,30616,5.0,<t>Great Catalogue</t><b>Everything I really n...,,
4,30617,5.0,<t>I collect them all as the glossy pictures a...,,
...,...,...,...,...,...
27208,57820,5.0,"<t>Fantastic kit!</t><b>Fantastic kit, well fi...",,
27209,57821,2.0,<t>NOT AS DESCRIBED</t><b>Ok so if you don't r...,,
27210,57822,3.0,<t>Worth waiting for.</t><b>I bought this mode...,,
27211,57823,5.0,<t>Y-Wing Bandai - don't miss out</t><b>This k...,,


In [21]:
# Format the dates
dates_list = [[date.day, date.month, date.year] for date in amazon_ecommerce_sample_db['review_date'].copy()]

# Create date dimension
date_dimension = pd.DataFrame(dates_list, columns=['day', 'month', 'year'])

# Adding unique IDs
date_dimension.insert(0, 'date_id', range(30613, 30613 + len(date_dimension)))
date_dimension

Unnamed: 0,date_id,day,month,year
0,30613,6,4,2014
1,30614,11,4,2015
2,30615,23,4,2014
3,30616,11,6,2014
4,30617,7,12,2014
...,...,...,...,...
27207,57820,13,10,2015
27208,57821,5,3,2016
27209,57822,13,1,2016
27210,57823,21,3,2016


In [22]:
# Export the cleaned DataFrames to a JSON files
from google.colab import files

amazon_ecommerce_sample_db = amazon_ecommerce_sample_db.to_json(orient='records')
product_dimension = product_dimension.to_json(orient='records')
review_dimension = review_dimension.to_json(orient='records')
date_dimension = date_dimension.to_json(orient='records')

with open('amazon_ecommerce_sample_db.json', 'w') as f:
    f.write(amazon_ecommerce_sample_db)

with open('ecommerce_product_dimension.json', 'w') as f:
    f.write(product_dimension)

with open('ecommerce_review_dimension.json', 'w') as f:
    f.write(review_dimension)

with open('ecommerce_date_dimension.json', 'w') as f:
    f.write(date_dimension)

# Downloading the JSON files
files.download('amazon_ecommerce_sample_db.json')
files.download('ecommerce_product_dimension.json')
files.download('ecommerce_review_dimension.json')
files.download('ecommerce_date_dimension.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>