In [1]:
import pandas as pd

## Extract questions and answers

In [2]:
# Load the raw product sample; the path is relative to the working directory
df = pd.read_csv(filepath_or_buffer='./amazon_co-ecommerce_sample.csv')

# Report the frame's dimensions, then preview its first five rows
print(f'Shape of original dataset: {df.shape}')
df.head(n=5)

Shape of original dataset: (10000, 17)


Unnamed: 0,uniq_id,product_name,manufacturer,price,number_available_in_stock,number_of_reviews,number_of_answered_questions,average_review_rating,amazon_category_and_sub_category,customers_who_bought_this_item_also_bought,description,product_information,product_description,items_customers_buy_after_viewing_this_item,customer_questions_and_answers,customer_reviews,sellers
0,eac7efa5dbd3d667f26eb3d3ab504464,Hornby 2014 Catalogue,Hornby,£3.42,5 new,15,1.0,4.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-R8150-Catalogue...,Product Description Hornby 2014 Catalogue Box ...,Technical Details Item Weight640 g Product Dim...,Product Description Hornby 2014 Catalogue Box ...,http://www.amazon.co.uk/Hornby-R8150-Catalogue...,Does this catalogue detail all the previous Ho...,Worth Buying For The Pictures Alone (As Ever) ...,"{""seller""=>[{""Seller_name_1""=>""Amazon.co.uk"", ..."
1,b17540ef7e86e461d37f3ae58b7b72ac,FunkyBuys® Large Christmas Holiday Express Fes...,FunkyBuys,£16.99,,2,1.0,4.5 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Christmas-Holiday-Expr...,Size Name:Large FunkyBuys® Large Christmas Hol...,Technical Details Manufacturer recommended age...,Size Name:Large FunkyBuys® Large Christmas Hol...,http://www.amazon.co.uk/Christmas-Holiday-Expr...,can you turn off sounds // hi no you cant turn...,Four Stars // 4.0 // 18 Dec. 2015 // By\n \...,"{""seller""=>{""Seller_name_1""=>""UHD WHOLESALE"", ..."
2,348f344247b0c1a935b1223072ef9d8a,CLASSIC TOY TRAIN SET TRACK CARRIAGES LIGHT EN...,ccf,£9.99,2 new,17,2.0,3.9 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Classic-Train-Lights-B...,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,Technical Details Manufacturer recommended age...,BIG CLASSIC TOY TRAIN SET TRACK CARRIAGE LIGHT...,http://www.amazon.co.uk/Train-With-Tracks-Batt...,What is the gauge of the track // Hi Paul.Trut...,**Highly Recommended!** // 5.0 // 26 May 2015 ...,"{""seller""=>[{""Seller_name_1""=>""DEAL-BOX"", ""Sel..."
3,e12b92dbb8eaee78b22965d2a9bbbd9f,HORNBY Coach R4410A BR Hawksworth Corridor 3rd,Hornby,£39.99,,1,2.0,5.0 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,,Hornby 00 Gauge BR Hawksworth 3rd Class W 2107...,Technical Details Item Weight259 g Product Dim...,Hornby 00 Gauge BR Hawksworth 3rd Class W 2107...,,,I love it // 5.0 // 22 July 2013 // By\n \n...,
4,e33a9adeed5f36840ccc227db4682a36,Hornby 00 Gauge 0-4-0 Gildenlow Salt Co. Steam...,Hornby,£32.19,,3,2.0,4.7 out of 5 stars,Hobbies > Model Trains & Railway Sets > Rail V...,http://www.amazon.co.uk/Hornby-R6367-RailRoad-...,Product Description Hornby RailRoad 0-4-0 Gild...,Technical Details Item Weight159 g Product Dim...,Product Description Hornby RailRoad 0-4-0 Gild...,http://www.amazon.co.uk/Hornby-R2672-RailRoad-...,,Birthday present // 5.0 // 14 April 2014 // By...,


In [3]:
# Keep only the Q&A column, discarding the rows where it is NaN
df_qa = df[['customer_questions_and_answers']].dropna()

# Report the frame's dimensions, then preview its first five rows
print(f'Shape of extracted dataset: {df_qa.shape}')
df_qa.head(n=5)

Shape of extracted dataset: (914, 1)


Unnamed: 0,customer_questions_and_answers
0,Does this catalogue detail all the previous Ho...
1,can you turn off sounds // hi no you cant turn...
2,What is the gauge of the track // Hi Paul.Trut...
5,is it possible to replace thr grain of wheat l...
7,Can this train go backwards as well as forward...


In [4]:
# Show the raw Q&A string of the row labelled 0 as a one-element list
df_qa.loc[0].tolist()

['Does this catalogue detail all the previous Hornby products please? // HiThe 2014 catalogue does indeed detail previous models but also includes new releases for 2014.You would be advised to purchase models as you need them to avoid them being discontinued in subsequent years…\n    \n      see more\n    \n  \n  \n    HiThe 2014 catalogue does indeed detail previous models but also includes new releases for 2014.You would be advised to purchase models as you need them to avoid them being discontinued in subsequent yearsHope this helps\n    \n      see less']

## Generate data file

In [5]:
def clean_text(text):
    """Normalise an answer string and drop the collapsed-preview wrapper.

    Newlines and the Unicode ellipsis (U+2026) are each replaced by a single
    space. The input may contain a truncated preview followed by 'see more',
    then the complete text followed by 'see less'; in that case only the
    complete text between the two markers is returned.

    Args:
        text (str): Raw answer text.

    Returns:
        str: The stripped text between the first 'see more' and the last
        'see less' that follows it; the stripped text after 'see more' when
        no 'see less' follows it; otherwise the input with only the
        character replacements applied (not stripped).
    """
    text = text.replace('\n', ' ')
    # Replace Unicode ellipsis with a space
    text = text.replace('\u2026', ' ')

    start_keyword = 'see more'
    end_keyword = 'see less'

    start_index = text.find(start_keyword)
    if start_index == -1:
        # No 'see more' marker: keep the text as is
        return text

    body_start = start_index + len(start_keyword)
    # 'see less' is the trailing marker, so look for its last occurrence and
    # accept it only when it lies after 'see more'. Searching from the start
    # of the string would let an earlier 'see less' produce an empty slice.
    end_index = text.rfind(end_keyword)
    if end_index < body_start:
        # Only 'see more' is usable: take everything after it
        return text[body_start:].strip()

    return text[body_start:end_index].strip()

def preprocess_qa(data):
    """Flatten the Q&A column into one record per question/answer pair.

    Each cell of the 'customer_questions_and_answers' column is split on '|'
    into pairs, and each pair is split on the first '//' into a question and
    an answer. The answer is passed through clean_text.

    Args:
        data (pd.DataFrame): Frame with a 'customer_questions_and_answers'
            column of strings (no missing values; a non-string cell raises
            AttributeError on .split).

    Returns:
        list[dict]: One dict per pair with the keys 'question', 'answer' and
        'answer_length' (character count of the cleaned answer). Pairs
        without a '//' separator are skipped.
    """
    preprocessed_data = []

    # Iterate the column directly; no other field of the row is used
    for raw_qa in data['customer_questions_and_answers']:
        # Splitting by '|' to get individual question-answer pairs
        for qa_pair in raw_qa.split('|'):
            # Split on the FIRST '//' only, so an answer that itself contains
            # '//' (for example a URL such as http://...) is kept whole
            parts = qa_pair.split('//', 1)
            # Skip if there's no answer or malformed pair
            if len(parts) < 2:
                continue

            question = parts[0].strip()
            answer = clean_text(parts[1].strip())

            preprocessed_data.append({
                'question': question,
                'answer': answer,
                'answer_length': len(answer)
            })

    return preprocessed_data

In [6]:
# Flatten the Q&A strings into one record per question/answer pair
preprocessed_data = preprocess_qa(df_qa)

# Write the records as JSON Lines: one JSON object per line
preprocessed_df = pd.DataFrame(data=preprocessed_data)
preprocessed_df.to_json(path_or_buf='./qa_dataset.json', orient='records', lines=True)