In [None]:
# Packages used in the script
import ast
import gzip
import re
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from contractions import contractions_dict
from pandas.io.json import json_normalize

In [None]:
# source: https://jmcauley.ucsd.edu/data/amazon/qa/
# Reading the data
def parse(path):
    """Yield one parsed record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python literal (the original
    implementation evaluated every line with ``eval``).

    Parameters
    ----------
    path : str or path-like
        Location of the ``.gz`` file to read.

    Yields
    ------
    object
        The value described by each line (a dict for the Q&A dumps).

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    # Text mode: ast.literal_eval needs str, not the bytes that 'rb' produces.
    # The `with` block closes the file once the generator is exhausted or
    # closed; the previous version never closed the handle.
    with gzip.open(path, 'rt', encoding='utf-8') as lines:
        for line in lines:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines instead of failing on them
            # SECURITY: this used to be eval(), which executes arbitrary code
            # found in the (downloaded) file. literal_eval only accepts
            # literals: strings, numbers, tuples, lists, dicts, sets, booleans
            # and None.
            yield ast.literal_eval(line)

# Convert to Pandas DataFrame        
def getDF(path):
    """Read every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and each record
    becomes one row (``orient='index'``).
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the Electronics Q&A file (path is relative to the working directory)
electronics_qa_path = 'qa_Electronics.json.gz'
df_electronics = getDF(electronics_qa_path)

In [None]:
# Display settings: show every column, keep wide frames on a single line, and
# never truncate the text inside a cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None means "no limit". The old -1 sentinel has been deprecated since
# pandas 1.0, and the full option key avoids relying on pattern matching.
pd.set_option('display.max_colwidth', None)

In [None]:
# Keep only the rows whose questionType is 'open-ended'
is_open_ended = df_electronics['questionType'].eq('open-ended')
df_electronics_open = df_electronics.loc[is_open_ended]

In [None]:
# Drop the columns that are not relevant for the remaining steps
irrelevant_columns = ['unixTime', 'answerType', 'answerTime']
df_electronics_amazon = df_electronics_open.drop(columns=irrelevant_columns)

In [None]:
# Expand contractions in the answers using the mapping from the contractions library.
# 'em' is removed as it causes ambiguities. The default argument makes the pop
# a no-op when the key is already gone, so re-running this cell no longer
# raises KeyError.
contractions_dict.pop('em', None)


def expand_contractions(text):
    """Return ``str(text)`` with every contraction replaced by its expansion.

    Replacement is plain, case-sensitive substring substitution, applied in
    the iteration order of ``contractions_dict``.
    """
    expanded = str(text)
    for contraction, expansion in contractions_dict.items():
        expanded = expanded.replace(contraction, expansion)
    return expanded


# One pass over the column (all replacements per answer) instead of rewriting
# the whole column once per contraction; the per-answer result is the same.
# NOTE(review): matching is case-sensitive and this runs before the answers are
# lower-cased, so capitalised contractions are only expanded if the mapping
# contains capitalised keys. Confirm this is intended.
df_electronics_amazon["answer"] = df_electronics_amazon["answer"].apply(expand_contractions)

In [None]:
# Normalise the answer text to lower case
answers_lowercase = df_electronics_amazon['answer'].str.lower()
df_electronics_amazon['answer'] = answers_lowercase

In [None]:
# Split each answer into sentences with spaCy's "en_core_web_sm" pipeline
nlp = spacy.load("en_core_web_sm")


def split_into_sentences(text):
    """Return the whitespace-stripped sentences spaCy finds in ``str(text)``."""
    doc = nlp(str(text))
    return [str(sentence).strip() for sentence in doc.sents]


df_electronics_amazon['sentences'] = df_electronics_amazon['answer'].apply(split_into_sentences)

In [None]:
# Collect every sentence together with the asin of the row it came from
# Flatten to one row per sentence, each paired with the `asin` value of its
# source row. Iterating over the two columns directly avoids the per-row
# Series that iterrows() builds; rows with an empty sentence list contribute
# nothing, exactly as before.
sentence_rows = [
    (sentence, asin)
    for asin, sentences in zip(df_electronics_amazon['asin'],
                               df_electronics_amazon['sentences'])
    for sentence in sentences
]

# The asin column is published under the name "product_type"
df_electro = pd.DataFrame(sentence_rows, columns=['sentences', 'product_type'])

In [None]:
# Export DataFrame to CSV file.
# The destination is resolved under the current user's home directory instead
# of an absolute path tied to one machine, so the notebook also runs for
# other users.
output_csv = Path.home() / 'downloads' / 'df_electronics.csv'
# to_csv does not create missing directories, so make sure the target exists
output_csv.parent.mkdir(parents=True, exist_ok=True)
df_electro.to_csv(output_csv, index=False)

Dataset references:

Wan, M., & McAuley, J. (2016). Modeling ambiguity, subjectivity, and diverging viewpoints in opinion question answering systems. In 2016 IEEE 16th international conference on data mining (ICDM) (pp. 489-498). IEEE.

McAuley, J., & Yang, A. (2016). Addressing complex and subjective product-related queries with customer reviews. In Proceedings of the 25th International Conference on World Wide Web (pp. 625-635).