In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from pandarallel import pandarallel

In [2]:
# Set up pandarallel with progress bars enabled; the parallel_apply calls
# further down rely on this initialisation.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [50]:
# Directory (relative to this notebook) containing the labelled review
# JSONL files that are loaded below.
data = "../../../data/labelled/reviews/train"

### Cell Phones and Accessories

In [51]:
# Load the Cell Phones and Accessories file; lines=True parses it as
# JSON Lines (one JSON record per line).
cell_access = pd.read_json(os.path.join(data, "./Cell_Phones_and_Accessories_5.jsonl"), lines=True)

In [5]:
# Preview the first two rows of the raw frame.
cell_access.head(2)

Unnamed: 0,review_info,pros,cons,verdict
0,- Looks even better in person. Be careful to n...,"[Looks great in person, Affordable, Fits perfe...","[Rhinestones fall off easily, Not very protect...",This case is a decorative item that fits secur...
1,- its super cute and makes my phone pretting a...,"[Cute and sparkly, Unique and well-made, Beaut...","[Broke easily, Jewels fall off easily, Bow cam...","Overall, this product is cute, sparkly, and we..."


In [6]:
# Text placed in front of the review list when building a model input.
input_prompt = "Below are the reviews of a product: \n"
# Question appended after the review list when building a model input.
task_prompt = "\n\nWhat are the pros, cons, and verdict for this product?"

In [7]:
# Show what a fully assembled input looks like for the first row.
print(input_prompt, cell_access.review_info[0], task_prompt, sep="")

Below are the reviews of a product: 
- Looks even better in person. Be careful to not drop your phone so often because the rhinestones will fall off (duh). More of a decorative case than it is protective, but I will say that it fits perfectly and securely on my phone. Overall, very pleased with this purchase.
- When you don't want to spend a whole lot of cash but want a great deal...this is the shop to buy from!
- so the case came on time, i love the design. I'm actually missing 2 studs but nothing too noticeable the studding is almost a bit sloppy around the bow, but once again not too noticeable. I haven't put in my phone yet so this is just what I've notice so far
- DON'T CARE FOR IT.  GAVE IT AS A GIFT AND THEY WERE OKAY WITH IT.  JUST NOT WHAT I EXPECTED.
- I liked it because it was cute, but the studs fall off easily and to protect a phone this would not be recommended. Buy if you just like it for looks.
- The product looked exactly like the picture and it was very nice. However 

In [8]:
# Layout of the target text: three labelled sections separated by blank
# lines. The {pros}/{cons}/{verdict} fields are filled with str.format_map.
out = (
    "pros:\n{pros}\n\n"
    "cons:\n{cons}\n\n"
    "verdict:\n{verdict}"
)

In [9]:
# Dry run of the label formatting on the first row, before it is applied
# to the whole frame.
pros = cell_access.pros[0]
cons = cell_access.cons[0]
verdict = cell_access.verdict[0]

# List fields become "- " bullet lines; empty fields are rendered as the
# literal text "None".
pros_ = ("- " + "\n- ".join(pros)) if pros else "None"
cons_ = ("- " + "\n- ".join(cons)) if cons else "None"
verdict_ = verdict.strip() if verdict else "None"

# Use the cleaned verdict_ (not the raw verdict) so that stripping and the
# "None" fallback actually take effect in the rendered text.
print(out.format_map({"pros": pros_, "cons": cons_, "verdict": verdict_}))

pros:
- Looks great in person
- Affordable
- Fits perfectly on phone
- Cute design
- Sturdy bow

cons:
- Rhinestones fall off easily
- Not very protective
- Stud placement can be sloppy and uneven
- Rhinestones can start falling off immediately after receiving
- Not suitable for carrying in jeans pockets

verdict:
This case is a decorative item that fits securely on your phone but offers minimal protection due to the loose rhinestones. While it appears visually appealing, it requires careful handling to prevent further rhinestone loss. Consider purchasing it if you prioritize aesthetics over functionality.


In [10]:
def format_input(row: pd.Series):
    """Build the model input text for one row.

    The row's ``review_info`` value is placed between the notebook-level
    ``input_prompt`` and ``task_prompt`` strings.

    Args:
        row: A row exposing a ``review_info`` attribute.

    Returns:
        The concatenated prompt string.
    """
    return "{}{}{}".format(input_prompt, row.review_info, task_prompt)

In [11]:
def format_labels(row: pd.Series, template=None):
    """Render one row's pros, cons and verdict as the target text.

    Args:
        row: A row exposing ``pros``, ``cons`` and ``verdict`` attributes.
            ``pros``/``cons`` are joined as "- " bullet lines; ``verdict``
            may be a string (stripped) or a list of strings (joined with
            spaces). Any field that is empty/falsy is rendered as "None".
        template: Optional format string with ``{pros}``, ``{cons}`` and
            ``{verdict}`` fields. When omitted, the notebook-level ``out``
            template is used.

    Returns:
        The formatted label string, or ``None`` when the row's fields do
        not have the expected shape (e.g. non-string list items or a
        verdict that is neither a string nor a list), so that callers can
        drop such rows afterwards.
    """
    if template is None:
        # Resolved outside the try block on purpose: a missing `out`
        # (template cell not run) should fail loudly, not yield None.
        template = out

    try:
        pros_ = ("- " + "\n- ".join(row.pros)) if row.pros else "None"
        cons_ = ("- " + "\n- ".join(row.cons)) if row.cons else "None"

        if not row.verdict:
            verdict_ = "None"
        elif isinstance(row.verdict, list):
            verdict_ = " ".join(row.verdict)
        else:
            verdict_ = row.verdict.strip()
    except (TypeError, AttributeError, ValueError):
        # Malformed row data only; other errors are allowed to propagate.
        return None

    return template.format_map({"pros": pros_, "cons": cons_, "verdict": verdict_})

In [12]:
# Check dtypes and non-null counts of the raw frame.
cell_access.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129601 entries, 0 to 129600
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_info  129601 non-null  object
 1   pros         129601 non-null  object
 2   cons         129601 non-null  object
 3   verdict      129091 non-null  object
dtypes: object(4)
memory usage: 4.0+ MB


In [13]:
# Discard rows that have no verdict, then renumber the index from 0.
cell_access = cell_access.dropna(subset=['verdict']).reset_index(drop=True)

In [14]:
# Build the model input text for every row (row-wise, via pandarallel).
cell_access["input_reviews"] = cell_access.parallel_apply(format_input, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16137), Label(value='0 / 16137')))…

In [15]:
# Build the target text for every row; rows that format_labels cannot
# handle come back as None and are dropped in a later cell.
cell_access["label_reviews"] = cell_access.parallel_apply(format_labels, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16137), Label(value='0 / 16137')))…

In [16]:
# Re-check non-null counts: label_reviews is null wherever formatting failed.
cell_access.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129091 entries, 0 to 129090
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   review_info    129091 non-null  object
 1   pros           129091 non-null  object
 2   cons           129091 non-null  object
 3   verdict        129091 non-null  object
 4   input_reviews  129091 non-null  object
 5   label_reviews  129087 non-null  object
dtypes: object(6)
memory usage: 5.9+ MB


In [17]:
# Remove rows whose label text could not be built, then renumber from 0.
cell_access = cell_access.dropna(subset=['label_reviews']).reset_index(drop=True)

In [18]:
# Row/column count after dropping unformattable rows.
cell_access.shape

(129087, 6)

In [19]:
# Preview the frame with the new input_reviews / label_reviews columns.
cell_access.head(2)

Unnamed: 0,review_info,pros,cons,verdict,input_reviews,label_reviews
0,- Looks even better in person. Be careful to n...,"[Looks great in person, Affordable, Fits perfe...","[Rhinestones fall off easily, Not very protect...",This case is a decorative item that fits secur...,Below are the reviews of a product: \n- Looks ...,pros:\n- Looks great in person\n- Affordable\n...
1,- its super cute and makes my phone pretting a...,"[Cute and sparkly, Unique and well-made, Beaut...","[Broke easily, Jewels fall off easily, Bow cam...","Overall, this product is cute, sparkly, and we...",Below are the reviews of a product: \n- its su...,pros:\n- Cute and sparkly\n- Unique and well-m...


In [20]:
# Confirm that no nulls remain in any column.
cell_access.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129087 entries, 0 to 129086
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   review_info    129087 non-null  object
 1   pros           129087 non-null  object
 2   cons           129087 non-null  object
 3   verdict        129087 non-null  object
 4   input_reviews  129087 non-null  object
 5   label_reviews  129087 non-null  object
dtypes: object(6)
memory usage: 5.9+ MB


In [21]:
# Tag every row with its source category (used later for stratified splitting).
cell_access['category'] = "cell_phones_accessories"

In [22]:
# Preview the frame including the new category column.
cell_access.head(2)

Unnamed: 0,review_info,pros,cons,verdict,input_reviews,label_reviews,category
0,- Looks even better in person. Be careful to n...,"[Looks great in person, Affordable, Fits perfe...","[Rhinestones fall off easily, Not very protect...",This case is a decorative item that fits secur...,Below are the reviews of a product: \n- Looks ...,pros:\n- Looks great in person\n- Affordable\n...,cell_phones_accessories
1,- its super cute and makes my phone pretting a...,"[Cute and sparkly, Unique and well-made, Beaut...","[Broke easily, Jewels fall off easily, Bow cam...","Overall, this product is cute, sparkly, and we...",Below are the reviews of a product: \n- its su...,pros:\n- Cute and sparkly\n- Unique and well-m...,cell_phones_accessories


In [23]:
# Whitespace-separated word count of each input text, then the mean.
lengths = [len(text.split()) for text in cell_access.input_reviews.to_list()]
print(f"avg length of input: {sum(lengths)/len(lengths)}")

avg length of input: 452.4703572009575


In [24]:
# Whitespace-separated word count of each label text, then the mean.
lengths = [len(text.split()) for text in cell_access.label_reviews.to_list()]
print(f"avg length of output: {sum(lengths)/len(lengths)}")

avg length of output: 99.27260684654536


### electronics

In [25]:
# Load the Electronics file; lines=True parses it as JSON Lines
# (one JSON record per line).
elecs = pd.read_json(os.path.join(data, "./Electronics_5.jsonl"), lines=True)

In [26]:
# Row/column count of the raw Electronics frame.
elecs.shape

(78416, 4)

In [27]:
# Check dtypes and non-null counts of the raw frame.
elecs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78416 entries, 0 to 78415
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_info  78416 non-null  object
 1   pros         78416 non-null  object
 2   cons         78416 non-null  object
 3   verdict      78241 non-null  object
dtypes: object(4)
memory usage: 2.4+ MB


In [28]:
# Discard rows that have no verdict, then renumber the index from 0.
elecs = elecs.dropna(subset=['verdict']).reset_index(drop=True)

In [29]:
# Build the model input text for every row (row-wise, via pandarallel).
elecs["input_reviews"] = elecs.parallel_apply(format_input, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9781), Label(value='0 / 9781'))), …

In [30]:
# Build the target text for every row; rows that format_labels cannot
# handle come back as None and are dropped in a later cell.
elecs["label_reviews"] = elecs.parallel_apply(format_labels, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9781), Label(value='0 / 9781'))), …

In [31]:
# Re-check non-null counts: label_reviews is null wherever formatting failed.
elecs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78241 entries, 0 to 78240
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   review_info    78241 non-null  object
 1   pros           78241 non-null  object
 2   cons           78241 non-null  object
 3   verdict        78241 non-null  object
 4   input_reviews  78241 non-null  object
 5   label_reviews  78236 non-null  object
dtypes: object(6)
memory usage: 3.6+ MB


In [32]:
# Remove rows whose label text could not be built, then renumber from 0.
elecs = elecs.dropna(subset=['label_reviews']).reset_index(drop=True)

In [33]:
# Tag every row with its source category (used later for stratified splitting).
elecs['category'] = "electronics"

In [34]:
# Preview the formatted Electronics frame.
elecs.head(2)

Unnamed: 0,review_info,pros,cons,verdict,input_reviews,label_reviews,category
0,- I figured out how to use it. It's okay for ...,[Affordable price compared to other brands wit...,"[Does not record in stereo, Display is hard to...",This is a budget-friendly MP3 player with some...,Below are the reviews of a product: \n- I figu...,pros:\n- Affordable price compared to other br...,electronics
1,"- still playing this item everyday, no problem...","[Plays MP1, MP2, MP3, WMA, WMV, ASF and WAV au...","[FM radio reception was weak and unreliable, T...","This MP3 player receives mixed reviews, with s...",Below are the reviews of a product: \n- still ...,"pros:\n- Plays MP1, MP2, MP3, WMA, WMV, ASF an...",electronics


In [35]:
# Whitespace-separated word count of each input text, then the mean.
lengths = [len(text.split()) for text in elecs.input_reviews.to_list()]
print(f"avg length of input: {sum(lengths)/len(lengths)}")

avg length of input: 687.1229229510711


In [36]:
# Whitespace-separated word count of each label text, then the mean.
lengths = [len(text.split()) for text in elecs.label_reviews.to_list()]
print(f"avg length of output: {sum(lengths)/len(lengths)}")

avg length of output: 110.98243775244133


### concatenate

In [37]:
# Stack the two category frames row-wise. ignore_index is not set, so each
# frame keeps its own 0-based labels and index values repeat in df; the
# index is reset after splitting.
df = pd.concat([cell_access, elecs], axis=0)

In [38]:
# Combined row/column count.
df.shape

(207323, 7)

In [39]:
# Hold out 30000 rows, stratified on category, with a fixed seed for
# reproducibility. `dev` temporarily holds the entire hold-out set; it is
# divided into dev and test in the next cell.
train, dev = train_test_split(df, test_size=30000, stratify=df['category'], random_state=42, shuffle=True)

In [40]:
# Divide the hold-out set: 20000 rows go to test, the remainder stays in
# dev. Stratified on category with the same fixed seed.
dev, test = train_test_split(dev, test_size=20000, stratify=dev['category'], random_state=42, shuffle=True)

In [41]:
# Sizes of the three splits.
train.shape, dev.shape, test.shape

((177323, 7), (10000, 7), (20000, 7))

In [42]:
# Preview the shuffled training split (index labels are still the pre-split ones).
train.head(2)

Unnamed: 0,review_info,pros,cons,verdict,input_reviews,label_reviews,category
14221,- The gray rubber on this case fits too tightl...,"[Low cost, Durable, Good protection, Classy ap...","[Rubber fits too tightly, potentially damaging...","Overall, this case provides good protection at...",Below are the reviews of a product: \n- The gr...,pros:\n- Low cost\n- Durable\n- Good protectio...,cell_phones_accessories
18974,"- Works great. Great value\n- Good quality, e...","[Great value, Excellent quality and coating, N...","[It's just a filter, Some optical help, Slight...","Overall, the Sigma DG 52mm MultiCoated UV filt...",Below are the reviews of a product: \n- Works ...,pros:\n- Great value\n- Excellent quality and ...,electronics


In [43]:
# Keep only the input/label text columns and renumber each split from 0.
train, dev, test = [
    part[['input_reviews', 'label_reviews']].reset_index(drop=True)
    for part in (train, dev, test)
]

In [44]:
# Sizes of the three splits after column selection.
train.shape, dev.shape, test.shape

((177323, 2), (10000, 2), (20000, 2))

In [45]:
# Preview the final two-column training split.
train.head(2)

Unnamed: 0,input_reviews,label_reviews
0,Below are the reviews of a product: \n- The gr...,pros:\n- Low cost\n- Durable\n- Good protectio...
1,Below are the reviews of a product: \n- Works ...,pros:\n- Great value\n- Excellent quality and ...


In [46]:
# Row of the training split to inspect in the next two cells.
idx = 2

In [47]:
# Print the full input text of the selected training example.
print(train.input_reviews[idx])

Below are the reviews of a product: 
- This product not only looks amazing but offers superior phone protection. I also love the kickstand and the fact that it can be mounted to my mountain bike and car windshield with separately purchased accessory. This case is well worth the money and a good investment. Trident cases have saved my phones for many years time and time again. :)
- Case was as described, nice bright color. Liked the case overall but it was slick and also just didn't seem to want to say in the clip. If you want just a protective case this may be fine, but for work needed to stay put in a belt clip not so much.
- This is a great phone case. Would be better if it had a built in screen cover instead of the old cling ons. Also needs filters over some other areas.
- This is a very solid case. I ordered it because I already managed to shatter a screen, and screen replacements are pricey. This case would likely protect against the pavement side impact that did me in. It is modu

In [48]:
# Print the full label text of the selected training example.
print(train.label_reviews[idx])

pros:
- Superior phone protection
- Kickstand included
- Mountable with accessories
- Variety of uses
- Excellent quality

cons:
- Slick surface
- Belt clip doesn't hold well
- Lacks built-in screen cover
- Bulky design
- Heavy with built-in kickstand

verdict:
This case offers great protection and versatility with its modular design, but its bulkiness and lack of a built-in screen cover may be drawbacks for some users.


In [52]:
# Directory (relative to this notebook) where the split CSVs are written.
splits = "../../../data/labelled/reviews/splits"

In [53]:
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it already does).
os.makedirs(splits, exist_ok=True)

# Persist each split without the index column.
train.to_csv(os.path.join(splits, "train.csv"), index=False)
dev.to_csv(os.path.join(splits, "dev.csv"), index=False)
test.to_csv(os.path.join(splits, "test.csv"), index=False)