In [1]:
import numpy as np
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from top2vec import Top2Vec

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
2023-05-23 15:15:39.526373: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 1 Read-in dataframe

In [2]:
# load the review data, restricted to the four columns used below
columns_to_load = ["reviewText", "summary", "overall", "asin"]
df = pd.read_csv("apple.csv", usecols=columns_to_load)

In [3]:
# number of missing values per column
print(df.isna().sum())
# df.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line
df.info()

# observed range of the rating column
print(df["overall"].min())
print(df["overall"].max())

overall         0
asin            0
reviewText    128
summary        72
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66089 entries, 0 to 66088
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   overall     66089 non-null  float64
 1   asin        66089 non-null  object 
 2   reviewText  65961 non-null  object 
 3   summary     66017 non-null  object 
dtypes: float64(1), object(3)
memory usage: 2.0+ MB
None
1.0
5.0


# 2 Replace na values and star ratings in summary

In [4]:
# inspect the rows whose summary is missing
missing_summary = df[df["summary"].isna()]
print(missing_summary.shape)
missing_summary.head()

(72, 4)


Unnamed: 0,overall,asin,reviewText,summary
3867,1.0,B003TGT48Q,received a defective Touch Screen. We had to b...,
5044,5.0,B004S227UW,I ordered black but,
5592,5.0,B004S227UW,Loved this phone too bad I now upgraded to a ip6,
9699,5.0,B005SSBE7W,,
13277,5.0,B0074R0Z3O,great phone love it. looks brand new,


In [5]:
# summaries that consist of nothing but a star-rating label,
# either spelled out or written with a digit
stars = [
    "One Star", "Two Stars", "Three Stars", "Four Stars", "Five Stars",
    "1 Star", "2 Stars", "3 Stars", "4 Stars", "5 Stars",
]
# show every row whose summary is exactly one of those labels
is_star_summary = df["summary"].isin(stars)
df[is_star_summary]

Unnamed: 0,overall,asin,reviewText,summary
31,5.0,B001AXA056,this was a past order but still works fine :D,Five Stars
32,4.0,B001AXA056,"very nice, product, fast delivery, i recommend...",Four Stars
33,5.0,B001AXA056,"My son loves it, I cant even touch it. LOL",Five Stars
34,1.0,B001AXA056,Not good at all!,One Star
35,4.0,B001AXA056,good,Four Stars
...,...,...,...,...
66071,5.0,B01H968VFI,phone is very good and has more to offer than ...,Five Stars
66073,5.0,B01H968VI0,The iphone works just fine.,Five Stars
66075,5.0,B01HB5RYF0,Love it.,Five Stars
66078,5.0,B01HB5RJ5A,"works great and delivered on time, just as adv...",Five Stars


In [6]:
# replace missing summaries with the empty string
summary_missing = df["summary"].isna()
print("Fill {0} na values with empty string".format(df[summary_missing].shape[0]))
df.loc[summary_missing, "summary"] = ""

Fill 72 na values with empty string


In [7]:
# blank out summaries that contain only a star-rating label (see `stars`)
# the mask is computed once and reused for the report and the assignment
star_only = df["summary"].isin(stars)
# these rows are not missing values, so the message names what is actually replaced
print("Replace {0} star-rating-only summaries with empty string".format(star_only.sum()))
df.loc[star_only, "summary"] = ""
# post-condition: no summary equal to a star-rating label may remain
assert not df["summary"].isin(stars).any(), "star-rating-only summaries remain"
print("\nDouble-check:")
indexes = [31, 32, 33, 34, 35, 132, 133, 134, 135, 136]
df.loc[indexes, :]

Fill 24641 na values with empty string

Doblecheck:


Unnamed: 0,overall,asin,reviewText,summary
31,5.0,B001AXA056,this was a past order but still works fine :D,
32,4.0,B001AXA056,"very nice, product, fast delivery, i recommend...",
33,5.0,B001AXA056,"My son loves it, I cant even touch it. LOL",
34,1.0,B001AXA056,Not good at all!,
35,4.0,B001AXA056,good,
132,4.0,B001CJTE0K,Great quality pouch fits my samsung galaxy S4 ...,
133,4.0,B001CJTE0K,I love it....Thank You.....!!!!!,
134,2.0,B001CJTE0K,ok,
135,3.0,B001CJTE0K,5 stars,
136,3.0,B001CJTE0K,b,


# 3 Replace na values in reviewText

In [8]:
# look at the na values in reviewText
missing_review = df[df["reviewText"].isna()]
# the banner names the column that is actually inspected (reviewText, not summary)
print("\nLOOK AT THE NA VALUES IN reviewText")
print(missing_review.shape)
missing_review.head()


LOOK AT THE NA VALUES IN summary
(128, 4)


Unnamed: 0,overall,asin,reviewText,summary
3541,5.0,B003PJBWOQ,,
3772,5.0,B003PJBWOQ,,
4534,5.0,B004S227UW,,
4560,3.0,B004S227UW,,
4695,5.0,B004S227UW,,


In [9]:
# fill na values in reviewText with empty string
# the mask is taken before the fill so the same rows can be shown afterwards
review_missing = df["reviewText"].isna()
print("Fill {0} na values with empty string".format(review_missing.sum()))
df.loc[review_missing, "reviewText"] = ""
print("\nDouble-check:")
# labels of the first rows that were just filled, derived from the mask
# instead of hard-coded so the check still works if the input data changes
indexes = df.loc[review_missing].index[:5]
df.loc[indexes, :]

Fill 128 na values with empty string

Doblecheck:


Unnamed: 0,overall,asin,reviewText,summary
3541,5.0,B003PJBWOQ,,
3772,5.0,B003PJBWOQ,,
4534,5.0,B004S227UW,,
4560,3.0,B004S227UW,,
4695,5.0,B004S227UW,,


In [10]:
# rows that have no review text but still carry a summary
summary_only = (df["reviewText"] == "") & (df["summary"] != "")
print("There are {0} rows with empty reviewText and non-empty summary".format(df.loc[summary_only].shape[0]))
df.loc[summary_only]

There are 11 rows with empty reviewText and non-empty summary


Unnamed: 0,overall,asin,reviewText,summary
7682,1.0,B005SSBE7W,,I need help I got the phone it works but my si...
17775,5.0,B00A83I8G2,,Actual apple product. A+
29486,5.0,B00K0Q2YV2,,It did well!
40057,5.0,B00YD53FHS,,Top
42601,5.0,B00YD54J8W,,Nice and fast shipment
42887,1.0,B00YD54J8W,,Damaged
45839,5.0,B00YD547Q6,,Seller was awesome to my questions
50051,5.0,B012BN15LQ,,Works fine
54463,4.0,B015E8VWEU,,Awaiting feedback from recipient
59304,5.0,B01EVPI68W,,Absolutely good


# 4 Combine reviewText and summary into corpus and clean

In [11]:
# build the corpus text: summary, a single space, then reviewText
corpus = df["summary"] + " " + df["reviewText"]
# append it as a new column and discard the two source columns
df = df.assign(corpus=corpus).drop(columns=["reviewText", "summary"])
df.head()

Unnamed: 0,overall,asin,corpus
0,1.0,B0013LKXEI,Dont buy these The only good thing about these...
1,2.0,B0013LKXEI,I suppose... These headphones came pretty quic...
2,5.0,B0013LKXEI,Awesome! I was replacing my earbuds and was re...
3,4.0,B0013LKXEI,iPod headphones They are pretty good. I would ...
4,1.0,B0014HKJKY,Fits Like a Glove... on OJ Simpson's Hand Thes...


In [12]:
# discard rows whose corpus is only the joining space,
# i.e. both summary and reviewText were empty
corpus_is_blank = df["corpus"] == " "
print("There are {0} rows with empty corpus".format(df[corpus_is_blank].shape[0]))
df = df[~corpus_is_blank]

There are 117 rows with empty corpus


In [13]:
# keep only rows whose corpus contains at least one ASCII letter
has_letters = df["corpus"].str.contains("[a-zA-Z]")
print("There are {0} rows with only non-english characters".format(df[~has_letters].shape[0]))
df = df[has_letters]

There are 30 rows with only non-english characters


In [14]:
# drop rows that contain fewer than `min_size` whitespace-separated words

min_size = 5

# count the words once and reuse the result for the report, the preview and the filter
word_counts = df["corpus"].str.split().str.len()
too_short = word_counts < min_size

print("There are {0} rows with less than {1} words".format(too_short.sum(), min_size))
print("\nRemoving rows such as:")
print(df[too_short].head(5))
df = df[word_counts >= min_size]
print(f"\nNumber of rows after filtering: {df.shape[0]}")

There are 12640 rows with less than 5 words

Removing rows such as:
     overall        asin                             corpus
34       1.0  B001AXA056                   Not good at all!
35       4.0  B001AXA056                               good
133      4.0  B001CJTE0K   I love it....Thank You.....!!!!!
134      2.0  B001CJTE0K                                 ok
135      3.0  B001CJTE0K                            5 stars

Number of rows after filtering: 53302


In [15]:
# replace \n with space
# regex=False makes both calls a plain literal match, so the result does not
# depend on the pandas-version-specific default of the `regex` argument
has_newline = df["corpus"].str.contains("\n", regex=False)
print("There are {0} rows where corpus contains '\\n'".format(has_newline.sum()))
df["corpus"] = df["corpus"].str.replace("\n", " ", regex=False)

There are 4835 rows where corpus contains '\n'


In [16]:
# collect every corpus entry that langdetect does not classify as English
# langdetect is non-deterministic by default; fixing the seed before the
# first detection makes repeated runs produce the same result
DetectorFactory.seed = 0

add = []
for row in df["corpus"]:
    try:
        if detect(row) != "en":
            add.append(row)
    except Exception:
        # `except Exception` rather than a bare `except`, so that interrupting
        # the kernel during this long loop is not silently swallowed
        print("This row throws an error:", row)
# the number of flagged rows is simply the length of the collected list
counter = len(add)
print(add)
print(counter)

918


In [201]:
# write the preprocessed frame to disk without the index column
output_path = "apple_preprocessed.csv"
df.to_csv(output_path, index=False)