In [101]:
import pandas as pd
import numpy as np
from langdetect import detect

# 1. Read-in dataframe

In [102]:
# read in dataframe, only use the columns: reviewText, summary, overall, asin
df = pd.read_csv('apple.csv', usecols=["reviewText", "summary", "overall", "asin"])

# 2. Drop na values in reviewText

In [103]:
# drop rows with missing values in reviewText
print("\nDROP ROWS WITH MISSING VALUES IN REVIEWTEXT:")
print("\nBEFORE:")
print(df.isna().sum())
df = df.dropna(subset=['reviewText'])
print("\nAFTER:")
print(df.isna().sum())


DROP ROWS WITH MISSING VALUES IN REVIEWTEXT:

BEFORE:
overall         0
asin            0
reviewText    128
summary        72
dtype: int64

AFTER:
overall        0
asin           0
reviewText     0
summary       65
dtype: int64


# 3. Replace na 

In [92]:
# look at the na values in summary
print("\nLOOK AT THE NA VALUES IN SUMMARY")
print(df[df["summary"].isna()].shape)
df[df["summary"].isna()].head()


LOOK AT THE NA VALUES IN SUMMARY
(65, 4)


Unnamed: 0,overall,asin,reviewText,summary
3867,1.0,B003TGT48Q,received a defective Touch Screen. We had to b...,
5044,5.0,B004S227UW,I ordered black but,
5592,5.0,B004S227UW,Loved this phone too bad I now upgraded to a ip6,
9699,5.0,B005SSBE7W,,
13277,5.0,B0074R0Z3O,great phone love it. looks brand new,


In [93]:
df.loc[9699, :]

overall                       5.0
asin                   B005SSBE7W
reviewText                       
summary                       NaN
Name: 9699, dtype: object

In [94]:
# fill na values in summary with empty string
print("\nFILL NA VALUES IN SUMMARY WITH EMPTY STRING")
df["summary"][df["summary"].isna()] = ""
print(df[df["summary"] == ""].shape)
df[df["summary"] == ""].head()


FILL NA VALUES IN SUMMARY WITH EMPTY STRING
(65, 4)


Unnamed: 0,overall,asin,reviewText,summary
3867,1.0,B003TGT48Q,received a defective Touch Screen. We had to b...,
5044,5.0,B004S227UW,I ordered black but,
5592,5.0,B004S227UW,Loved this phone too bad I now upgraded to a ip6,
9699,5.0,B005SSBE7W,,
13277,5.0,B0074R0Z3O,great phone love it. looks brand new,


In [95]:
# look at the "star rating" text in the summary column
stars = ["One Star", "Two Stars", "Three Stars", "Four Stars", "Five Stars"]
df[df["summary"].isin(stars)].head(10)

Unnamed: 0,overall,asin,reviewText,summary
31,5.0,B001AXA056,this was a past order but still works fine :D,Five Stars
32,4.0,B001AXA056,"very nice, product, fast delivery, i recommend...",Four Stars
33,5.0,B001AXA056,"My son loves it, I cant even touch it. LOL",Five Stars
34,1.0,B001AXA056,Not good at all!,One Star
35,4.0,B001AXA056,good,Four Stars
132,4.0,B001CJTE0K,Great quality pouch fits my samsung galaxy S4 ...,Four Stars
133,4.0,B001CJTE0K,I love it....Thank You.....!!!!!,Four Stars
134,2.0,B001CJTE0K,ok,Two Stars
135,3.0,B001CJTE0K,5 stars,Three Stars
136,3.0,B001CJTE0K,b,Three Stars


In [96]:
# replace the summary text that just replicates the rating with empty string
df.loc[df["summary"].isin(stars), "summary"] = ""
indexes = [31, 32, 33, 34, 35, 132, 133, 134, 135, 136]
df.loc[indexes, :]

Unnamed: 0,overall,asin,reviewText,summary
31,5.0,B001AXA056,this was a past order but still works fine :D,
32,4.0,B001AXA056,"very nice, product, fast delivery, i recommend...",
33,5.0,B001AXA056,"My son loves it, I cant even touch it. LOL",
34,1.0,B001AXA056,Not good at all!,
35,4.0,B001AXA056,good,
132,4.0,B001CJTE0K,Great quality pouch fits my samsung galaxy S4 ...,
133,4.0,B001CJTE0K,I love it....Thank You.....!!!!!,
134,2.0,B001CJTE0K,ok,
135,3.0,B001CJTE0K,5 stars,
136,3.0,B001CJTE0K,b,


In [97]:
# combine the reviewText and summary columns
df["corpus"] = df["summary"] + " " + df["reviewText"]
df.head()

Unnamed: 0,overall,asin,reviewText,summary,corpus
0,1.0,B0013LKXEI,The only good thing about these headphones is ...,Dont buy these,Dont buy these The only good thing about these...
1,2.0,B0013LKXEI,These headphones came pretty quick. I can say ...,I suppose...,I suppose... These headphones came pretty quic...
2,5.0,B0013LKXEI,I was replacing my earbuds and was really hesi...,Awesome!,Awesome! I was replacing my earbuds and was re...
3,4.0,B0013LKXEI,"They are pretty good. I would buy them again, ...",iPod headphones,iPod headphones They are pretty good. I would ...
4,1.0,B0014HKJKY,These don't even come close to fitting the Iph...,Fits Like a Glove... on OJ Simpson's Hand,Fits Like a Glove... on OJ Simpson's Hand Thes...


In [98]:
# drop the reviewText and summary columns
df = df.drop(columns=["reviewText", "summary"])
df.head()

Unnamed: 0,overall,asin,corpus
0,1.0,B0013LKXEI,Dont buy these The only good thing about these...
1,2.0,B0013LKXEI,I suppose... These headphones came pretty quic...
2,5.0,B0013LKXEI,Awesome! I was replacing my earbuds and was re...
3,4.0,B0013LKXEI,iPod headphones They are pretty good. I would ...
4,1.0,B0014HKJKY,Fits Like a Glove... on OJ Simpson's Hand Thes...


In [99]:
# look at rows that only contain non-english characters
df[~df["corpus"].str.contains("[a-zA-Z]")]

Unnamed: 0,overall,asin,corpus
5425,5.0,B004S227UW,:)
5831,1.0,B004YRBM1Q,", , . ."
9699,5.0,B005SSBE7W,
12484,5.0,B0074R1PI8,:-) :-)
13587,5.0,B0074R0Z3O,10/10
14122,5.0,B007WWII3A,:)
14351,5.0,B008VUZPUQ,=)
21883,4.0,B00F3IVOEA,:)
22502,4.0,B00F3IVOEA,:)
27111,1.0,B00F3J4B5S,:/


In [88]:
# delete rows that only contain special characters and no characters
rows_no_chars = df[~df["corpus"].str.contains("[a-zA-Z]")].shape[0]
print(f"DELETING {rows_no_chars} ROWS THAT DO NOT CONTAIN ANY ENGLISH CHARACTERS")
df = df[df["corpus"].str.contains("[a-zA-Z]")]

DELETING 0 ROWS THAT DO NOT CONTAIN ANY ENGLISH CHARACTERS


In [56]:
# drop rows that have less than X words in the corpus
df["filter"] = df["corpus"].apply(lambda x: x if len(x.split()) > 10 else np.nan)
print(f"Number of rows before filtering: {df.shape[0]}")
df = df.dropna(subset=["filter"])
print(f"Number of rows after filtering: {df.shape[0]}")

Number of rows before filtering: 65931
Number of rows after filtering: 42175


KeyError: "None of [Index([30, 31, 33, 34, 35], dtype='int64')] are in the [index]"

In [257]:
print(df.shape)
df[df["filter"].isna()].shape
df[df["filter"].isna()].head()

(42175, 4)


Unnamed: 0,overall,asin,corpus,filter


In [127]:



#df_new = df.dropna(subset=['corpus'])
#print(df_new.shape)

(65961, 6)
(21830, 6)


Unnamed: 0,overall,asin,reviewText,summary,corpus,filter
30,5.0,B001AXA056,Loved it wen i used,Great Phone,Great Phone Loved it wen i used,
34,1.0,B001AXA056,Not good at all!,One Star,One Star Not good at all!,
35,4.0,B001AXA056,good,Four Stars,Four Stars good,
36,1.0,B001AXA056,Broke even after a exchange. No recommended.,No recommended.,No recommended. Broke even after a exchange. N...,
133,4.0,B001CJTE0K,I love it....Thank You.....!!!!!,Four Stars,Four Stars I love it....Thank You.....!!!!!,


In [129]:
df["corpus"][34]

'One Star Not good at all!'

In [None]:
df = df[df['reviewText'].str.len() > 10]
type(df["reviewText"][65872])
df.head(2)

In [14]:
df.iloc[:, 9]

0         NaN
1         NaN
2         NaN
3           3
4         NaN
         ... 
66084     NaN
66085     4.0
66086     2.0
66087     7.0
66088    17.0
Name: vote, Length: 66089, dtype: object

In [5]:
# drop nan values in the reviewText column
print(df.isna().sum())
df_new = df.dropna(subset=['reviewText'])
df_new.isna().sum()

overall               0
verified              0
reviewTime            0
reviewerID            0
asin                  0
reviewerName          5
reviewText          128
summary              72
unixReviewTime        0
vote              55682
image             64915
style             32399
dtype: int64


overall               0
verified              0
reviewTime            0
reviewerID            0
asin                  0
reviewerName          5
reviewText            0
summary              65
unixReviewTime        0
vote              55574
image             64814
style             32348
dtype: int64

In [6]:
# combine the rows of reviewText and summary into one column if the summary is not nan
df_new["summary"] = df_new["summary"].replace(np.nan, "")
df_new["corpus"] = df_new["reviewText"] + " " + df_new["summary"]
print(df_new.isna().sum())
df_new.head(2)

overall               0
verified              0
reviewTime            0
reviewerID            0
asin                  0
reviewerName          5
reviewText            0
summary               0
unixReviewTime        0
vote              55574
image             64814
style             32348
corpus                0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new["summary"] = df_new["summary"].replace(np.nan, "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new["corpus"] = df_new["reviewText"] + " " + df_new["summary"]


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style,corpus
0,1.0,True,"07 2, 2013",A2X38TJZHB6EC6,B0013LKXEI,Ruckuss,The only good thing about these headphones is ...,Dont buy these,1372723200,,,,The only good thing about these headphones is ...
1,2.0,True,"06 29, 2013",A3VNPI9ZGMMB45,B0013LKXEI,Karen,These headphones came pretty quick. I can say ...,I suppose...,1372464000,,,,These headphones came pretty quick. I can say ...


In [7]:
df_new["corpus"] = df_new["corpus"].str.replace("\n", " ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new["corpus"] = df_new["corpus"].str.replace("\n", " ")


In [9]:
add = []
for row in df_new["corpus"]:
    try:
        if detect(row) != "en":
            add.append(row)
    except:
        print("This row throws and error:", row)
print(add)

This row throws and error:                     
This row throws and error: :-) :-)
This row throws and error: 5* 5*


In [14]:
detect(df_new["corpus"][9688])

'en'

In [None]:
from googletrans import Translator
translator = Translator()
translator.translate('안녕하세요.')

In [11]:
detect('ok Two Stars')

'pl'

# Notes
- Filter reviews for length > 50 words
- lower case