In [None]:
# import related packages
import pandas as pd
import numpy as np

In [None]:
# read the customer reviews CSV into a DataFrame
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV — confirm whether it should be kept.
reviews = pd.read_csv('iphone14_customer_review.csv')
# preview the first rows
reviews.head()

Unnamed: 0,title,rating,review,customer_name,dates,customer_location
0,Terrific,5.0,I bought iPhone 14 in big billion days. Very h...,Sathvick Kumaran,4 months ago,The Nilgiris District
1,Fabulous!,5.0,Best smart phone under this price range compar...,Rahul Prasad,"Jan, 2023",Debipur
2,Great product,5.0,Nice camera but battery drain fast specially o...,Tara singh mehra,11 months ago,Ramnagar
3,Just wow!,5.0,GoodREAD MORE,Avi Nash,"Feb, 2023",Bengaluru
4,Good quality product,4.0,Awesome üëçREAD MORE,Ashwini biswal,"Oct, 2022",Bhubaneswar


In [None]:
# total number of rows in the dataset
reviews_num = len(reviews)
print('number of rows:', reviews_num)

# number of reviews whose text contains the literal "READ MORE"
reviews_read_more_num = reviews['review'].str.contains('READ MORE').sum()
print('number of reviews(contain READ MORE):', reviews_read_more_num)

number of rows: 1024
number of reviews(contain READ MORE): 1024


In [None]:
# Every review contains the literal text 'READ MORE', which adds nothing to
# the text mining, so remove it from each review.
reviews['review'] = reviews['review'].str.replace('READ MORE', '', regex=False)

# re-count to confirm that no review contains 'READ MORE' any more
reviews_read_more_num = reviews['review'].str.contains('READ MORE').sum()
print('number of reviews(contain READ MORE):', reviews_read_more_num)
reviews.head()

number of reviews(contain READ MORE): 0


Unnamed: 0,title,rating,review,customer_name,dates,customer_location
0,Terrific,5.0,I bought iPhone 14 in big billion days. Very h...,Sathvick Kumaran,4 months ago,The Nilgiris District
1,Fabulous!,5.0,Best smart phone under this price range compar...,Rahul Prasad,"Jan, 2023",Debipur
2,Great product,5.0,Nice camera but battery drain fast specially o...,Tara singh mehra,11 months ago,Ramnagar
3,Just wow!,5.0,Good,Avi Nash,"Feb, 2023",Bengaluru
4,Good quality product,4.0,Awesome üëç,Ashwini biswal,"Oct, 2022",Bhubaneswar


In [None]:
# reviews in some rows are not in English
# delete rows where reviews are not in English

# install packages
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m276.5/981.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m972.8/981.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m981.5/981.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Buil

In [None]:
from langdetect import DetectorFactory, LangDetectException, detect
DetectorFactory.seed = 0 # set the detector seed so repeated runs give consistent results

# create a function to detect language
def detect_language(text):
    """Return the language code that langdetect reports for ``text``.

    Parameters
    ----------
    text : object
        The review text. Values that are not a string containing at least
        one alphabetic character (NaN, empty or whitespace-only strings,
        digits/punctuation only, ...) are never passed to langdetect.

    Returns
    -------
    str
        Whatever ``detect`` returns (e.g. ``'en'``), or ``'unknown'`` when
        the text is not eligible for detection or langdetect raises
        ``LangDetectException``.
    """
    # Guard clause: a string with no letters (which also covers empty and
    # whitespace-only strings) gives the detector nothing to work with.
    if not isinstance(text, str) or not any(ch.isalpha() for ch in text):
        return 'unknown'
    try:
        return detect(text)
    except LangDetectException:
        # Only langdetect's own detection failure is mapped to 'unknown'.
        # A bare `except:` would also hide NameError, KeyboardInterrupt, etc.
        # and silently turn every review into 'unknown'.
        return 'unknown'

# Detect the language of every review and keep only the rows detected as
# English. A boolean mask is used directly, so no helper column has to be
# added to the frame and dropped again.
reviews = reviews.loc[reviews['review'].apply(detect_language) == 'en'].copy()

# number of reviews left after the language filter
reviews_only_english = len(reviews)
print('number of reviews that are written in English:',reviews_only_english)

number of reviews that are written in English: 732


In [None]:
# save the revised dataset to a new CSV file (index=False: the DataFrame index is not written)
# NOTE(review): the file name starts with 'ihpone' — probably a typo for 'iphone';
# left unchanged here in case other code reads this exact path. Confirm and rename.
reviews.to_csv('ihpone_14_customer_reviews_english.csv',index=False)