# Install and Prepare Packages

In [None]:
#install package from github
pip install --upgrade https://github.com/JoMingyu/google-play-scraper/tarball/master #Google Play Scraper

In [None]:
pip install Sastrawi #NLP of Bahasa Indonesia

In [None]:
#Preparing the library

#Scraping
import json
import pandas as pd
from tqdm import tqdm

from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter

from google_play_scraper import Sort, reviews, app

#Visualizing
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

#Cleansing
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from stop_words import get_stop_words
import re
import string
import collections

#TF IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Clustering
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.extmath import randomized_svd


# Data Scraping

In [None]:
#List of the apps, put the words after id=
app_packages = [
  'com.tokopedia.tkpd',
  'com.tokopedia.sellerapp',
  'com.tokopedia.kelontongapp',
  'com.shopee.id',
  'com.shopee.mitra.id',
  'com.lazada.android',
  'com.sc.lazada',
  'com.bukalapak.android',
  'com.bukalapak.mitra',
  'blibli.mobile.commerce',
  'blibli.instore.mitra',
  'com.gdn.blibli.mta',
  'jd.cdyjy.overseas.market.indonesia',
  'com.jdid.fans',
  'id.jd.cn.seller',
]

In [None]:
app_infos = []

# Fetch the Play Store details of every app, requesting lang='id' / country='id'.
for ap in tqdm(app_packages):
  info = app(ap, lang='id', country='id')
  # Drop the 'comments' entry before storing; pop() with a default does not
  # fail when the scraper result carries no such key.
  info.pop('comments', None)
  app_infos.append(info)

In [None]:
def print_json(json_object):
  """Print *json_object* as indented, key-sorted, syntax-highlighted JSON.

  Values the json module cannot serialize are converted with str().
  """
  dump_options = {'indent': 2, 'sort_keys': True, 'default': str}
  pretty = json.dumps(json_object, **dump_options)
  colored = highlight(pretty, JsonLexer(), TerminalFormatter())
  print(colored)

In [None]:
print_json(app_infos[0]) #App info of the first app

In [None]:
#Icons of the apps

def format_title(title):
  """Shorten an app title for use as a subplot caption.

  The title is cut at its first ':' (or, when it has no ':', at its first
  '-'), and the result is limited to 10 characters.
  """
  cut = title.find(':')
  if cut == -1:
    # No colon: fall back to the first dash, if any
    cut = title.find('-')
  if cut != -1:
    title = title[:cut]
  return title[:10]

# Draw every app icon in a 3-row grid, captioned with the shortened app title.
import io
import math
import urllib.request

n_rows = 3
# Round up so no app is dropped when the count is not a multiple of n_rows.
n_cols = max(1, math.ceil(len(app_infos) / n_rows))
fig, axs = plt.subplots(n_rows, n_cols, figsize=(14, 8), squeeze=False)

for i, ax in enumerate(axs.flat):
  if i >= len(app_infos):
    ax.axis('off')  # spare cell of the grid: leave it blank
    continue
  ai = app_infos[i]
  # ai['icon'] is expected to be an image URL (TODO confirm). Current
  # matplotlib no longer accepts URLs in plt.imread, so download the bytes
  # and let Pillow decode them.
  with urllib.request.urlopen(ai['icon']) as response:
    img = Image.open(io.BytesIO(response.read()))
  ax.imshow(img)
  ax.set_title(format_title(ai['title']))
  ax.axis('off')

In [None]:
#Export App Infos to CSV
app_infos_df = pd.DataFrame(app_infos)
app_infos_df.to_csv('D:/Analyze the Data Series/Ep 2 Extracting and Scrapping/Play Store/Online Retail Indonesia/apps_info_indonesia.csv', index=None, header=True)

In [None]:
#Scraping app reviews
# NOTE(review): no `lang` is passed to reviews() here, while the app-info cell
# passes lang='id' -- confirm the review language is the intended one.

app_reviews = []

# (sort order, label stored on every review fetched with that order)
sort_modes = (
  (Sort.MOST_RELEVANT, 'most_relevant'),
  (Sort.NEWEST, 'newest'),
)

for ap in tqdm(app_packages):
  for score in range(1, 6):
    # Define Sample Size: 200 reviews are requested for score 3, 100 otherwise
    sample_size = 200 if score == 3 else 100
    for sort_order, sort_label in sort_modes:
      batch, _ = reviews(
        ap,
        country='id',
        sort=sort_order,
        count=sample_size,
        filter_score_with=score
      )
      # Tag each review with the sort order and the app it was fetched for
      for review in batch:
        review['sortOrder'] = sort_label
        review['appId'] = ap
      app_reviews += batch
 
#source : https://towardsdatascience.com/create-dataset-for-sentiment-analysis-by-scraping-google-play-app-reviews-using-python-ceaaa0e41c1

In [None]:
len(app_reviews)

In [None]:
print_json(app_reviews[12875])

In [None]:
#Extract App Reviews to CSV
app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.to_csv('D:/Analyze the Data Series/Ep 2 Extracting and Scrapping/Play Store/Online Retail Indonesia/reviews_online_retail_eng.csv', index=None, header=True)

# Data Cleansing

In [None]:
#Import Data 
app_reviews_df = pd.read_csv('D:/Analyze the Data Series/Ep 2 Extracting and Scrapping/Play Store/Online Retail Indonesia/reviews_online_retail_indonesia.csv')
app_reviews_df.head()

In [None]:
#Check Null Value
print('Null Data:')
print(app_reviews_df.isnull().sum())

In [None]:
# Drop rows that are duplicated across userName, at, score and content: the same person can't give the same score and the same review at the same time
data1 = app_reviews_df.drop_duplicates(subset=['userName','at','score','content'])
print('{:,} rows; {:,} columns'
      .format(data1.shape[0], data1.shape[1]))

In [None]:
# Drop rows whose review text duplicates an earlier row.
# .copy() makes data2 an independent DataFrame, so the columns assigned to it
# in the later cells do not trigger pandas' SettingWithCopyWarning.
data2 = data1.drop_duplicates(subset=['content']).copy()
print(f'{data2.shape[0]:,} rows; {data2.shape[1]:,} columns')

## Text Pre-Processing

In [None]:
#create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def case_fold(text, title = None):
    """Normalize one review text before stopword removal.

    The text is lowercased, every run of digits and every run of the
    characters ``,.;@#?!&$`` (with any spaces that follow) is replaced by a
    single space, the ends are trimmed, and the result is stemmed with the
    module-level ``stemmer``.

    Args:
        text: The raw review text. Non-string values (for example the NaN
            pandas uses for a missing review) are treated as empty text
            instead of raising AttributeError.
        title: Unused; kept so existing calls keep working.

    Returns:
        The normalized, stemmed text, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # lowercase
    text = re.sub(r"\d+", " ", text)  # remove numbers
    text = re.sub(r"[,.;@#?!&$]+\ *", " ", text)  # remove punctuation
    text = text.strip()  # remove leading/trailing whitespace
    return stemmer.stem(text)  # stemming

#source https://medium.com/@ksnugroho/dasar-text-preprocessing-dengan-python-a4fa52608ffe

In [None]:
#list of stopwords, Adjust stopwords to your contents and your topics
stopwords = set(get_stop_words('indonesian'))
stopwords.update(get_stop_words('english'))
stopwords.update(["nya",'yg','gak','ga','udah','gk','kalo','sy','ya','tdk','sya','pa','nih','uda','udh','kl','lg','lgi','koq',
                  'tpi','tp','aja','ja','lbh','lbih','dr','dri','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
                 'q','r','s','t','u','v','w','x','y','z','dah','gue','deh','sih','lo','banget','bgt','blm','sdh','klo','jgn',
                  'jg','kya','gw','ngga','dpt','dpat','ko','sampe','smp','smpe','juga','gtu','bs','bsa','nggak','loh','ni','kak',
                 'ehh','yaa','kaya','krna','krn','karna','jadi','untuk','terus'])
stopwords.update(['aplikasi','mitra','seller','tokopedia','blibli','bukalapak','shopee','lazada','jdid','jd','id'])


In [None]:
#Adjust lowercase, remove numbers, punctuation, and stemming
data2["processed"] = data2["content"].apply(case_fold)

In [None]:
#remove stopwords
data2['cleaned'] = data2['processed'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
print(data2[['appId','content','score','processed','cleaned']])

In [None]:
#Extract review after pre-processing text to CSV
reviews_clean = data2[['appId','content','score','processed','cleaned']]
reviews_clean_df = pd.DataFrame(reviews_clean)
reviews_clean_df.to_csv('D:/Analyze the Data Series/Ep 2 Extracting and Scrapping/Play Store/Online Retail Indonesia/reviews_clean.csv', index=None, header=True)

# Word frequency Visualization

In [None]:
#Visualize Word Frequency with Bar Chart and word Cloud
#Update your stopwords by checking on these Graph

In [None]:
#Import Data 
reviews_clean_df = pd.read_csv('D:/Analyze the Data Series/Ep 2 Extracting and Scrapping/Play Store/Online Retail Indonesia/reviews_clean.csv')
reviews_clean_df.head()

In [None]:
#Word Cloud

def show_wordcloud(data, title = None):
    """Render a word cloud built from an iterable of review texts.

    Args:
        data: Iterable of texts (e.g. a DataFrame column). Entries that are
            not strings are skipped: texts that were empty when written to
            CSV are read back as NaN, and joining those raised TypeError.
        title: Optional figure title.

    The module-level ``stopwords`` set is excluded from the cloud.
    """
    text = " ".join(review for review in data if isinstance(review, str))
    text = text.lower()
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1  # fixed seed
    ).generate(text)

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()

show_wordcloud(reviews_clean_df['processed'])

In [None]:
#Cleaned reviews
# Rows whose cleaned text is empty are read back from the CSV as NaN; drop
# them so that every entry is a string that can be split into words.
product = reviews_clean_df['cleaned'].dropna().astype(str).values.tolist()
words_in_review = [review.split() for review in product]

# List of all words across reviews
# (the original iterated over the undefined name `review_nsw` -> NameError)
all_words = [word for words in words_in_review for word in words]

# Create counter
import collections
counts_words = collections.Counter(all_words)

counts_words.most_common(15)

In [None]:
# Table of the 15 most frequent words and their counts
top_words = counts_words.most_common(15)
most_words = pd.DataFrame(top_words, columns=['words', 'count'])

fig, ax = plt.subplots(figsize=(8, 8))

# Horizontal bar graph of the words, ordered by ascending count
ranked = most_words.sort_values(by='count')
ranked.plot.barh(x='words', y='count', ax=ax, color="purple")

ax.set_title("Common Words Found in Review (Including All Words)")

plt.show()

# Analyze Reviews by Clustering the words

In [None]:
#Import The Data
reviews_clean_df = pd.read_csv('D:/Analyze the Data Series/Ep 2 Extracting and Scrapping/Play Store/Online Retail Indonesia/reviews_clean.csv')

In [None]:
#For example, I want to know what users don't like about the apps, so I keep only the reviews they scored 1 or 2.
neg_reviews = reviews_clean_df[reviews_clean_df["score"].isin([1, 2])]
neg_reviews.shape

In [1]:
#Check null rows
print('Null Data:')
print(neg_reviews.isnull().sum())

Null Data:


NameError: name 'neg_reviews' is not defined

In [None]:
# If you want to drop rows whose particular column is null, use this code
data_neg = neg_reviews.dropna(subset=['cleaned'])
print('{:,} rows; {:,} columns'
      .format(data_neg.shape[0], data_neg.shape[1]))

In [None]:
# tfidf vectorizer of scikit learn
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data_neg['cleaned'].values.astype('U'))
print(X.shape) # check shape of the document-term matrix
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

In [None]:
#Find the Optimal number of Cluster

# Inertia (stored as "sse") of the fitted model for each candidate k
sse = {}

# Fit KMeans for k = 1..10. A fixed seed and an explicit n_init keep the
# elbow chart identical between runs and across scikit-learn versions.
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=549)
    kmeans.fit(X)
    sse[k] = kmeans.inertia_

In [None]:
#Graph of How to Find The Optimal Number of Cluster

# Add the title to the plot
plt.title('Elbow criterion method chart')

# Create and display a scatter plot
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()

In [None]:
num_clusters = 3 # Define the number of clusters based on the graph
# Fixed seed and explicit n_init: without them the cluster labels (which are
# exported to CSV further down) can differ from one run to the next.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=549).fit(X)
clusters = km.labels_.tolist()

In [None]:
# Applying LSA: randomized SVD of the TF-IDF matrix X
# The words of each concept
# NOTE(review): the SVD is computed from X alone and does not use the KMeans
# labels -- confirm that "Concept i" is meant to describe cluster i.

U, Sigma, VT = randomized_svd(X, n_components=num_clusters, n_iter=100,
                              random_state=549)
# Print the 7 highest-weighted terms of every concept
for concept_no, weights in enumerate(VT):
    ranked_terms = sorted(zip(terms, weights),
                          key=lambda pair: pair[1],
                          reverse=True)
    print(f"Concept {concept_no}: ")
    for term, _ in ranked_terms[:7]:
        print(term)
    print(" ")
        
#https://medium.com/kuzok/news-documents-clustering-using-python-latent-semantic-analysis-b95c7b68861c

In [None]:
#Merge data with label
label = km.labels_
label = pd.DataFrame(label)

df_label_neg = pd.concat([data_neg.reset_index(drop=True), label], axis=1)
df_label_neg.head()

In [None]:
print(df_label_neg.shape)
print(label[0].value_counts()) # Count the members of each label

In [None]:
#Extract File into CSV
df_label_neg.to_csv('D:/Analyze the Data Series/Ep 2 Extracting and Scrapping/Play Store/Online Retail Indonesia/label_neg.csv', index=None, header=True)