# Install Dependencies


In [None]:
!pip install googletrans emoji pycorenlp



# Import Libraries


In [None]:
import json
import pickle
import re
import string
import emoji
import nltk
import pandas as pd
from googletrans import Translator
from pycorenlp import StanfordCoreNLP
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load Dataset


In [2]:
df = pd.read_csv("./app_review.csv")

In [3]:
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,32394922-251d-471d-a140-f4697939ef47,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Sakit bgt sebenarnya, aku tau peraturan. Tapi ...",1,10,2.20.02,2025-03-29 05:08:24,,,2.20.02
1,7723afaa-fe55-437f-b3a9-d5d85ac18b02,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,ini gimana sih. sudah punya akun login susah a...,1,1,2.20.03,2025-03-28 11:31:03,"Hai kurnia alamsyah, Mohon maaf untuk ketidakn...",2025-03-28 11:49:18,2.20.03
2,0ed27bd9-cefb-4c3a-8b43-32d9cf013ab6,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"07-03-2025 Di saat butuh dihari ini, tidak bis...",1,14,2.20.03,2025-03-28 22:54:51,"Hai Ali Umar, Mohon maaf untuk ketidaknyamanny...",2025-03-07 20:00:52,2.20.03
3,0cbc5ca6-5f68-4513-afc6-4c83e33be83f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"aplikasi sering tidak bisa dibuka, logout pun ...",2,0,2.20.03,2025-03-30 11:29:02,Hai Happy Blessed. Mohon maaf untuk ketidaknya...,2025-03-30 11:30:25,2.20.03
4,56bbc0c1-d469-45d3-888e-aaa00efe5387,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Sudah bayar malah d tagih terus. Dkira ga baya...,1,11,2.20.02,2025-03-20 23:57:10,Hai Jelita Jentik. Mohon maaf untuk ketidaknya...,2025-02-05 07:59:01,2.20.02


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21074 entries, 0 to 21073
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              21074 non-null  object
 1   userName              21074 non-null  object
 2   userImage             21074 non-null  object
 3   content               21074 non-null  object
 4   score                 21074 non-null  int64 
 5   thumbsUpCount         21074 non-null  int64 
 6   reviewCreatedVersion  15629 non-null  object
 7   at                    21074 non-null  object
 8   replyContent          18481 non-null  object
 9   repliedAt             18481 non-null  object
 10  appVersion            15629 non-null  object
dtypes: int64(2), object(9)
memory usage: 1.8+ MB


# Data Wrangling


In [17]:
async def translate_text(text):
    translator = Translator(service_urls=['translate.googleapis.com'])
    translated = await translator.translate(text, src='id', dest='en')
    return translated.text

In [None]:
for index, row in df.iterrows():
    translated_content = await translate_text(row['content'])
    translated_replyContent = await translate_text(row['replyContent'])

    df.loc[index, 'content_en'] = translated_content
    df.loc[index, 'replyContent_en'] = translated_replyContent

In [20]:
df.to_csv("./app_review_translated.csv")

# Data Cleaning


## Helper Function


In [4]:
def casefold(text):
    return text.lower()


def replace_punctuations(text):
    punctuations = set(string.punctuation)
    for char in text:
        if char in punctuations:
            text = text.replace(char, ' ')
    return text


def clear_emoji(text):
    return emoji.replace_emoji(text, ' ')


def tokenize_text(text):
    text = word_tokenize(text)
    return text


def lemmatize_tokens(word_tokens, word_lemmatizer):
    lemmatized_tokens = [word_lemmatizer.lemmatize(
        word) for word in word_tokens]
    return lemmatized_tokens


def remove_stopwords(word_tokens, stopwords_set):
    text = ' '.join(
        [word for word in word_tokens if word not in stopwords_set])
    text = text.strip()
    return text


lemmatizer = WordNetLemmatizer()
stopwords_English = set(stopwords.words('english'))


def complete_clean(text):
    text = clear_emoji(text)
    text = casefold(text)
    text = re.sub(r'[0-9]+', ' ', text)
    text = text.replace("'", "'")
    text = text.replace("’", "'")
    text = text.replace("´", "'")
    text = text.replace("-", " ")
    text = text.replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = replace_punctuations(text)
    text = tokenize_text(text)
    text = lemmatize_tokens(text, word_lemmatizer=lemmatizer)
    text = remove_stopwords(text, stopwords_English)
    text = text.strip()
    return text

## Clean Data


In [11]:
df_processed = pd.read_csv("./app_review_translated.csv", index_col=0)

In [12]:
df_processed.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,content_en,replyContent_en
0,32394922-251d-471d-a140-f4697939ef47,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Sakit bgt sebenarnya, aku tau peraturan. Tapi ...",1,10,2.20.02,2025-03-29 05:08:24,,,2.20.02,"Really sick, I know the rules. But cook to tol...",Nan
1,7723afaa-fe55-437f-b3a9-d5d85ac18b02,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,ini gimana sih. sudah punya akun login susah a...,1,1,2.20.03,2025-03-28 11:31:03,"Hai kurnia alamsyah, Mohon maaf untuk ketidakn...",2025-03-28 11:49:18,2.20.03,What is this. Already have a very difficult lo...,"Hi Kurnia Alamsyah, apologize for the discomfo..."
2,0ed27bd9-cefb-4c3a-8b43-32d9cf013ab6,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"07-03-2025 Di saat butuh dihari ini, tidak bis...",1,14,2.20.03,2025-03-28 22:54:51,"Hai Ali Umar, Mohon maaf untuk ketidaknyamanny...",2025-03-07 20:00:52,2.20.03,"07-03-2025 When you need this day, the applica...","Hi Ali Umar, apologize for the inconvenience. ..."
3,0cbc5ca6-5f68-4513-afc6-4c83e33be83f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"aplikasi sering tidak bisa dibuka, logout pun ...",2,0,2.20.03,2025-03-30 11:29:02,Hai Happy Blessed. Mohon maaf untuk ketidaknya...,2025-03-30 11:30:25,2.20.03,"Applications often cannot be opened, logouts s...",Hi happy blessed. Sorry for the inconvenience....
4,56bbc0c1-d469-45d3-888e-aaa00efe5387,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Sudah bayar malah d tagih terus. Dkira ga baya...,1,11,2.20.02,2025-03-20 23:57:10,Hai Jelita Jentik. Mohon maaf untuk ketidaknya...,2025-02-05 07:59:01,2.20.02,Already paid even d bolt continuously. Dkira d...,"Hi, Jentic. Sorry for the inconvenience. Curre..."


In [13]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21074 entries, 0 to 21073
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              21074 non-null  object
 1   userName              21074 non-null  object
 2   userImage             21074 non-null  object
 3   content               21074 non-null  object
 4   score                 21074 non-null  int64 
 5   thumbsUpCount         21074 non-null  int64 
 6   reviewCreatedVersion  15629 non-null  object
 7   at                    21074 non-null  object
 8   replyContent          18481 non-null  object
 9   repliedAt             18481 non-null  object
 10  appVersion            15629 non-null  object
 11  content_en            21074 non-null  object
 12  replyContent_en       21074 non-null  object
dtypes: int64(2), object(11)
memory usage: 2.3+ MB


In [None]:
df_processed.dropna(axis=0, inplace=True)
df_processed.drop_duplicates(inplace=True)
df_processed.reset_index(drop=True, inplace=True)
df_processed.drop(columns="replyContent_en", inplace=True)
df_processed.loc[:, 'cleaned_content_en'] = df_processed.loc[:,
                                                             'content_en'].apply(complete_clean)

In [16]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13631 entries, 0 to 13630
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              13631 non-null  object
 1   userName              13631 non-null  object
 2   userImage             13631 non-null  object
 3   content               13631 non-null  object
 4   score                 13631 non-null  int64 
 5   thumbsUpCount         13631 non-null  int64 
 6   reviewCreatedVersion  13631 non-null  object
 7   at                    13631 non-null  object
 8   replyContent          13631 non-null  object
 9   repliedAt             13631 non-null  object
 10  appVersion            13631 non-null  object
 11  content_en            13631 non-null  object
 12  cleaned_content_en    13631 non-null  object
dtypes: int64(2), object(11)
memory usage: 1.4+ MB


# Data Labeling


## Setup StanfordNLP


In [3]:
!python -m wget "https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip"
!python -m wget "https://nlp.stanford.edu/software/stanford-english-corenlp-2018-10-05-models.jar"
!unzip "stanford-corenlp-full-2018-10-05.zip"
!mv "stanford-english-corenlp-2018-10-05-models.jar" "stanford-corenlp-full-2018-10-05"


Saved under stanford-corenlp-full-2018-10-05 (1).zip

Saved under stanford-english-corenlp-2018-10-05-models.jar
Archive:  stanford-corenlp-full-2018-10-05.zip
   creating: stanford-corenlp-full-2018-10-05/
  inflating: stanford-corenlp-full-2018-10-05/jaxb-core-2.3.0.1-sources.jar  
  inflating: stanford-corenlp-full-2018-10-05/xom-1.2.10-src.jar  
  inflating: stanford-corenlp-full-2018-10-05/CoreNLP-to-HTML.xsl  
  inflating: stanford-corenlp-full-2018-10-05/README.txt  
  inflating: stanford-corenlp-full-2018-10-05/jollyday-0.4.9-sources.jar  
  inflating: stanford-corenlp-full-2018-10-05/LIBRARY-LICENSES  
   creating: stanford-corenlp-full-2018-10-05/sutime/
  inflating: stanford-corenlp-full-2018-10-05/sutime/british.sutime.txt  
  inflating: stanford-corenlp-full-2018-10-05/sutime/defs.sutime.txt  
  inflating: stanford-corenlp-full-2018-10-05/sutime/spanish.sutime.txt  
  inflating: stanford-corenlp-full-2018-10-05/sutime/english.sutime.txt  
  inflating: stanford-corenlp-ful

Run this code in terminal to activate the Stanford Core NLP Server

```bash
cd stanford-corenlp-full-2018-10-05
```

```bash
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer
```


In [17]:
coreNLP_Parser = StanfordCoreNLP('http://localhost:9000')

In [None]:
def StanfordNLP_Label(dataframe, column, parser=coreNLP_Parser):
    """ 
    This function returns labels from Very Negative (0) to Very Positive (1)
    Batch process to circumvent coreNLP limit
    """
    score = []
    labels = []
    column_Index = list(dataframe.columns).index(column)
    dataframe_length = len(df_processed)
    for starting_row in range(0, dataframe_length, 500):
        ending_row = starting_row + 500
        if ending_row >= dataframe_length:
            ending_row = dataframe_length+1
        result = json.loads(parser.annotate(
            text=' . '.join(
                dataframe.iloc[starting_row:ending_row, column_Index]),
            properties={'annotators': 'sentiment', 'outputFormat': 'json'}
        ))
        score_result = [value['sentimentValue']
                        for value in (result['sentences'])]
        label_result = [value['sentiment'] for value in (result['sentences'])]
        score.extend(score_result)
        labels.extend(label_result)
    return score, labels

## Labelling Data


In [19]:
df_processed.loc[:, 'Stanford Score'], df_processed.loc[:,
                                                        'Stanford Label'] = StanfordNLP_Label(df_processed, 'cleaned_content_en')

In [20]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13631 entries, 0 to 13630
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              13631 non-null  object
 1   userName              13631 non-null  object
 2   userImage             13631 non-null  object
 3   content               13631 non-null  object
 4   score                 13631 non-null  int64 
 5   thumbsUpCount         13631 non-null  int64 
 6   reviewCreatedVersion  13631 non-null  object
 7   at                    13631 non-null  object
 8   replyContent          13631 non-null  object
 9   repliedAt             13631 non-null  object
 10  appVersion            13631 non-null  object
 11  content_en            13631 non-null  object
 12  cleaned_content_en    13631 non-null  object
 13  Stanford Score        13631 non-null  object
 14  Stanford Label        13631 non-null  object
dtypes: int64(2), object(13)
memory usage

# Saving Preprocessed Data


In [21]:
with open('./processed_objects/app_review_labeled.pkl', 'wb') as file:
    pickle.dump(df_processed, file)