# Dependencies

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
!pip install nlp-id



# Deep Cleaning

## Get dirty data

In [None]:
%cd '/content/drive/MyDrive/Magang/DDB/sentiment mypertamina'

/content/drive/MyDrive/Magang/DDB/sentiment mypertamina


In [None]:
df = pd.read_csv('appStore_filtered_reviews.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      79 non-null     object
 1   Rating     79 non-null     int64 
 2   Review     79 non-null     object
 3   User Name  79 non-null     object
 4   Date       79 non-null     object
 5   Is Edited  79 non-null     bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 3.3+ KB


In [None]:
df.head(3)

Unnamed: 0,Title,Rating,Review,User Name,Date,Is Edited
0,Sangat buruk,1,Saya mendaftarkan aplikasi ini utk orang tua s...,pertaminj,2024-01-01 09:54:16,False
1,Server error terus pas mau bayar via QR,1,Tolong diperbaiki layanannya. Sudah beberapa k...,Dika012,2024-03-21 02:47:42,False
2,Aplikasi dak jelas,1,"Aplikasi dak jelas, udah daftar tapi masih aja...",Evans sanders,2024-03-21 00:33:31,False


In [None]:
# Build one text per review: "<Title> <Review>".
# Element-wise Series concatenation works row by row, so it does not depend on
# the index being 0..n-1 the way the per-row `df['Title'][idx]` label lookup did,
# and it avoids a Python-level loop.
combined = (df['Title'] + " " + df['Review']).tolist()

len(combined)

79

In [None]:
# Wrap the list of combined texts in a single-column DataFrame.
combined = pd.DataFrame({'original_reviews': combined})
combined.head(3)

Unnamed: 0,original_reviews
0,Sangat buruk Saya mendaftarkan aplikasi ini ut...
1,Server error terus pas mau bayar via QR Tolong...
2,"Aplikasi dak jelas Aplikasi dak jelas, udah da..."


## Load necessary file for cleaning

In [None]:
%cd '/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore'

/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore


### root words

In [None]:
file_path = 'combined_root_words.txt'

# Read the file line by line and strip the newline / surrounding whitespace
# from each entry while building the list.
with open(file_path, "r") as file:
    root_words = [line.strip() for line in file]

In [None]:
len(root_words)

28010

In [None]:
root_words[:3]

['abad', 'abah', 'abangan']

### slang words

In [None]:
import ast

# Text file whose content is a Python dict literal (slang word -> replacement)
file_path = 'combined_slang_words.txt'

# Pull the whole file into memory as one string
with open(file_path, "r") as file:
    file_contents = file.read()

# literal_eval only accepts Python literals, so the file content is parsed,
# never executed
slang_words = ast.literal_eval(file_contents)

In [None]:
len(slang_words)

1018

In [None]:
slang_words

{'@': 'di',
 'abis': 'habis',
 'ad': 'ada',
 'adlh': 'adalah',
 'afaik': 'as far as i know',
 'ahaha': 'haha',
 'aj': 'saja',
 'ajep-ajep': 'dunia gemerlap',
 'ak': 'saya',
 'akika': 'aku',
 'akkoh': 'aku',
 'akuwh': 'aku',
 'alay': 'norak',
 'alow': 'halo',
 'ambilin': 'ambilkan',
 'ancur': 'hancur',
 'anjrit': 'anjing',
 'anter': 'antar',
 'ap2': 'apa-apa',
 'apasih': 'apa sih',
 'apes': 'sial',
 'aps': 'apa',
 'aq': 'saya',
 'aquwh': 'aku',
 'asbun': 'asal bunyi',
 'aseekk': 'asyik',
 'asekk': 'asyik',
 'asem': 'asam',
 'aspal': 'asli tetapi palsu',
 'astul': 'asal tulis',
 'ato': 'atau',
 'au ah': 'tidak mau tahu',
 'awak': 'saya',
 'ay': 'sayang',
 'ayank': 'sayang',
 'b4': 'sebelum',
 'bakalan': 'akan',
 'bandes': 'bantuan desa',
 'bangedh': 'banget',
 'banpol': 'bantuan polisi',
 'banpur': 'bantuan tempur',
 'basbang': 'basi',
 'bcanda': 'bercanda',
 'bdg': 'bandung',
 'begajulan': 'nakal',
 'beliin': 'belikan',
 'bencong': 'banci',
 'bentar': 'sebentar',
 'ber3': 'bertiga',
 'b

### stop words

In [None]:
from nlp_id.stopword import StopWord

# Only the word list is kept under this name; the StopWord object is discarded.
stop_words = StopWord().get_stopword()

In [None]:
stop_words[:20]

['ada',
 'adalagi',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agak-agak',
 'agaknya',
 'agar',
 'aja',
 'akan',
 'akankah',
 'akankan',
 'akhir',
 'akhir-akhirnya',
 'akhirannya',
 'akhiri',
 'akhirinya',
 'akhirnya',
 'aku']

In [None]:
len(stop_words)

1168

## Cleaning Process

In [None]:
df = combined.copy()
df.head(5)

Unnamed: 0,original_reviews
0,Sangat buruk Saya mendaftarkan aplikasi ini ut...
1,Server error terus pas mau bayar via QR Tolong...
2,"Aplikasi dak jelas Aplikasi dak jelas, udah da..."
3,Barcode eror Sering eror !
4,"APLIKASI SAMPAH Daftar udah, tapu nunggu verif..."


### Text Preprocessing

In [None]:
def preprocessing(text):
    '''
    Normalize one raw review string.

    - lower-cases the text
    - turns every character outside A-Z / a-z (digits, punctuation, symbols,
      non-ASCII letters) into a space
    - squeezes the resulting whitespace runs and trims both ends
    '''
    letters_only = re.sub(r'[^A-Za-z]',' ', text.lower())

    regex_pattern = r"\s{2,}|\t|\n"
    collapsed = re.sub(regex_pattern, " ", letters_only)

    return collapsed.strip()

In [None]:
preprocessed_data = df['original_reviews'].apply(preprocessing)
preprocessed_data[0]

'sangat buruk saya mendaftarkan aplikasi ini utk orang tua saya saya berbeda kota dengan orang tua saya bersyukur kota yang saya tinggali tidak peru menggunkan aplikasi ini orang tua saya gaptek sehingga saya bantu utk mendaftarkan dan mengajari cara pemakaian ntah kenapa aplikasi ini sangat ngeleg dan minta direset sy beberapa kali mereset dan gagal dengan tampilan undefined is not an object evaluating f o tempareaadministrative b kalau memang wajib pakai aplikasi ini utk mengisi bbm cobalah utk meningkatkan performa aplikasi ini terlebih dahulu dan siapkan petugas di spbu yang siap dimintai tolong utk membantu menggunakan aplikasi'

### removing stop words

In [None]:
from nlp_id.stopword import StopWord
stopword = StopWord()

def clean_stop_words(text):
    '''Return `text` after passing it through the nlp_id StopWord instance `stopword`.'''
    without_stop_words = stopword.remove_stopword(text)
    return without_stop_words


In [None]:
cleaned_text = preprocessed_data.apply(clean_stop_words)

In [None]:
cleaned_text[0]

'buruk mendaftarkan aplikasi utk orang tua berbeda kota orang tua bersyukur kota tinggali peru menggunkan aplikasi orang tua gaptek bantu utk mendaftarkan mengajari pemakaian ntah aplikasi ngeleg direset sy mereset gagal tampilan undefined is not an object evaluating f o tempareaadministrative b wajib pakai aplikasi utk mengisi bbm cobalah utk meningkatkan performa aplikasi siapkan petugas spbu tolong utk membantu aplikasi'

### Lemmatization

In [None]:
!pip install nlp-id



In [None]:
from nlp_id.lemmatizer import Lemmatizer
lemmatizer = Lemmatizer()

def lemmatizing(text):
    '''
    Lemmatize one text with the module-level `lemmatizer`.

    The name `lemmatizer` is resolved when the function is called, not when it
    is defined. The translating section later rebinds that name to NLTK's
    WordNetLemmatizer, so calling this function after that cell has run would
    use the NLTK object instead of the nlp_id Lemmatizer.
    '''
    lemmatized_text = lemmatizer.lemmatize(text)
    return lemmatized_text


In [None]:
lemmatized = cleaned_text.apply(lemmatizing)

In [None]:
lemmatized[0]

'buruk daftar aplikasi utk orang tua beda kota orang tua syukur kota tinggal peru gun aplikasi orang tua gaptek bantu utk daftar ajar pakai ntah aplikasi ngeleg direset sy mereset gagal tampil undefined is not an object evaluating f o tempareaadministrative b wajib pakai aplikasi utk isi bbm coba utk tingkat performa aplikasi siap tugas spbu tolong utk bantu aplikasi'

### Normalize slang words

In [None]:
from nlp_id.tokenizer import Tokenizer
tokenizer = Tokenizer()

def tokenizing(text):
    '''Split one text into tokens using the nlp_id Tokenizer instance `tokenizer`.'''
    tokens = tokenizer.tokenize(text)
    return tokens

In [None]:
tokenized = lemmatized.apply(tokenizing)
tokenized[0]

['buruk',
 'daftar',
 'aplikasi',
 'utk',
 'orang',
 'tua',
 'beda',
 'kota',
 'orang',
 'tua',
 'syukur',
 'kota',
 'tinggal',
 'peru',
 'gun',
 'aplikasi',
 'orang',
 'tua',
 'gaptek',
 'bantu',
 'utk',
 'daftar',
 'ajar',
 'pakai',
 'ntah',
 'aplikasi',
 'ngeleg',
 'direset',
 'sy',
 'mereset',
 'gagal',
 'tampil',
 'undefined',
 'is',
 'not',
 'an',
 'object',
 'evaluating',
 'f',
 'o',
 'tempareaadministrative',
 'b',
 'wajib',
 'pakai',
 'aplikasi',
 'utk',
 'isi',
 'bbm',
 'coba',
 'utk',
 'tingkat',
 'performa',
 'aplikasi',
 'siap',
 'tugas',
 'spbu',
 'tolong',
 'utk',
 'bantu',
 'aplikasi']

In [None]:
def normalizing_slang(tokenized_text, slang_dict=None):
    '''
    Replace every token that is a key of the slang dictionary with its value.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of one review.
    slang_dict : dict, optional
        Mapping slang word -> replacement. When omitted, the notebook-level
        `slang_words` dictionary is used, so `Series.apply(normalizing_slang)`
        keeps working unchanged.

    Returns
    -------
    list of str
        One element per input token. A replacement may itself contain spaces
        (e.g. 'as far as i know'); it is kept as a single list element.
    '''
    mapping = slang_words if slang_dict is None else slang_dict
    # dict.get does the membership test and the lookup in a single step;
    # tokens that are not slang fall through unchanged.
    return [mapping.get(word, word) for word in tokenized_text]

In [None]:
normalized_slang = tokenized.apply(normalizing_slang)

In [None]:
normalized_slang[5]

['ga',
 'register',
 'kendara',
 'aplikasi',
 'dsuruh',
 'daftar',
 'untuk',
 'bbm',
 'subsidi',
 'tapi',
 'gilir',
 'kendara',
 'muncul',
 'error',
 'server',
 'jadi',
 'dr',
 'tunggu',
 'say',
 'baik',
 'nihil']

### Normalize words which are not stored in slang words and stop words

#### Check the progress

In [None]:
def join(list_word):
    '''Glue a list of tokens back into one space-separated string.'''
    separator = " "
    return separator.join(list_word)

joining = normalized_slang.apply(join)

In [None]:
comparison = pd.concat([df['original_reviews'], joining], axis=1)
comparison.columns = ['before', 'after']

In [None]:
comparison

Unnamed: 0,before,after
0,Sangat buruk Saya mendaftarkan aplikasi ini ut...,buruk daftar aplikasi untuk orang tua beda kot...
1,Server error terus pas mau bayar via QR Tolong...,server error bayar qr tolong baik layan bayar ...
2,"Aplikasi dak jelas Aplikasi dak jelas, udah da...",aplikasi dak aplikasi dak daftar ribet dak ban...
3,Barcode eror Sering eror !,barcode eror eror
4,"APLIKASI SAMPAH Daftar udah, tapu nunggu verif...",aplikasi sampah daftar tapu nunggu verifikasi ...
...,...,...
74,"User interfaca User interface lebih simple, sk...",user interfaca user interface simple skrg gak ...
75,Dongoooo Aplikasi SSIIAALLAANN,dongoooo aplikasi ssiiaallaann
76,Point sampah Adain fitur point yg bisa dituker...,point sampah adain fitur point dituker voucher...
77,Gajelas Makin update makin gak jelas? reset co...,gajelas update gak reset code claim gak


#### scan the unknown words

In [None]:
unknown_words = []

def detecting_unknown(list_word):
    '''
    Collect tokens that are not root words into the global `unknown_words`.

    Each token of `list_word` that is absent from `root_words` and not yet
    listed in `unknown_words` is appended to `unknown_words` (first-seen order
    is preserved, no duplicates).

    Parameters
    ----------
    list_word : iterable of str
        Tokens of one review.

    Returns
    -------
    The input `list_word`, unchanged, so the function can be used with
    `Series.apply`.
    '''
    # Set copies give constant-time membership tests. Testing against the
    # lists directly scans them for every token, and `root_words` holds
    # tens of thousands of entries.
    known = set(root_words)
    already_listed = set(unknown_words)

    for word in list_word:
        if word not in known and word not in already_listed:
            already_listed.add(word)
            unknown_words.append(word)

    return list_word

In [None]:
detect = normalized_slang.apply(detecting_unknown)

In [None]:
len(unknown_words)

317

In [None]:
unknown_words[:5]

['peru', 'gagap teknologi', 'ntah', 'ngeleg', 'direset']

#### using generative AI to obtain dictionary for normalizing

In [None]:
!pip install -U google-generativeai



In [None]:
import google.generativeai as genai
import os
from google.colab import userdata
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

# https://aistudio.google.com/app/u/3/apikey
# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Authentication.ipynb

In [None]:
# Model handle used by this and the following request cells.
model = genai.GenerativeModel('gemini-1.0-pro-latest')

# Ask the model for a normalization dictionary covering the first 100 unknown words.
# NOTE(review): the same prompt is sent again in the next three cells with the
# slices [100:200], [200:300] and [300:]; a helper taking the word slice would
# remove the duplication.
# NOTE(review): each backslash line continuation keeps the next line's leading
# spaces inside the prompt string.
response = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[:100]}. provide me a whole dictionary"
    )

# Raw reply text; it is parsed into a dict further down by str_to_dict().
result = response.text
print(result)

```python
unnormalized_normalized_dict = {
    'peru': 'peru',
    'gagap teknologi': 'gagap teknologi',
    'ntah': 'entah',
    'ngeleg': 'lemot',
    'direset': 'diatur ulang',
    'saya': 'saya',
    'mereset': 'mengatur ulang',
    'undefined': 'tidak terdefinisi',
    'is': 'adalah',
    'an': 'sebuah',
    'object': 'objek',
    'evaluating': 'mengevaluasi',
    'tempareaadministrative': 'tempat wilayah administratif',
    'wajib': 'wajib',
    'bbm': 'bahan bakar minyak',
    'spbu': 'stasiun pengisian bahan bakar umum',
    'tolong': 'tolong',
    'server': 'server',
    'error': 'kesalahan',
    'qr': 'kode QR',
    'dak': 'tidak',
    'ribet': 'rumit',
    'bangka': 'bangka',
    'jakarta': 'jakarta',
    'lebih baik': 'lebih baik',
    'vivo': 'vivo',
    'bangkrut': 'bangkrut',
    'pertamina': 'pertamina',
    'ngalah': 'mengalah',
    'barcode': 'kode batang',
    'tapu': 'kentang',
    'nunggu': 'menunggu',
    'verifikasi': 'verifikasi',
    'ga': 'tidak',
    'dongo':

In [None]:
response2 = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[100:200]}. provide me a whole dictionary"
    )

result2 = response2.text
print(result2)

```python
unnormalized_to_normalized_words = {
    "cash": "kas",
    "cashless": "tanpa uang tunai",
    "darkweb": "darkweb",
    "slow": "lambat",
    "dislowkan": "diperlambat",
    "cc": "cc",
    "cobain": "coba",
    "always": "selalu",
    "shell": "shell",
    "v": "v",
    "power": "kekuatan",
    "pertamax": "pertamax",
    "link": "tautan",
    "pesan": "pesan",
    "errornya": "kesalahannya",
    "yowis": "yowis",
    "login": "masuk",
    "issue": "masalah",
    "renders": "memberikan",
    "mypertamina": "mypertamina",
    "unusable": "tidak dapat digunakan",
    "recently": "baru-baru ini",
    "attempted": "mencoba",
    "to": "untuk",
    "utilize": "menggunakan",
    "the": "the",
    "application": "aplikasi",
    "access": "akses",
    "its": "aplikasinya",
    "features": "fiturnya",
    "only": "hanya",
    "encounter": "menemui",
    "significant": "signifikan",
    "hurdle": "rintangan",
    "with": "dengan",
    "process": "proses",
    "despite": "meskipun",


In [None]:
response3 = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[200:300]}. provide me a whole dictionary"
    )

result3 = response3.text
print(result3)

```python
unnormalized_normalized_dict = {
    'during': 'lama',
    'using': 'menggunakan',
    'saying': 'mengatakan',
    'that': 'bahwa',
    'overload': 'beban berlebih',
    'or': 'atau',
    'something': 'sesuatu',
    'useless': 'tidak berguna',
    'bug': 'kesalahan',
    'apps': 'aplikasi',
    'tidak ada uang': 'tidak punya uang',
    'recomended': 'disarankan',
    'otp': 'kata sandi sekali pakai',
    'password': 'kata sandi',
    'gk': 'tidak',
    'web': 'situs web',
    'trus': 'terus',
    'apk': 'paket aplikasi',
    'gunany': 'gunanya',
    'nyusahkan': 'menyusahkan',
    'verivikasi': 'verifikasi',
    'makassar': 'Makassar',
    'jl': 'Jalan',
    'rappocini': 'Rappocini',
    'pettarani': 'Pettarani',
    'diinput': 'dimasukkan',
    'etc': 'dan lain-lain',
    'notif': 'pemberitahuan',
    'salah': 'keliru',
    'reset': 'setel ulang',
    'install': 'pasang',
    'ngurusin': 'mengurus',
    'loadingnya': 'proses pemuatan',
    'gabisa': 'tidak bisa',
    'segla'

In [None]:
response4 = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[300:]}. provide me a whole dictionary"
    )

result4 = response4.text
print(result4)

```python
unnormalized_normalized_dict = {
    'uninstalled': 'uninstall',
    'mp': 'empe',
    'anehhh': 'aneh',
    'operator': 'operator',
    'ngumpulin': 'kumpulkan',
    'relogin': 'login ulang',
    'interfaca': 'interface',
    'simple': 'sederhana',
    'skrg': 'sekarang',
    'terutilisasi': 'termanfaatkan',
    'dongoooo': 'dong',
    'ssiiaallaann': 'sialan',
    'adain': 'adakan',
    'fitur': 'fitur',
    'gajelas': 'tidak jelas',
    'code': 'kode',
    'claim': 'klaim',
}
```


In [None]:
import ast

def str_to_dict(response):
    '''
    Extract the Python dict literal embedded in a model reply and parse it.

    Everything before the first "{" and after the last "}" (variable name,
    Markdown code fences, commentary) is discarded before parsing.

    Parameters
    ----------
    response : str
        Raw reply text, e.g. a fenced code block containing `name = {...}`.

    Returns
    -------
    dict
        The parsed dictionary.

    Raises
    ------
    ValueError
        If no "{" ... "}" pair is present (for instance a reply that was cut
        off before the closing brace) or the literal is not a dictionary.
        Parsing errors raised by `ast.literal_eval` (ValueError / SyntaxError)
        propagate unchanged.
    '''
    start = response.find("{")
    # Take the LAST closing brace: a "}" inside a quoted value must not end
    # the literal early.
    end = response.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no complete {...} dictionary literal found in response")

    parsed = ast.literal_eval(response[start : end + 1])
    if not isinstance(parsed, dict):
        # e.g. "{1, 2}" parses to a set, which callers cannot merge with .items()
        raise ValueError("the {...} literal in response is not a dictionary")

    return parsed

In [None]:
result = str_to_dict(result)
result2 = str_to_dict(result2)
result3 = str_to_dict(result3)
result4 = str_to_dict(result4)

In [None]:
# Merge the four partial dictionaries into one; when a key occurs in more
# than one chunk, the value from the later chunk wins.
norm = {**result, **result2, **result3, **result4}

print(norm)

{'peru': 'peru', 'gagap teknologi': 'gagap teknologi', 'ntah': 'entah', 'ngeleg': 'lemot', 'direset': 'diatur ulang', 'saya': 'saya', 'mereset': 'mengatur ulang', 'undefined': 'tidak terdefinisi', 'is': 'adalah', 'an': 'sebuah', 'object': 'objek', 'evaluating': 'mengevaluasi', 'tempareaadministrative': 'tempat wilayah administratif', 'wajib': 'wajib', 'bbm': 'bahan bakar minyak', 'spbu': 'stasiun pengisian bahan bakar umum', 'tolong': 'tolong', 'server': 'server', 'error': 'kesalahan', 'qr': 'kode QR', 'dak': 'tidak', 'ribet': 'rumit', 'bangka': 'bangka', 'jakarta': 'jakarta', 'lebih baik': 'lebih baik', 'vivo': 'vivo', 'bangkrut': 'bangkrut', 'pertamina': 'pertamina', 'ngalah': 'mengalah', 'barcode': 'kode batang', 'tapu': 'kentang', 'nunggu': 'menunggu', 'verifikasi': 'verifikasi', 'ga': 'tidak', 'dongo': 'bodoh', 'register': 'mendaftar', 'dsuruh': 'disuruh', 'dr': 'dokter', 'say': 'kata', 'months': 'bulan', 'scan': 'memindai', 'all': 'semua', 'outlet': 'gerai', 'balikpapan': 'balikp

In [None]:
# check whether the number of normalized words equals the number of unknown words
# NOTE(review): equal lengths alone do not show that the keys of `norm` are the
# same words as `unknown_words` — consider comparing the sets as well.
len(norm) == len(unknown_words)

True

#### normalizing words that do not exist in the Indonesian dictionary

In [None]:
normalized_slang

0     [buruk, daftar, aplikasi, untuk, orang, tua, b...
1     [server, error, bayar, qr, tolong, baik, layan...
2     [aplikasi, dak, aplikasi, dak, daftar, ribet, ...
3                                 [barcode, eror, eror]
4     [aplikasi, sampah, daftar, tapu, nunggu, verif...
                            ...                        
74    [user, interfaca, user, interface, simple, skr...
75                   [dongoooo, aplikasi, ssiiaallaann]
76    [point, sampah, adain, fitur, point, dituker, ...
77      [gajelas, update, gak, reset, code, claim, gak]
78    [bayar, pakai, my, pertamina, tolong, tingkat,...
Name: original_reviews, Length: 79, dtype: object

In [None]:
def normalizing_unknown(list_words, mapping=None):
    '''
    Replace every token that is a key of the normalization dictionary.

    Each token is looked up on its own, position by position. The previous
    implementation searched the partially rewritten list with `list.index`,
    so a replacement that equals another token of the review (e.g. a -> b
    while b -> c) was rewritten again and the real occurrence was left
    untouched; it also rescanned the list for every token.

    Parameters
    ----------
    list_words : iterable of str
        Tokens of one review.
    mapping : dict, optional
        Mapping token -> normalized form. When omitted, the notebook-level
        `norm` dictionary is used, so `Series.apply(normalizing_unknown)`
        keeps working unchanged.

    Returns
    -------
    list of str
        A new list with one element per input token; the input is not
        modified. A normalized form may contain spaces (e.g. 'kode QR') and
        is kept as a single element.
    '''
    lookup = norm if mapping is None else mapping
    return [lookup.get(word, word) for word in list_words]

In [None]:
normalized_unknown = normalized_slang.apply(normalizing_unknown)

In [None]:
normalized_unknown

0     [buruk, daftar, aplikasi, untuk, orang, tua, b...
1     [server, kesalahan, bayar, kode QR, tolong, ba...
2     [aplikasi, tidak, aplikasi, tidak, daftar, rum...
3                             [kode batang, eror, eror]
4     [aplikasi, sampah, daftar, kentang, menunggu, ...
                            ...                        
74    [pengguna, antarmuka, pengguna, interface, sed...
75                             [dong, aplikasi, sialan]
76    [point, sampah, adakan, fitur, point, ditukar,...
77    [tidak jelas, memperbarui, tidak, setel ulang,...
78    [bayar, pakai, saya, pertamina, tolong, tingka...
Name: original_reviews, Length: 79, dtype: object

#### repeat some cleaning process for better result

In [None]:
normalized_unknown = normalized_unknown.apply(join)

In [None]:
normalized_unknown

0     buruk daftar aplikasi untuk orang tua beda kot...
1     server kesalahan bayar kode QR tolong baik lay...
2     aplikasi tidak aplikasi tidak daftar rumit tid...
3                                 kode batang eror eror
4     aplikasi sampah daftar kentang menunggu verifi...
                            ...                        
74    pengguna antarmuka pengguna interface sederhan...
75                                 dong aplikasi sialan
76    point sampah adakan fitur point ditukar kupon ...
77    tidak jelas memperbarui tidak setel ulang kode...
78    bayar pakai saya pertamina tolong tingkat stas...
Name: original_reviews, Length: 79, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(clean_stop_words)

In [None]:
normalized_unknown

0     buruk daftar aplikasi orang tua beda kota oran...
1     server kesalahan bayar kode QR tolong layan ba...
2     aplikasi aplikasi daftar rumit bangka belitung...
3                                 kode batang eror eror
4     aplikasi sampah daftar kentang menunggu verifi...
                            ...                        
74    pengguna antarmuka pengguna interface sederhan...
75                                      aplikasi sialan
76    point sampah adakan fitur point ditukar kupon ...
77                   memperbarui setel ulang kode klaim
78    bayar pakai pertamina tolong tingkat stasiun p...
Name: original_reviews, Length: 79, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(lemmatizing)

In [None]:
normalized_unknown

0     buruk daftar aplikasi orang tua beda kota oran...
1     server salah bayar kode qr tolong layan bayar ...
2     aplikasi aplikasi daftar rumit bangka belitung...
3                                 kode batang eror eror
4     aplikasi sampah daftar kentang tunggu verifika...
                            ...                        
74    guna antarmuka guna interface sederhana lot te...
75                                        aplikasi sial
76    point sampah adakan fitur point tukar kupon be...
77                          baru setel ulang kode klaim
78    bayar pakai pertamina tolong tingkat stasiun i...
Name: original_reviews, Length: 79, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(tokenizing)

In [None]:
normalized_unknown

0     [buruk, daftar, aplikasi, orang, tua, beda, ko...
1     [server, salah, bayar, kode, qr, tolong, layan...
2     [aplikasi, aplikasi, daftar, rumit, bangka, be...
3                            [kode, batang, eror, eror]
4     [aplikasi, sampah, daftar, kentang, tunggu, ve...
                            ...                        
74    [guna, antarmuka, guna, interface, sederhana, ...
75                                     [aplikasi, sial]
76    [point, sampah, adakan, fitur, point, tukar, k...
77                    [baru, setel, ulang, kode, klaim]
78    [bayar, pakai, pertamina, tolong, tingkat, sta...
Name: original_reviews, Length: 79, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(normalizing_slang)

In [None]:
normalized_unknown

0     [buruk, daftar, aplikasi, orang, tua, beda, ko...
1     [server, salah, bayar, kode, qr, tolong, layan...
2     [aplikasi, aplikasi, daftar, rumit, bangka, be...
3                            [kode, batang, eror, eror]
4     [aplikasi, sampah, daftar, kentang, tunggu, ve...
                            ...                        
74    [guna, antarmuka, guna, interface, sederhana, ...
75                                     [aplikasi, sial]
76    [point, sampah, adakan, fitur, point, tukar, k...
77                    [baru, setel, ulang, kode, klaim]
78    [bayar, pakai, pertamina, tolong, tingkat, sta...
Name: original_reviews, Length: 79, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(join)

In [None]:
df = normalized_unknown.to_frame()
df.head()

Unnamed: 0,original_reviews
0,buruk daftar aplikasi orang tua beda kota oran...
1,server salah bayar kode qr tolong layan bayar ...
2,aplikasi aplikasi daftar rumit bangka belitung...
3,kode batang eror eror
4,aplikasi sampah daftar kentang tunggu verifika...


## translating

In [None]:
import pandas as pd
df.head()

Unnamed: 0,original_reviews
0,buruk daftar aplikasi orang tua beda kota oran...
1,server salah bayar kode qr tolong layan bayar ...
2,aplikasi aplikasi daftar rumit bangka belitung...
3,kode batang eror eror
4,aplikasi sampah daftar kentang tunggu verifika...


In [None]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m41.0/42.3 kB[0m [31m913.1 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m792.1 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [None]:
from deep_translator import GoogleTranslator

def convert_eng(tweet):
    '''Translate one text from Indonesian ('id') to English ('en') via GoogleTranslator.'''
    id_to_en = GoogleTranslator(source='id', target='en')
    return id_to_en.translate(tweet)


df['original_reviews'] = df['original_reviews'].apply(convert_eng)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download the NLTK data packages named below.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# NOTE(review): this rebinds the global name `lemmatizer`, which until this cell
# held the nlp_id Lemmatizer used by lemmatizing(). Re-running lemmatizing()
# after this cell would call WordNetLemmatizer.lemmatize instead — consider a
# distinct name for one of the two objects.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocessing(text):
    '''
    Lower-case the text and blank out everything that is not an ASCII letter.

    Every digit, punctuation mark or other character outside A-Z / a-z becomes
    one space; runs of spaces are NOT collapsed here.
    '''
    # NOTE: this definition replaces the `preprocessing` declared earlier in the
    # notebook, which additionally collapsed and trimmed whitespace.
    lowered = text.lower()
    return re.sub(r'[^A-Za-z]',' ', lowered)

def tokenization(text):
    '''
    Break a sentence into its word tokens with NLTK's word_tokenize.
    '''
    return word_tokenize(text)

def remove_stopwords(tokenized_text, stopword_list=None):
    '''
    Filter stop words out of a tokenized text.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of one review.
    stopword_list : iterable of str, optional
        Words to drop. When omitted, NLTK's English stop word list
        (`stopwords.words("english")`) is used.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order. The
        comparison is exact (case-sensitive), as before.
    '''
    if stopword_list is None:
        # Fetch the list once per call; the previous comprehension called
        # stopwords.words("english") again for every single token.
        stopword_list = stopwords.words("english")

    # A set gives constant-time membership tests.
    blocked = set(stopword_list)
    return [token for token in tokenized_text if token not in blocked]

def lemmatization(removed_stopwords):
    '''
    Lemmatize each token with the module-level `lemmatizer` and return the
    results as one space-separated string.
    '''
    lemmas = []
    for word in removed_stopwords:
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

In [None]:
# applying the preprocessing function to the dataframe

preproces = df['original_reviews'].apply(preprocessing)
preproces[0:3]

0    bad list of applications parents are in differ...
1    server pays incorrectly with the qr code  plea...
2    complicated list application application  bang...
Name: original_reviews, dtype: object

In [None]:
# applying the tokenization function to the dataframe

tokenize = preproces.apply(tokenization)
tokenize[0:3]

0    [bad, list, of, applications, parents, are, in...
1    [server, pays, incorrectly, with, the, qr, cod...
2    [complicated, list, application, application, ...
Name: original_reviews, dtype: object

In [None]:
# applying the remove_stopwords function to the dataframe

remove_sw = tokenize.apply(remove_stopwords)
remove_sw[0:3]

0    [bad, list, applications, parents, different, ...
1    [server, pays, incorrectly, qr, code, please, ...
2    [complicated, list, application, application, ...
Name: original_reviews, dtype: object

In [None]:
# applying the lemmatization function to the dataframe

lemmatize = remove_sw.apply(lemmatization)
lemmatize[0:3]

0    bad list application parent different city par...
1    server pay incorrectly qr code please pay usin...
2    complicated list application application bangk...
Name: original_reviews, dtype: object

In [None]:
# store preprocessed reviews into dataframe

df['preprocessed_content'] = lemmatize

In [None]:
%cd '/content/drive/MyDrive/Magang/DDB/sentiment mypertamina'

/content/drive/MyDrive/Magang/DDB/sentiment mypertamina


In [None]:
df.to_csv("deep_cleaned_ready_for_predict_appstore.csv", index=False)

In [None]:
df.head()

Unnamed: 0,original_reviews,preprocessed_content
0,bad list of applications parents are in differ...,bad list application parent different city par...
1,"server pays incorrectly with the QR code, plea...",server pay incorrectly qr code please pay usin...
2,"Complicated list application application, Bang...",complicated list application application bangk...
3,error barcode error,error barcode error
4,rubbish application potato list wait for verif...,rubbish application potato list wait verificat...
