# Dependencies

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
!pip install nlp-id



# Deep Cleaning

## Get dirty data

In [None]:
%cd '/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore'

/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore


In [None]:
# we can get the data by scraping at playstore_mypertamina
df = pd.read_csv('filtered_data.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   reviewId              185 non-null    object 
 1   userName              185 non-null    object 
 2   userImage             185 non-null    object 
 3   content               185 non-null    object 
 4   score                 185 non-null    int64  
 5   thumbsUpCount         185 non-null    int64  
 6   reviewCreatedVersion  185 non-null    object 
 7   at                    185 non-null    object 
 8   replyContent          0 non-null      float64
 9   repliedAt             0 non-null      float64
 10  appVersion            185 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 16.0+ KB


In [None]:
df = df[['content']]
df.head(5)

Unnamed: 0,content
0,"Aplikasi itu harusnya memudahkan, ini malah se..."
1,Aplikasinya dalam 2 bulan terakhir ini mengece...
2,Aplikasi sampah. Sudah setahun lebih gak ada i...
3,"Aplikasi tidak berguna dan membingungkan, pend..."
4,SUGGESTION: Pada halaman ketika mengisi biodat...


## Load necessary file for cleaning

In [None]:
%cd '/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore'

/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore


### root words

In [None]:
file_path = 'combined_root_words.txt'

# The file holds one root word per line: iterate the handle directly and
# trim the trailing newline / surrounding whitespace from every entry.
with open(file_path, "r") as file:
    root_words = [line.strip() for line in file]

In [None]:
len(root_words)

28010

In [None]:
root_words[:3]

['abad', 'abah', 'abangan']

### slang words

In [None]:
import ast

# The slang lexicon is stored as the text of a Python dict literal.
file_path = 'combined_slang_words.txt'

# Pull the whole file into memory as a single string.
with open(file_path, "r") as file:
    file_contents = file.read()

# literal_eval only accepts Python literals, so nothing in the file is executed.
slang_words = ast.literal_eval(file_contents)

In [None]:
len(slang_words)

1018

In [None]:
slang_words

{'@': 'di',
 'abis': 'habis',
 'ad': 'ada',
 'adlh': 'adalah',
 'afaik': 'as far as i know',
 'ahaha': 'haha',
 'aj': 'saja',
 'ajep-ajep': 'dunia gemerlap',
 'ak': 'saya',
 'akika': 'aku',
 'akkoh': 'aku',
 'akuwh': 'aku',
 'alay': 'norak',
 'alow': 'halo',
 'ambilin': 'ambilkan',
 'ancur': 'hancur',
 'anjrit': 'anjing',
 'anter': 'antar',
 'ap2': 'apa-apa',
 'apasih': 'apa sih',
 'apes': 'sial',
 'aps': 'apa',
 'aq': 'saya',
 'aquwh': 'aku',
 'asbun': 'asal bunyi',
 'aseekk': 'asyik',
 'asekk': 'asyik',
 'asem': 'asam',
 'aspal': 'asli tetapi palsu',
 'astul': 'asal tulis',
 'ato': 'atau',
 'au ah': 'tidak mau tahu',
 'awak': 'saya',
 'ay': 'sayang',
 'ayank': 'sayang',
 'b4': 'sebelum',
 'bakalan': 'akan',
 'bandes': 'bantuan desa',
 'bangedh': 'banget',
 'banpol': 'bantuan polisi',
 'banpur': 'bantuan tempur',
 'basbang': 'basi',
 'bcanda': 'bercanda',
 'bdg': 'bandung',
 'begajulan': 'nakal',
 'beliin': 'belikan',
 'bencong': 'banci',
 'bentar': 'sebentar',
 'ber3': 'bertiga',
 'b

### stop words

In [None]:
from nlp_id.stopword import StopWord

# Fetch the stop-word list straight from a throwaway StopWord instance,
# so `stop_words` is only ever bound to the list itself.
stop_words = StopWord().get_stopword()

In [None]:
stop_words[:20]

['ada',
 'adalagi',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agak-agak',
 'agaknya',
 'agar',
 'aja',
 'akan',
 'akankah',
 'akankan',
 'akhir',
 'akhir-akhirnya',
 'akhirannya',
 'akhiri',
 'akhirinya',
 'akhirnya',
 'aku']

In [None]:
len(stop_words)

1168

## Cleaning Process

In [None]:
df.head(5)

Unnamed: 0,content
0,"Aplikasi itu harusnya memudahkan, ini malah se..."
1,Aplikasinya dalam 2 bulan terakhir ini mengece...
2,Aplikasi sampah. Sudah setahun lebih gak ada i...
3,"Aplikasi tidak berguna dan membingungkan, pend..."
4,SUGGESTION: Pada halaman ketika mengisi biodat...


### Text Preprocessing

In [None]:
def preprocessing(text):
    '''
    Lower-case the text, then blank out every non-letter character.

    Any character outside A-Z / a-z (digits, punctuation, symbols,
    whitespace, non-ASCII letters) is replaced by a single space; runs of
    spaces are not collapsed.
    '''
    lowered = text.lower()
    return re.sub(r'[^A-Za-z]',' ', lowered)

In [None]:
preprocessed_data = df['content'].apply(preprocessing)
preprocessed_data[0]

'aplikasi itu harusnya memudahkan  ini malah sebaliknya jadi menyulitkan  antrean udah lama pas giliran mau scan barcode aplikasinya eror  akhirnya gak bisa isi bbm    jadi sia sia saja ngantre lama   '

### removing stop words

In [None]:
from nlp_id.stopword import StopWord
stopword = StopWord()

def clean_stop_words(text):
    '''
    Return `text` after passing it through `stopword.remove_stopword`.

    Relies on the module-level `stopword` object (an nlp_id `StopWord`
    instance created just above this function).
    '''
    return stopword.remove_stopword(text)


In [None]:
cleaned_text = preprocessed_data.apply(clean_stop_words)

In [None]:
cleaned_text[0]

'aplikasi memudahkan menyulitkan antrean giliran scan barcode aplikasinya eror gak isi bbm sia sia ngantre'

### Lemmatization

In [None]:
!pip install nlp-id



In [None]:
from nlp_id.lemmatizer import Lemmatizer
lemmatizer = Lemmatizer()

def lemmatizing(text):
    '''
    Return `text` after passing it through `lemmatizer.lemmatize`.

    Relies on the module-level `lemmatizer` (an nlp_id `Lemmatizer`
    instance at this point in the notebook).
    NOTE(review): the name `lemmatizer` is rebound to an NLTK
    `WordNetLemmatizer` in the translating section, so this function must
    not be re-run after that cell without re-creating the nlp_id instance.
    '''
    return lemmatizer.lemmatize(text)


In [None]:
lemmatized = cleaned_text.apply(lemmatizing)

In [None]:
lemmatized[0]

'aplikasi mudah sulit antre gilir scan barcode aplikasi eror gak isi bbm sia sia ngantre'

### Normalize slang words

In [None]:
from nlp_id.tokenizer import Tokenizer
tokenizer = Tokenizer()

def tokenizing(text):
    '''
    Return the result of `tokenizer.tokenize(text)`.

    Relies on the module-level `tokenizer` (an nlp_id `Tokenizer` instance
    created just above this function).
    '''
    return tokenizer.tokenize(text)

In [None]:
tokenized = lemmatized.apply(tokenizing)
tokenized[0]

['aplikasi',
 'mudah',
 'sulit',
 'antre',
 'gilir',
 'scan',
 'barcode',
 'aplikasi',
 'eror',
 'gak',
 'isi',
 'bbm',
 'sia',
 'sia',
 'ngantre']

In [None]:
def normalizing_slang(tokenized_text):
    '''
    Swap every token that is a key of `slang_words` for its mapped value;
    all other tokens pass through unchanged. A new list is returned.
    '''
    return [slang_words.get(token, token) for token in tokenized_text]

In [None]:
normalized_slang = tokenized.apply(normalizing_slang)

In [None]:
normalized_slang[5]

['tahu',
 'aplikasi',
 'buruk',
 'hidup',
 'kwalitasnya',
 'nol',
 'banding',
 'aplikasi',
 'performa',
 'kecewa',
 'error',
 'dan',
 'lambat',
 'proses',
 'mohon',
 'it',
 'pertamina',
 'untuk',
 'baik',
 'kwalutas',
 'aplikasi',
 'ganti',
 'pinter',
 'bidang',
 'it']

### Normalize words that are not covered by the slang-word and stop-word lists

#### Check the progress

In [None]:
def join(list_word):
    '''
    Glue a list of words back into one space-separated string.
    '''
    separator = " "
    return separator.join(list_word)

joining = normalized_slang.apply(join)

In [None]:
comparison = pd.concat([df['content'], joining], axis=1)
comparison.columns = ['before', 'after']

In [None]:
comparison

Unnamed: 0,before,after
0,"Aplikasi itu harusnya memudahkan, ini malah se...",aplikasi mudah sulit antre gilir scan barcode ...
1,Aplikasinya dalam 2 bulan terakhir ini mengece...,aplikasi kecewa nopol daftar ajuk banding doku...
2,Aplikasi sampah. Sudah setahun lebih gak ada i...,aplikasi sampah tahun gak improvement ubah dat...
3,"Aplikasi tidak berguna dan membingungkan, pend...",aplikasi guna bingung daftar sulit belit belit...
4,SUGGESTION: Pada halaman ketika mengisi biodat...,suggestion halaman isi biodata on android pili...
...,...,...
180,Setiap mau bayar gak bisa di scan barcodenya j...,bayar gak scan barcodenya jelek aplikasi perin...
181,"Aplikasi tidak bermutu banget , baru juga mau ...",aplikasi mutu layan henti
182,Gak bisa di buka aplikasi nya habis update.. G...,gak buka aplikasi habis update bagaimana solus...
183,"apk ribet menyusahkan rakyat, wajib punya barc...",apk ribet susah rakyat wajib barcode tapi suli...


#### scan the unknown words

In [None]:
unknown_words = []

def detecting_unknown(list_word):
    '''
    Record every token that is neither in `root_words` nor already recorded.

    New tokens are appended to the module-level `unknown_words` list in
    first-seen order. The input list itself is returned unchanged, which
    lets the function be used with `Series.apply`.
    '''
    for token in list_word:
        # Skip known root words and tokens that were collected earlier.
        if token in root_words or token in unknown_words:
            continue
        unknown_words.append(token)

    return list_word

In [None]:
detect = normalized_slang.apply(detecting_unknown)

In [None]:
len(unknown_words)

410

In [None]:
unknown_words[:5]

['scan', 'barcode', 'gak', 'bbm', 'ngantre']

#### using generative AI to obtain dictionary for normalizing

In [None]:
!pip install -U google-generativeai



In [None]:
import google.generativeai as genai
import os
from google.colab import userdata
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

# https://aistudio.google.com/app/u/3/apikey
# https://github.com/google-gemini/cookbook/blob/main/quickstarts/Authentication.ipynb

In [None]:
# Model handle reused by every normalization request in this section.
model = genai.GenerativeModel('gemini-1.0-pro-latest')

# Ask the model for a {unnormalized: normalized} Python dictionary covering
# the first 100 entries of `unknown_words`.
response = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[:100]}. provide me a whole dictionary"
    )

# Keep the raw text of the reply; it is parsed into a dict in a later cell.
result = response.text
print(result)

```python
unnormalized_normalized_words = {
    "scan": "pindai",
    "barcode": "kode batang",
    "gak": "tidak",
    "bbm": "bahan bakar minyak",
    "ngantre": "mengantri",
    "nopol": "nomor polisi",
    "banding": "perbandingan",
    "ga": "tidak",
    "verificator": "verifikator",
    "nunggu": "menunggu",
    "gaguna": "tidak berguna",
    "improvement": "peningkatan",
    "error": "kesalahan",
    "lebih baik": "lebih baik",
    "repot": "merepotkan",
    "fitur": "fitur",
    "sangahan": "bantahan",
    "suggestion": "saran",
    "on": "aktif",
    "android": "android",
    "tenaik": "naik",
    "keboard": "keyboard",
    "handphone": "ponsel",
    "web": "situs web",
    "mengupload": "mengunggah",
    "ktp": "kartu tanda penduduk",
    "stnk": "surat tanda nomor kendaraan",
    "verifikasi": "verifikasi",
    "kwalitasnya": "kualitasnya",
    "it": "itu",
    "pertamina": "pertamina",
    "kwalutas": "kualitas",
    "pinter": "pintar",
    "cashless": "cashless",
    "wall

In [None]:
# Same request as above for entries 100-199 of `unknown_words`.
response2 = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[100:200]}. provide me a whole dictionary"
    )

# Keep the raw text of the reply; it is parsed into a dict in a later cell.
result2 = response2.text
print(result2)

```python
unnormalized_to_normalized = {
    "dg": "dengan",
    "gara": "karena",
    "halooo": "halo",
    "tidur": "tidur",
    "pke": "pakai",
    "wifi": "wifi",
    "connection": "koneksi",
    "muter": "mutar",
    "supir": "sopir",
    "gabisa": "tidak bisa",
    "prusahaan": "perusahaan",
    "servernya": "server",
    "saya": "saya",
    "wuuu": "wow",
    "ribet": "ribet",
    "barkot": "barcode",
    "dpan": "depan",
    "tah": "tau",
    "kekmna": "kemana",
    "mksdya": "maksudnya",
    "ferifikasi": "verifikasi",
    "perifikasi": "verifikasi",
    "kyakya": "kayaknya",
    "admin": "admin",
    "aplikasiya": "aplikasi",
    "kyak": "kayak",
    "org": "orang",
    "gagallll": "gagal",
    "teruss": "terus",
    "hbs": "habis",
    "upload": "upload",
    "foro": "foto",
    "an": "dan",
    "lahh": "lah",
    "diem": "diam",
    "nonton": "nonton",
    "daftarin": "daftarin",
    "hmm": "hmm",
    "bntu": "bantu",
    "by": "oleh",
    "call": "call",
    "center": "cen

In [None]:
# Same request as above for entries 200-299 of `unknown_words`.
response3 = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[200:300]}. provide me a whole dictionary"
    )

# Keep the raw text of the reply; it is parsed into a dict in a later cell.
result3 = response3.text
print(result3)

```python
unnormalized_normalized_word_pairs = {
    'fix': 'memperbaiki',
    'out': 'keluar',
    'taii': 'kotoran',
    'asuu': 'makian yang tidak pantas',
    'now watching': 'sedang menonton',
    'bayang': 'bayangan',
    'bank': 'bank',
    'flexible': 'fleksibel',
    'peetamax': 'Pertamax',
    'apps': 'aplikasi',
    'sblmnya': 'sebelumnya',
    'code': 'kode',
    'google': 'Google',
    'chrome': 'Chrome',
    'branda': 'merek',
    'harys': 'Hari',
    'ribetnya': 'kerumitannya',
    'website': 'situs web',
    'fotoin': 'memfoto',
    'hadeh': 'ungkapan kekesalan',
    'uninstal': 'mencopot pemasangan',
    'nolak': 'menolak',
    'playstore': 'Play Store',
    'ngotot': 'keras kepala',
    'gobl': 'bodoh',
    'ta': 'tentunya',
    'subsiditepat': 'subsidi tepat',
    'claim': 'klaim',
    'claimnya': 'klaimnya',
    'kualitasx': 'kualitasnya',
    'nnti': 'nanti',
    'okeeeeyyyy': 'enak sekali',
    'ui': 'antarmuka pengguna',
    'pertamax': 'Pertamax',
    'dapetin':

In [None]:
# Same request as above for every remaining entry of `unknown_words`
# (index 300 onward).
response4 = model.generate_content(
    f"I have a list of Indonesian words which do not exist in the Indonesian \
    dictionary. Provide me with a Python dictionary containing key-value pairs \
    of unnormalized and normalised words. Here is the list of Indonesian words.\
     {unknown_words[300:]}. provide me a whole dictionary"
    )

# Keep the raw text of the reply; it is parsed into a dict in a later cell.
result4 = response4.text
print(result4)

```python
unnormalized_normalized_dict = {
    'scanbarcode': 'scan barcode',
    'cache': 'cache',
    'eee': 'eee',
    'maksmal': 'maksimal',
    'mtr': 'meter',
    'cash': 'cash',
    'ngeselin': 'mengesalkan',
    'trdaftar': 'terdaftar',
    'bugg': 'bug',
    'detect': 'detect',
    'nope': 'tidak',
    'pdhl': 'padahal',
    'bnr': 'benar',
    'diperbaikin': 'diperbaiki',
    'saat': 'saat',
    'pengunaanya': 'penggunaannya',
    'didownload': 'diunduh',
    'diprint': 'dicetak',
    'logout': 'logout',
    'urgent': 'urgent',
    'sdm': 'sumber daya manusia',
    'mempuni': 'memadai',
    'ngbug': 'bug',
    'kend': 'kendala',
    'ngebak': 'membackup',
    'stempat': 'setempat',
    'ny': 'nya',
    'apknya': 'aplikasinya',
    'nyesel': 'menyesal',
    'simple': 'sederhana',
    'qode': 'kode',
    'diupgrade': 'diperbarui',
    'dbuka': 'dibuka',
    'masukkan': 'masukkan',
    'haduh': 'haduh',
    'dm': 'direct message',
    'ig': 'instagram',
    'dimaps': 'dipetakan'

In [None]:
import ast

def str_to_dict(response):
    '''
    Extract and parse the first Python dict literal found in `response`.

    The model replies seen in this notebook wrap the dictionary in a
    Markdown code fence and an assignment (``name = {...}``). Only the span
    from the first "{" to the first "}" that follows it is parsed, so the
    dictionary is expected to be flat (no nested braces).

    Parameters
    ----------
    response : str
        Raw text returned by the model.

    Returns
    -------
    dict
        The parsed dictionary.

    Raises
    ------
    ValueError
        If the text has no "{", has no "}" after it (for example a
        truncated reply), or the span evaluates to something other than a
        dict.
    SyntaxError
        If the span is not a valid Python literal.
    '''
    start = response.find("{")
    if start == -1:
        raise ValueError("no '{' found in response; nothing to parse")

    # Look for the closing brace only *after* the opening one: a stray "}"
    # earlier in the text would otherwise produce an empty slice.
    end = response.find("}", start)
    if end == -1:
        raise ValueError("no closing '}' after '{'; response may be truncated")

    parsed = ast.literal_eval(response[start : end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(
            f"expected a dict literal, got {type(parsed).__name__}"
        )
    return parsed

In [None]:
# Parse each raw model reply into a dictionary.
# NOTE: this rebinds result..result4 from str to dict, so the cell cannot be
# re-run on its own; re-run the generation cells first to restore the text.
result = str_to_dict(result)
result2 = str_to_dict(result2)
result3 = str_to_dict(result3)
result4 = str_to_dict(result4)

In [None]:
# Merge the four batch dictionaries into one lookup table. When the same key
# appears in more than one batch, the value from the later batch wins.
norm = {key: value for dic in [result, result2, result3, result4] for key, value in dic.items()}

print(norm)

{'scan': 'pindai', 'barcode': 'kode batang', 'gak': 'tidak', 'bbm': 'bahan bakar minyak', 'ngantre': 'mengantri', 'nopol': 'nomor polisi', 'banding': 'perbandingan', 'ga': 'tidak', 'verificator': 'verifikator', 'nunggu': 'menunggu', 'gaguna': 'tidak berguna', 'improvement': 'peningkatan', 'error': 'kesalahan', 'lebih baik': 'lebih baik', 'repot': 'merepotkan', 'fitur': 'fitur', 'sangahan': 'bantahan', 'suggestion': 'saran', 'on': 'aktif', 'android': 'android', 'tenaik': 'naik', 'keboard': 'keyboard', 'handphone': 'ponsel', 'web': 'situs web', 'mengupload': 'mengunggah', 'ktp': 'kartu tanda penduduk', 'stnk': 'surat tanda nomor kendaraan', 'verifikasi': 'verifikasi', 'kwalitasnya': 'kualitasnya', 'it': 'itu', 'pertamina': 'pertamina', 'kwalutas': 'kualitas', 'pinter': 'pintar', 'cashless': 'cashless', 'wallet': 'dompet digital', 'salah': 'salah', 'otp': 'kata sandi sekali pakai', 'sms': 'pesan singkat', 'tolong': 'tolong', 'bug': 'bug', 'gopay': 'gopay', 'no': 'nomor', 'hp': 'ponsel', '

In [None]:
# check that the number of normalized words equals the number of unknown words
len(norm) == len(unknown_words)

True

#### Normalizing words that do not exist in the Indonesian dictionary

In [None]:
normalized_slang

0      [aplikasi, mudah, sulit, antre, gilir, scan, b...
1      [aplikasi, kecewa, nopol, daftar, ajuk, bandin...
2      [aplikasi, sampah, tahun, gak, improvement, ub...
3      [aplikasi, guna, bingung, daftar, sulit, belit...
4      [suggestion, halaman, isi, biodata, on, androi...
                             ...                        
180    [bayar, gak, scan, barcodenya, jelek, aplikasi...
181                       [aplikasi, mutu, layan, henti]
182    [gak, buka, aplikasi, habis, update, bagaimana...
183    [apk, ribet, susah, rakyat, wajib, barcode, ta...
184             [aju, keluh, tuju, lanjut, daftar, aneh]
Name: content, Length: 185, dtype: object

In [None]:
def normalizing_unknown(list_words, mapping=None):
    '''
    Replace each token that has an entry in the normalization dictionary.

    Every token is looked up exactly once, at its own position. Tokens
    without an entry pass through unchanged. The input list is not modified;
    a new list is returned.

    The previous implementation located the token with
    `list_words.index(word)` on the partially rewritten list, so when a
    replacement value was itself a key (e.g. 'barkot' -> 'barcode' and
    'barcode' -> 'kode batang') the wrong position was rewritten.

    Parameters
    ----------
    list_words : list of str
        Tokens of one review.
    mapping : dict, optional
        Token -> normalized form. Defaults to the module-level `norm`.

    Returns
    -------
    list of str
        Tokens with known entries replaced by their normalized form. A
        replacement may contain several words; it stays a single list item.
    '''
    if mapping is None:
        mapping = norm
    return [mapping.get(word, word) for word in list_words]

In [None]:
normalized_unknown = normalized_slang.apply(normalizing_unknown)

In [None]:
normalized_unknown

0      [aplikasi, mudah, sulit, antre, gilir, pindai,...
1      [aplikasi, kecewa, nomor polisi, daftar, ajuk,...
2      [aplikasi, sampah, tahun, tidak, peningkatan, ...
3      [aplikasi, guna, bingung, daftar, sulit, belit...
4      [saran, halaman, isi, biodata, aktif, android,...
                             ...                        
180    [bayar, tidak, pindai, barcodenya, jelek, apli...
181                       [aplikasi, mutu, layan, henti]
182    [tidak, buka, aplikasi, habis, pembaruan, baga...
183    [apk, ribet, susah, rakyat, harus, kode batang...
184             [aju, keluh, tuju, lanjut, daftar, aneh]
Name: content, Length: 185, dtype: object

#### repeat some cleaning process for better result

In [None]:
normalized_unknown = normalized_unknown.apply(join)

In [None]:
normalized_unknown

0      aplikasi mudah sulit antre gilir pindai kode b...
1      aplikasi kecewa nomor polisi daftar ajuk perba...
2      aplikasi sampah tahun tidak peningkatan ubah d...
3      aplikasi guna bingung daftar sulit belit belit...
4      saran halaman isi biodata aktif android pilih ...
                             ...                        
180    bayar tidak pindai barcodenya jelek aplikasi p...
181                            aplikasi mutu layan henti
182    tidak buka aplikasi habis pembaruan bagaimana ...
183    apk ribet susah rakyat harus kode batang tapi ...
184                    aju keluh tuju lanjut daftar aneh
Name: content, Length: 185, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(clean_stop_words)

In [None]:
normalized_unknown

0      aplikasi mudah sulit antre gilir pindai kode b...
1      aplikasi kecewa nomor polisi daftar ajuk perba...
2      aplikasi sampah peningkatan ubah data kendara ...
3      aplikasi bingung daftar sulit belit belit fitu...
4      saran halaman isi biodata aktif android pilih ...
                             ...                        
180    bayar pindai barcodenya jelek aplikasi perinta...
181                            aplikasi mutu layan henti
182    buka aplikasi habis pembaruan solusi mengunduh...
183    apk ribet susah rakyat kode batang sulit alas ...
184                           aju keluh tuju daftar aneh
Name: content, Length: 185, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(lemmatizing)

In [None]:
normalized_unknown

0      aplikasi mudah sulit antre gilir pindai kode b...
1      aplikasi kecewa nomor polisi daftar ajuk bandi...
2      aplikasi sampah tingkat ubah data kendara akse...
3      aplikasi bingung daftar sulit belit belit fitu...
4      saran halaman isi biodata aktif android pilih ...
                             ...                        
180    bayar pindai barcodenya jelek aplikasi perinta...
181                            aplikasi mutu layan henti
182    buka aplikasi habis baru solusi unduh ulang ko...
183    apk ribet susah rakyat kode batang sulit alas ...
184                           aju keluh tuju daftar aneh
Name: content, Length: 185, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(tokenizing)

In [None]:
normalized_unknown

0      [aplikasi, mudah, sulit, antre, gilir, pindai,...
1      [aplikasi, kecewa, nomor, polisi, daftar, ajuk...
2      [aplikasi, sampah, tingkat, ubah, data, kendar...
3      [aplikasi, bingung, daftar, sulit, belit, beli...
4      [saran, halaman, isi, biodata, aktif, android,...
                             ...                        
180    [bayar, pindai, barcodenya, jelek, aplikasi, p...
181                       [aplikasi, mutu, layan, henti]
182    [buka, aplikasi, habis, baru, solusi, unduh, u...
183    [apk, ribet, susah, rakyat, kode, batang, suli...
184                     [aju, keluh, tuju, daftar, aneh]
Name: content, Length: 185, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(normalizing_slang)

In [None]:
normalized_unknown

0      [aplikasi, mudah, sulit, antre, gilir, pindai,...
1      [aplikasi, kecewa, nomor, polisi, daftar, ajuk...
2      [aplikasi, sampah, tingkat, ubah, data, kendar...
3      [aplikasi, bingung, daftar, sulit, belit, beli...
4      [saran, halaman, isi, biodata, aktif, android,...
                             ...                        
180    [bayar, pindai, barcodenya, jelek, aplikasi, p...
181                       [aplikasi, mutu, layan, henti]
182    [buka, aplikasi, habis, baru, solusi, unduh, u...
183    [apk, ribet, susah, rakyat, kode, batang, suli...
184                     [aju, keluh, tuju, daftar, aneh]
Name: content, Length: 185, dtype: object

In [None]:
normalized_unknown = normalized_unknown.apply(join)

In [None]:
df = normalized_unknown.to_frame()
df.head()

Unnamed: 0,content
0,aplikasi mudah sulit antre gilir pindai kode b...
1,aplikasi kecewa nomor polisi daftar ajuk bandi...
2,aplikasi sampah tingkat ubah data kendara akse...
3,aplikasi bingung daftar sulit belit belit fitu...
4,saran halaman isi biodata aktif android pilih ...


In [None]:
# %cd '/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore'

/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore


In [None]:
# df.to_csv('deep_cleaned_playstore_review.csv', index=False)

## translating

In [None]:
df.head()

Unnamed: 0,content
0,aplikasi mudah sulit antre gilir pindai kode b...
1,aplikasi kecewa nomor polisi daftar ajuk bandi...
2,aplikasi sampah tingkat ubah data kendara akse...
3,aplikasi bingung daftar sulit belit belit fitu...
4,saran halaman isi biodata aktif android pilih ...


In [None]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [None]:
from deep_translator import GoogleTranslator

def convert_eng(tweet):
    '''
    Translate one text from Indonesian ('id') to English ('en') using
    deep_translator's GoogleTranslator. A fresh translator object is built
    on every call.
    '''
    return GoogleTranslator(source='id', target='en').translate(tweet)


df['content'] = df['content'].apply(convert_eng)

In [None]:
# df.to_csv("deep_cleaned_translated.csv")

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def preprocessing(text):
    '''
    Lower-case the text, then blank out every non-letter character.

    Any character outside A-Z / a-z (digits, punctuation, symbols,
    whitespace, non-ASCII letters) is replaced by a single space; runs of
    spaces are not collapsed.
    '''
    lowered = text.lower()
    return re.sub(r'[^A-Za-z]',' ', lowered)

def tokenization(text):
    '''
    Break a sentence into a list of word tokens via NLTK's `word_tokenize`.
    '''
    return word_tokenize(text)

def remove_stopwords(tokenized_text):
    '''
    Filter English stop words out of a list of tokens.

    Parameters
    ----------
    tokenized_text : list of str
        Tokens of one (already lower-cased) review.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    '''
    # Build the stop-word collection once per call instead of once per
    # token, and use a set so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words("english"))
    return [token for token in tokenized_text if token not in english_stopwords]

def lemmatization(removed_stopwords):
    '''
    Lemmatize each token with the module-level `lemmatizer` and return the
    results joined into one space-separated string.
    '''
    lemmas = map(lemmatizer.lemmatize, removed_stopwords)
    return " ".join(lemmas)

In [None]:
# applying the preprocessing function to the dataframe

preproces = df['content'].apply(preprocessing)
preproces[0:3]

0    easy application  difficult to queue  turn to ...
1    disappointed application  police number  appea...
2    rubbish application level change vehicle data ...
Name: content, dtype: object

In [None]:
# applying the tokenization function to the dataframe

tokenize = preproces.apply(tokenization)
tokenize[0:3]

0    [easy, application, difficult, to, queue, turn...
1    [disappointed, application, police, number, ap...
2    [rubbish, application, level, change, vehicle,...
Name: content, dtype: object

In [None]:
# applying the remove_stopwords function to the dataframe

remove_sw = tokenize.apply(remove_stopwords)
remove_sw[0:3]

0    [easy, application, difficult, queue, turn, sc...
1    [disappointed, application, police, number, ap...
2    [rubbish, application, level, change, vehicle,...
Name: content, dtype: object

In [None]:
# applying the lemmatization function to the dataframe

lemmatize = remove_sw.apply(lemmatization)
lemmatize[0:3]

0    easy application difficult queue turn scan bar...
1    disappointed application police number appeal ...
2    rubbish application level change vehicle data ...
Name: content, dtype: object

In [None]:
# store the preprocessed reviews in the dataframe

df['preprocessed_content'] = lemmatize

In [None]:
%cd '/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore'

/content/drive/MyDrive/Magang/DDB/sentimen mypertamina playstore


In [None]:
df.to_csv("deep_cleaned_ready_for_predict_playstore.csv", index=False)

In [None]:
df.head()

Unnamed: 0,content,preprocessed_content
0,"easy application, difficult to queue, turn to ...",easy application difficult queue turn scan bar...
1,"Disappointed application, police number, appea...",disappointed application police number appeal ...
2,rubbish application level change vehicle data ...,rubbish application level change vehicle data ...
3,"Confused application, list, difficult, convolu...",confused application list difficult convoluted...
4,Suggestions for the biodata content page on An...,suggestion biodata content page android select...
