In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import re #regex library
import unicodedata

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the review dataset; the first CSV column is used as the row index.
reviews_csv_path = '/content/dataset_review_tokped.csv'
df = pd.read_csv(reviews_csv_path, index_col=[0])

In [None]:
# Cast reviews to text but keep genuinely missing reviews as NaN. A blanket
# astype('str') turns a missing value into the literal word "nan", which the
# later dropna() step can no longer detect.
review_is_missing = df['Review'].isna()
df['Review'] = df['Review'].astype('str').mask(review_is_missing)

In [None]:
df["Review"].iloc[574]

'enak sekali..pada saat pengiriman hampir saja tdk diterima. Namun tdk begitu lama pengirim datang.'

In [None]:
# Replace stray tab / newline / carriage-return sequences with a single space.
# The first pattern targets the escaped two-character forms (a backslash followed
# by t, n or r); the second targets the actual control characters.
line_break_patterns = [r"\\t|\\n|\\r", "\t|\n|\r"]
space_replacements = [" ", " "]
df.replace(to_replace=line_break_patterns, value=space_replacements, regex=True, inplace=True)

In [None]:
# Remove emoji and other symbols: delete every character that is not a word
# character, whitespace, or one of # @ / : % . , _ -
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would silently remove nothing.
# The pattern is a raw string so that \w and \s are not read as string escapes.
df['Review'] = df['Review'].str.replace(r'[^\w\s#@/:%.,_-]', '', regex=True, flags=re.UNICODE)
df

  


Unnamed: 0,Review,Rating
0,maaantap,5
1,Enak segar,5
2,maknyus,5
3,Dari berat 500 gram yg remuk 59 gram. Entah da...,4
4,"enak banget, fresh dan renyah",5
...,...,...
5423,"good product, trusted seller",5
5424,sudah beli yg ke 3 ..,5
5425,byk campurannya,4
5426,good,5


In [None]:
# Convert all review text to lowercase.
df['Review'] = df['Review'].str.lower()
df

Unnamed: 0,Review,Rating
0,maaantap,5
1,enak segar,5
2,maknyus,5
3,dari berat 500 gram yg remuk 59 gram. entah da...,4
4,"enak banget, fresh dan renyah",5
...,...,...
5423,"good product, trusted seller",5
5424,sudah beli yg ke 3 ..,5
5425,byk campurannya,4
5426,good,5


In [None]:
# Remove punctuation: every character that is neither a word character nor
# whitespace is replaced by a space (extra spaces are collapsed in a later step).
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would leave the punctuation in place.
df["Review"] = df['Review'].str.replace(r'[^\w\s]', ' ', regex=True)
df

  


Unnamed: 0,Review,Rating
0,maaantap,5
1,enak segar,5
2,maknyus,5
3,dari berat 500 gram yg remuk 59 gram entah da...,4
4,enak banget fresh dan renyah,5
...,...,...
5423,good product trusted seller,5
5424,sudah beli yg ke 3,5
5425,byk campurannya,4
5426,good,5


In [None]:
# Collapse every run of two or more spaces into one, then trim both ends.
collapsed_reviews = df['Review'].str.replace(' {2,}', ' ', regex=True)
df['Review'] = collapsed_reviews.str.strip()
df

Unnamed: 0,Review,Rating
0,maaantap,5
1,enak segar,5
2,maknyus,5
3,dari berat 500 gram yg remuk 59 gram entah dar...,4
4,enak banget fresh dan renyah,5
...,...,...
5423,good product trusted seller,5
5424,sudah beli yg ke 3,5
5425,byk campurannya,4
5426,good,5


In [None]:
# Drop empty rows: first rows with missing values, then rows whose review is an
# empty string (an empty string is falsy, so astype(bool) marks it False).
print(df.isnull().sum())
df = df.dropna()
print(df.isnull().sum())
# .copy() makes the filtered frame independent of the original, so that later
# column assignments do not raise SettingWithCopyWarning.
df = df[df['Review'].astype(bool)].copy()
df

Review    0
Rating    0
dtype: int64
Review    0
Rating    0
dtype: int64


Unnamed: 0,Review,Rating
0,maaantap,5
1,enak segar,5
2,maknyus,5
3,dari berat 500 gram yg remuk 59 gram entah dar...,4
4,enak banget fresh dan renyah,5
...,...,...
5423,good product trusted seller,5
5424,sudah beli yg ke 3,5
5425,byk campurannya,4
5426,good,5


In [None]:
# Remove duplicated rows. NOTE: keep=False discards every member of a duplicate
# group, so no representative copy of a repeated review/rating pair is retained.
# The result is rebound to an explicit copy instead of mutating in place: an
# in-place drop on a filtered slice raises SettingWithCopyWarning here and on
# every later column assignment.
df = df.drop_duplicates(keep=False).copy()
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,Review,Rating
10,enak kuacinya,5
12,pengiriman cepat packing bagus sesuai pesanan ...,5
13,pengemasan luar biasa baik untuk rasa menurut ...,4
15,terimakasih min,5
16,udah order untuk kesekian kali jos,5
...,...,...
5421,seller sangat responsif rasa belum dicoba kare...,5
5422,imut2 banget packing aman,5
5423,good product trusted seller,5
5424,sudah beli yg ke 3,5


In [None]:
def remove_accented_chars(text):
    """Reduce ``text`` to plain ASCII.

    The string is NFKD-normalized so that accented letters decompose into a
    base letter plus combining marks; encoding to ASCII with errors ignored
    then drops the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
    
# Strip accents and other non-ASCII characters from every review.
df['Review'] = df['Review'].apply(remove_accented_chars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# Remove numbers
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Delete digits from every review.
df['Review'] = df['Review'].apply(remove_number)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# Remove single characters
def remove_singl_char(text):
    """Return ``text`` without stand-alone single ASCII letters.

    A letter is removed only when it has a word boundary on both sides; the
    surrounding whitespace is left untouched.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Delete stand-alone single letters from every review.
df['Review'] = df['Review'].apply(remove_singl_char)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [None]:
# NLTK word tokenizer
def word_tokenize_wrapper(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every review; the token lists are stored in a new 'tokenization' column.
df['tokenization'] = df['Review'].apply(word_tokenize_wrapper)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Review,Rating,tokenization
10,enak kuacinya,5,"[enak, kuacinya]"
12,pengiriman cepat packing bagus sesuai pesanan ...,5,"[pengiriman, cepat, packing, bagus, sesuai, pe..."
13,pengemasan luar biasa baik untuk rasa menurut ...,4,"[pengemasan, luar, biasa, baik, untuk, rasa, m..."
15,terimakasih min,5,"[terimakasih, min]"
16,udah order untuk kesekian kali jos,5,"[udah, order, untuk, kesekian, kali, jos]"
...,...,...,...
5421,seller sangat responsif rasa belum dicoba kare...,5,"[seller, sangat, responsif, rasa, belum, dicob..."
5422,imut banget packing aman,5,"[imut, banget, packing, aman]"
5423,good product trusted seller,5,"[good, product, trusted, seller]"
5424,sudah beli yg ke,5,"[sudah, beli, yg, ke]"


In [None]:
# Load the word-normalization lookup table.
normalization_csv_path = "/content/list fix misleading words.csv"
normalizad_word = pd.read_csv(normalization_csv_path)

In [None]:
# Discard the column labelled "0"; only the remaining columns are used below.
normalizad_word.drop(columns=["0"], inplace=True)

In [None]:
normalizad_word

Unnamed: 0,then,new
0,seler,penjual
1,yg,yang
2,packing,pengemasan
3,respon,respons
4,order,beli
...,...,...
1045,ontime,tepat waktu
1046,on time,tepat waktu
1047,everyone,setiap orang
1048,familys,keluarga


In [None]:
normalizad_word.isnull().sum()

then     0
new     13
dtype: int64

In [None]:
normalizad_word = normalizad_word.dropna()

In [None]:
normalizad_word.isnull().sum()

then    0
new     0
dtype: int64

In [None]:
# Normalized text: build the lookup from the first column of normalizad_word
# (term as written in reviews) to the second column (its replacement).
normalizad_word_dict = {}

# Columns are read by position through .iloc. Integer keys such as row[0] on a
# label-indexed Series rely on a positional fallback that pandas has deprecated.
for raw_term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    # setdefault keeps the first mapping when a term is listed more than once.
    normalizad_word_dict.setdefault(raw_term, replacement)

def normalized_term(document):
    """Replace each token that has an entry in ``normalizad_word_dict``.

    Args:
        document: iterable of token strings.

    Returns:
        list: the normalized tokens. A replacement made of several words
        (e.g. "terima kasih") is split on whitespace and contributes one token
        per word, so the later per-token stemming and stop-word filtering see
        real words instead of a single token containing a space. Tokens with
        no entry in the lookup are kept unchanged.
    """
    normalized = []
    for term in document:
        if term in normalizad_word_dict:
            # str() guards against non-text cells read from the CSV.
            normalized.extend(str(normalizad_word_dict[term]).split())
        else:
            normalized.append(term)
    return normalized

# Normalize every token list; the result is stored in a new 'Review_Normalized' column.
df['Review_Normalized'] = df['tokenization'].apply(normalized_term)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
df

Unnamed: 0,Review,Rating,tokenization,Review_Normalized
10,enak kuacinya,5,"[enak, kuacinya]","[enak, kuacinya]"
12,pengiriman cepat packing bagus sesuai pesanan ...,5,"[pengiriman, cepat, packing, bagus, sesuai, pe...","[pengiriman, cepat, pengemasan, bagus, sesuai,..."
13,pengemasan luar biasa baik untuk rasa menurut ...,4,"[pengemasan, luar, biasa, baik, untuk, rasa, m...","[pengemasan, luar, biasa, baik, untuk, rasa, m..."
15,terimakasih min,5,"[terimakasih, min]","[terima kasih, minimal]"
16,udah order untuk kesekian kali jos,5,"[udah, order, untuk, kesekian, kali, jos]","[sudah, beli, untuk, kesekian, kali, mantap]"
...,...,...,...,...
5421,seller sangat responsif rasa belum dicoba kare...,5,"[seller, sangat, responsif, rasa, belum, dicob...","[seller, sangat, responsif, rasa, belum, dicob..."
5422,imut banget packing aman,5,"[imut, banget, packing, aman]","[imut, banget, pengemasan, aman]"
5423,good product trusted seller,5,"[good, product, trusted, seller]","[good, produk, dipercaya, seller]"
5424,sudah beli yg ke,5,"[sudah, beli, yg, ke]","[sudah, beli, yang, ke]"


In [None]:
# import Sastrawi package
!pip install Sastrawi
!pip install swifter
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    """Return the stem that the module-level ``stemmer`` produces for ``term``."""
    stem = stemmer.stem(term)
    return stem

# Collect every distinct term in first-seen order, each with a one-space
# placeholder, so that every term is stemmed only once below.
term_dict = dict.fromkeys(
    (term for document in df['Review_Normalized'] for term in document),
    ' ',
)

print(len(term_dict))
print("------------------------")

# Replace each placeholder with the term's stem, echoing every term/stem pair.
for term in list(term_dict):
    stemmed = stemmed_wrapper(term)
    term_dict[term] = stemmed
    print(term, ":", stemmed)

print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    """Look up the stem of every term of ``document`` in ``term_dict``.

    Returns a list with one stem per input term, in the same order. A term
    missing from ``term_dict`` raises ``KeyError``.
    """
    stemmed_terms = []
    for term in document:
        stemmed_terms.append(term_dict[term])
    return stemmed_terms

# Map every normalized token list to its stems through swifter's apply accessor;
# the result is stored in a new 'df_token_stemm' column.
df['df_token_stemm'] = df['Review_Normalized'].swifter.apply(get_stemmed_term)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
4044
------------------------
enak : enak
kuacinya : kuaci
pengiriman : kirim
cepat : cepat
pengemasan : emas
bagus : bagus
sesuai : sesuai
pesanan : pesan
belum : belum
di : di
coba : coba
terima : terima
kasih : kasih
luar : luar
biasa : biasa
baik : baik
untuk : untuk
rasa : rasa
menurut : turut
saya : saya
lebih : lebih
dominan : dominan
ke : ke
asli : asli
atau : atau
mau : mau
varian : varian
apapun : apa
soalnya : soal
rasanya : rasa
kurang : kurang
dan : dan
sedikit : sedikit
asin : asin
juga : juga
yang : yang
agak : agak
terasa : asa
itu : itu
rumput : rumput
laut : laut
lain : lain
nya : nya
mendominasi : dominasi
sih : sih
terima kasih : terima kasih
minimal : minimal
sudah : sudah
beli : beli
kesekian : sekian
kali : kali
mantap : mantap
enaak : enaak
guriih : guriih
kacang : ka

Pandas Apply:   0%|          | 0/4060 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
df

Unnamed: 0,Review,Rating,tokenization,Review_Normalized,df_token_stemm
10,enak kuacinya,5,"[enak, kuacinya]","[enak, kuacinya]","[enak, kuaci]"
12,pengiriman cepat packing bagus sesuai pesanan ...,5,"[pengiriman, cepat, packing, bagus, sesuai, pe...","[pengiriman, cepat, pengemasan, bagus, sesuai,...","[kirim, cepat, emas, bagus, sesuai, pesan, bel..."
13,pengemasan luar biasa baik untuk rasa menurut ...,4,"[pengemasan, luar, biasa, baik, untuk, rasa, m...","[pengemasan, luar, biasa, baik, untuk, rasa, m...","[emas, luar, biasa, baik, untuk, rasa, turut, ..."
15,terimakasih min,5,"[terimakasih, min]","[terima kasih, minimal]","[terima kasih, minimal]"
16,udah order untuk kesekian kali jos,5,"[udah, order, untuk, kesekian, kali, jos]","[sudah, beli, untuk, kesekian, kali, mantap]","[sudah, beli, untuk, sekian, kali, mantap]"
...,...,...,...,...,...
5421,seller sangat responsif rasa belum dicoba kare...,5,"[seller, sangat, responsif, rasa, belum, dicob...","[seller, sangat, responsif, rasa, belum, dicob...","[seller, sangat, responsif, rasa, belum, coba,..."
5422,imut banget packing aman,5,"[imut, banget, packing, aman]","[imut, banget, pengemasan, aman]","[imut, banget, emas, aman]"
5423,good product trusted seller,5,"[good, product, trusted, seller]","[good, produk, dipercaya, seller]","[good, produk, percaya, seller]"
5424,sudah beli yg ke,5,"[sudah, beli, yg, ke]","[sudah, beli, yang, ke]","[sudah, beli, yang, ke]"


In [None]:
#removing stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
# Cache for the Indonesian stop-word set; filled on the first stopword() call.
_INDONESIAN_STOPWORDS = None


def stopword(sentence):
    """Drop Indonesian stop words from a token list and join what is left.

    Args:
        sentence: iterable of token strings.

    Returns:
        str: the tokens that are not in NLTK's 'indonesian' stop-word list,
        joined with single spaces (an empty string when nothing remains).
    """
    global _INDONESIAN_STOPWORDS
    if _INDONESIAN_STOPWORDS is None:
        # Load the corpus once. Rebuilding the set inside the comprehension
        # re-read the whole stop-word list for every single token.
        _INDONESIAN_STOPWORDS = frozenset(stopwords.words('indonesian'))
    kept_words = [word for word in sentence if word not in _INDONESIAN_STOPWORDS]
    return ' '.join(kept_words)

# Filter stop words out of the stemmed tokens; the space-joined remainder is
# stored in a new 'Review_clean' column.
df['Review_clean'] = df['df_token_stemm'].apply(stopword)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
df

Unnamed: 0,Review,Rating,tokenization,Review_Normalized,df_token_stemm,Review_clean
10,enak kuacinya,5,"[enak, kuacinya]","[enak, kuacinya]","[enak, kuaci]",enak kuaci
12,pengiriman cepat packing bagus sesuai pesanan ...,5,"[pengiriman, cepat, packing, bagus, sesuai, pe...","[pengiriman, cepat, pengemasan, bagus, sesuai,...","[kirim, cepat, emas, bagus, sesuai, pesan, bel...",kirim cepat emas bagus sesuai pesan coba terim...
13,pengemasan luar biasa baik untuk rasa menurut ...,4,"[pengemasan, luar, biasa, baik, untuk, rasa, m...","[pengemasan, luar, biasa, baik, untuk, rasa, m...","[emas, luar, biasa, baik, untuk, rasa, turut, ...",emas dominan asli asli kuaci varian dominan as...
15,terimakasih min,5,"[terimakasih, min]","[terima kasih, minimal]","[terima kasih, minimal]",terima kasih minimal
16,udah order untuk kesekian kali jos,5,"[udah, order, untuk, kesekian, kali, jos]","[sudah, beli, untuk, kesekian, kali, mantap]","[sudah, beli, untuk, sekian, kali, mantap]",beli sekian kali mantap
...,...,...,...,...,...,...
5421,seller sangat responsif rasa belum dicoba kare...,5,"[seller, sangat, responsif, rasa, belum, dicob...","[seller, sangat, responsif, rasa, belum, dicob...","[seller, sangat, responsif, rasa, belum, coba,...",seller responsif coba buka
5422,imut banget packing aman,5,"[imut, banget, packing, aman]","[imut, banget, pengemasan, aman]","[imut, banget, emas, aman]",imut banget emas aman
5423,good product trusted seller,5,"[good, product, trusted, seller]","[good, produk, dipercaya, seller]","[good, produk, percaya, seller]",good produk percaya seller
5424,sudah beli yg ke,5,"[sudah, beli, yg, ke]","[sudah, beli, yang, ke]","[sudah, beli, yang, ke]",beli


In [None]:
df.isnull().sum()

Review               0
Rating               0
tokenization         0
Review_Normalized    0
df_token_stemm       0
Review_clean         0
dtype: int64

# Load and clean label data

In [None]:
# Load the label file.
labels_csv_path = '/content/data_review_label.csv'
df_label = pd.read_csv(labels_csv_path)

In [None]:
df_label

Unnamed: 0,Label
0,Positive
1,Positive
2,Negative
3,Positive
4,Positive
...,...
4055,Positive
4056,Positive
4057,Positive
4058,Positive


In [None]:
df_label.isnull().sum()

Label    1
dtype: int64

In [None]:
# Fill a missing label with the label of the preceding row (forward fill).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection does not update df_label under pandas copy-on-write,
# and the `method=` argument of fillna is deprecated in favour of ffill().
# NOTE(review): forward-filling copies the neighbouring row's sentiment onto the
# unlabelled row — confirm that this is acceptable for the missing label.
df_label['Label'] = df_label['Label'].ffill()

In [None]:
df_label.isnull().sum()

Label    0
dtype: int64

In [None]:
df_label['Label'].unique()

array(['Positive', 'Negative', ' Negative', ' Positive', ' Negative '],
      dtype=object)

In [None]:
# Remove all space characters in the label column
def remove_whitespace_multiple(text):
    """Return ``text`` with every space character (" ") removed.

    Only the plain space is affected; tabs and newlines are left as they are.
    """
    pieces = text.split(" ")
    return "".join(pieces)

# Remove the stray spaces from every label.
df_label['Label'] = df_label['Label'].apply(remove_whitespace_multiple)

In [None]:
df_label['Label'].unique()

array(['Positive', 'Negative'], dtype=object)

In [None]:
# Convert the labels to lowercase.
df_label['Label'] = df_label['Label'].str.lower()
df_label

Unnamed: 0,Label
0,positive
1,positive
2,negative
3,positive
4,positive
...,...
4055,positive
4056,positive
4057,positive
4058,positive


# Merge Labels into Dataset

In [None]:
# Attach the labels by row position. df still carries the index of the raw CSV
# (rows were dropped during cleaning) while df_label is indexed 0..n-1, so an
# index-aligned concat pairs rows that do not belong together and fills the
# rest of the result with NaN.
# NOTE(review): this assumes label row i belongs to the i-th cleaned review —
# confirm against how data_review_label.csv was produced.
reviews_by_position = df.reset_index(drop=True)
labels_by_position = df_label.reset_index(drop=True)
if len(reviews_by_position) != len(labels_by_position):
    # Make a length mismatch visible instead of hiding it in trailing NaN rows.
    print(
        f"WARNING: {len(reviews_by_position)} reviews but "
        f"{len(labels_by_position)} labels; unmatched rows will contain NaN"
    )
df_review_labelled = pd.concat([reviews_by_position, labels_by_position], axis=1)

In [None]:
df_review_labelled

Unnamed: 0,Review,Rating,tokenization,Review_Normalized,df_token_stemm,Review_clean,Label
0,,,,,,,positive
1,,,,,,,positive
2,,,,,,,negative
3,,,,,,,positive
4,,,,,,,positive
...,...,...,...,...,...,...,...
5421,seller sangat responsif rasa belum dicoba kare...,5.0,"[seller, sangat, responsif, rasa, belum, dicob...","[seller, sangat, responsif, rasa, belum, dicob...","[seller, sangat, responsif, rasa, belum, coba,...",seller responsif coba buka,
5422,imut banget packing aman,5.0,"[imut, banget, packing, aman]","[imut, banget, pengemasan, aman]","[imut, banget, emas, aman]",imut banget emas aman,
5423,good product trusted seller,5.0,"[good, product, trusted, seller]","[good, produk, dipercaya, seller]","[good, produk, percaya, seller]",good produk percaya seller,
5424,sudah beli yg ke,5.0,"[sudah, beli, yg, ke]","[sudah, beli, yang, ke]","[sudah, beli, yang, ke]",beli,


In [None]:
#df.to_csv('dataset_review_tokped_final_cleaned.csv', index=False)