# Stemming
Mengubah kata berimbuhan menjadi kata dasarnya.

## Import module / package

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from tqdm.notebook import tqdm
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from ecsstemmer import EcsStemmer

In [2]:
# Read the cleaned tweets produced by the earlier preprocessing step.
dataset = pd.read_csv(filepath_or_buffer="data/tweets_clean.csv")
# Preview the first ten rows to check what was loaded.
dataset.head(n=10)

Unnamed: 0,tweet
0,"['jbharga', 'ayam', 'rm', 'tgk', 'telor', 'wya..."
1,"['berdiri di atas kaki sendiri', 'serap', 'tel..."
2,"['penyebab', 'harga', 'telur', 'anjlok', 'vers..."
3,"['turunnya', 'permintaan', 'masyarakat', 'telu..."
4,"['peternak', 'ayam', 'petelur', 'mengaku', 'me..."
5,"['harga', 'jual', 'telur', 'ayam', 'solo', 'ra..."
6,"['harga', 'telur', 'anjlok', 'rp', 'kg', 'peme..."
7,"['ikutan', 'main', 'telur', 'bareng', 'voucher..."
8,"['telor', 'geprek', 'abah', 'uya', 'selera', '..."
9,"['beli', 'order', 'telur', 'set', 'telur', 'mc..."


## Retokenize

In [3]:
# One detokenizer instance is enough; it is reused for every row.
_DETOKENIZER = TreebankWordDetokenizer()


def detokenize(text):
    """Turn the text form of a token list back into one plain sentence.

    The `tweet` column comes from a CSV file, so each cell is the string
    representation of a list (for example "['harga', 'telur']") rather than
    a real list object.

    Args:
        text: String representation of a list of tokens.

    Returns:
        The tokens joined into a single string by TreebankWordDetokenizer.
    """
    # Remove the list brackets and both kinds of quote characters.
    for char in "[]\"'":
        text = text.replace(char, "")
    # Splitting on "," leaves the space that followed each comma attached to
    # the token; strip it and skip empty pieces so the joined text has no
    # doubled spaces.
    tokens = [piece.strip() for piece in text.split(",")]
    tokens = [token for token in tokens if token]
    return _DETOKENIZER.detokenize(tokens)


# astype('U') forces every cell to str before the string-based parsing above.
dataset['tweet'] = dataset['tweet'].astype('U').apply(detokenize)

In [4]:
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases look it up
# as 'punkt', while newer releases look for 'punkt_tab' instead, so fetch both
# to keep this cell working on either. nltk.download skips packages that are
# already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')


def word_tokenize_wrapper(text):
    """Split one tweet string into a list of word tokens.

    Args:
        text: A single tweet as plain text.

    Returns:
        The list of tokens returned by NLTK's word_tokenize.
    """
    return word_tokenize(text)


# Replace each tweet string with its token list, then display the column.
dataset['tweet'] = dataset['tweet'].apply(word_tokenize_wrapper)
dataset['tweet']

[nltk_data] Downloading package punkt to /home/anz007/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0       [jbharga, ayam, rm, tgk, telor, wyam, sepapan,...
1       [berdiri, di, atas, kaki, sendiri, serap, telu...
2       [penyebab, harga, telur, anjlok, versi, pedaga...
3       [turunnya, permintaan, masyarakat, telur, ayam...
4       [peternak, ayam, petelur, mengaku, merugi, rat...
                              ...                        
2874    [blitar, harga, telur, ayam, anjlok, peternak,...
2875    [titip, sendal, baca, artikelnya, nasi, goreng...
2876    [suka, mooncake, isian, telur, daerah, mahal, ...
2877    [harga, telur, anjlok, peternak, jateng, babak...
2878    [harga, telur, ayam, anjlok, peternak, teranca...
Name: tweet, Length: 2879, dtype: object

## Proses Stemming

### Library Sastrawi _(Algoritma Nazief)_

In [5]:
# Build the Sastrawi stemmer through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def _stem_tokens(tokens):
    """Return a new list holding the Sastrawi stem of every token in `tokens`."""
    return [stemmer.stem(token) for token in tokens]


# Register progress_apply on pandas objects so the stemming pass shows a bar.
tqdm.pandas()
dataset_nazief = dataset['tweet'].progress_apply(_stem_tokens)
dataset_nazief

### Library EcsStemmer _(Algoritma ECS)_

In [7]:
stemmer1 = EcsStemmer()

tqdm.pandas()
# Call stemm once per tweet with that tweet's token list; each result is
# wrapped in a one-element list so it becomes a single-column row below.
dataset_ecs_temp = [
    [stemmer1.stemm(tokens)]
    for tokens in tqdm(dataset['tweet'], total=len(dataset))
]

# Collect the stemmed rows into a frame with one `tweet` column and show it.
dataset_ecs = pd.DataFrame(dataset_ecs_temp, columns=['tweet'])
dataset_ecs

  0%|          | 0/2879 [00:00<?, ?it/s]

Unnamed: 0,tweet
0,"[jbharga, ayam, rm, tgk, telor, wyam, papan, r..."
1,"[diri, di, atas, kaki, sendiri, serap, telur, ..."
2,"[sebab, harga, telur, anjlok, versi, dagang, k..."
3,"[turun, minta, masyarakat, telur, ayam, ppkm, ..."
4,"[ternak, ayam, telur, menga, rugi, ratus, juta..."
...,...
2874,"[blitar, harga, telur, ayam, anjlok, ternak, a..."
2875,"[titip, sendal, baca, artikel, nasi, goreng, g..."
2876,"[suka, mooncake, isi, telur, daerah, mahal, ma..."
2877,"[harga, telur, anjlok, ternak, jateng, babak, ..."


## Export Data

In [6]:
# Save the Sastrawi-stemmed tweets, then read the file back as a sanity check.
# to_csv writes UTF-8 by default, so the read-back must decode as UTF-8 too;
# decoding the same file as latin1 would garble any non-ASCII character.
dataset_nazief.to_csv('data/tweets_sastrawi.csv', index=False, encoding='utf-8')
dataset_nazief_test = pd.read_csv('data/tweets_sastrawi.csv', encoding='utf-8')
dataset_nazief_test.head(10)

In [8]:
# Save the ECS-stemmed tweets, then read the file back as a sanity check.
# to_csv writes UTF-8 by default, so the read-back must decode as UTF-8 too;
# decoding the same file as latin1 would garble any non-ASCII character.
dataset_ecs.to_csv('data/tweets_ecs.csv', index=False, encoding='utf-8')
dataset_ecs_test = pd.read_csv('data/tweets_ecs.csv', encoding='utf-8')
dataset_ecs_test.head(10)

Unnamed: 0,tweet
0,"['jbharga', 'ayam', 'rm', 'tgk', 'telor', 'wya..."
1,"['diri', 'di', 'atas', 'kaki', 'sendiri', 'ser..."
2,"['sebab', 'harga', 'telur', 'anjlok', 'versi',..."
3,"['turun', 'minta', 'masyarakat', 'telur', 'aya..."
4,"['ternak', 'ayam', 'telur', 'menga', 'rugi', '..."
5,"['harga', 'jual', 'telur', 'ayam', 'solo', 'ra..."
6,"['harga', 'telur', 'anjlok', 'rp', 'kg', 'peme..."
7,"['ikut', 'main', 'telur', 'bareng', 'voucher',..."
8,"['telor', 'geprek', 'abah', 'uya', 'selera', '..."
9,"['beli', 'order', 'telur', 'set', 'telur', 'mc..."
