In [1]:
import pandas as pd
import numpy as np
from models.MKNN import ModifiedKNN
import neattext.functions as nfx
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
df = pd.read_csv('Twitter_fresh/twitter_crawling.csv',encoding='latin1', usecols=['date','text'])
df.head()

Unnamed: 0,date,text
0,2022-08-11 12:44:23+00:00,@janeuaree Ayo kita buat money heist Indonesia...
1,2022-08-13 18:06:23+00:00,"@festieveal Iyhh hendry akskjhk bgtttt, ni jug..."
2,2022-08-12 20:19:36+00:00,"cuk, akhir2 ini tidur larut. sekitar jam 4 an ..."
3,2022-08-12 03:40:46+00:00,@Pak_Irv Professor itu gelar utk kepala tim pe...
4,2022-08-10 14:16:56+00:00,@Babylipp1 @sbyfess Serasa syuting money heist...


In [3]:
def casefolding(Text):
    Text = Text.lower()
    return Text

In [4]:
df['text'] = df['text'].apply(casefolding)
df.head()

Unnamed: 0,date,text
0,2022-08-11 12:44:23+00:00,@janeuaree ayo kita buat money heist indonesia...
1,2022-08-13 18:06:23+00:00,"@festieveal iyhh hendry akskjhk bgtttt, ni jug..."
2,2022-08-12 20:19:36+00:00,"cuk, akhir2 ini tidur larut. sekitar jam 4 an ..."
3,2022-08-12 03:40:46+00:00,@pak_irv professor itu gelar utk kepala tim pe...
4,2022-08-10 14:16:56+00:00,@babylipp1 @sbyfess serasa syuting money heist...


In [5]:
def punc_clean(Text):
        Text = nfx.remove_urls(Text)
        Text = nfx.remove_punctuations(Text)
        Text = nfx.remove_emojis(Text)
        Text = nfx.remove_special_characters(Text)
        Text = nfx.remove_numbers(Text)
        return Text

In [6]:
df['text'] = df['text'].apply(punc_clean)
df.head()

Unnamed: 0,date,text
0,2022-08-11 12:44:23+00:00,janeuaree ayo kita buat money heist indonesia wow
1,2022-08-13 18:06:23+00:00,festieveal iyhh hendry akskjhk bgtttt ni juga ...
2,2022-08-12 20:19:36+00:00,cuk akhir ini tidur larut sekitar jam an baru...
3,2022-08-12 03:40:46+00:00,pakirv professor itu gelar utk kepala tim pera...
4,2022-08-10 14:16:56+00:00,babylipp sbyfess serasa syuting money heist kak


In [7]:
def word_tokenize_wrapper(Text):
        return word_tokenize(Text)

In [8]:
df['text'] = df['text'].apply(word_tokenize_wrapper)
df.head()

Unnamed: 0,date,text
0,2022-08-11 12:44:23+00:00,"[janeuaree, ayo, kita, buat, money, heist, ind..."
1,2022-08-13 18:06:23+00:00,"[festieveal, iyhh, hendry, akskjhk, bgtttt, ni..."
2,2022-08-12 20:19:36+00:00,"[cuk, akhir, ini, tidur, larut, sekitar, jam, ..."
3,2022-08-12 03:40:46+00:00,"[pakirv, professor, itu, gelar, utk, kepala, t..."
4,2022-08-10 14:16:56+00:00,"[babylipp, sbyfess, serasa, syuting, money, he..."


In [13]:
def word_norm(tweets):
    word_dict = pd.read_csv('data/indonesia_slangWords.csv')
    norm_word_dict = {}
    for index, row in word_dict.iterrows():
        if row[0] not in norm_word_dict:
            norm_word_dict[row[0]] = row[1]
    return [norm_word_dict[term] if term in norm_word_dict else term for term in tweets]

In [14]:
df['text'] = df['text'].apply(word_norm)
df.head()

Unnamed: 0,date,text
0,2022-08-11 12:44:23+00:00,"[janeuaree, ayo, kita, buat, money, heist, ind..."
1,2022-08-13 18:06:23+00:00,"[festieveal, iyhh, hendry, akskjhk, bgtttt, in..."
2,2022-08-12 20:19:36+00:00,"[cuk, akhir, ini, tidur, larut, sekitar, jam, ..."
3,2022-08-12 03:40:46+00:00,"[pakirv, professor, itu, gelar, untuk, kepala,..."
4,2022-08-10 14:16:56+00:00,"[babylipp, sbyfess, serasa, syuting, money, he..."


In [15]:
def remove_stopword(Text):
    stopW = stopwords.words('indonesian', 'english')
    sw = pd.read_csv('data/stopwordbahasa.csv')
    stopW.extend(sw)
    remove_sw = ' '.join(Text)
    clean_sw = [word for word in remove_sw.split() if word.lower() not in stopW]
    return clean_sw

In [16]:
df['text'] = df['text'].apply(remove_stopword)
df.head()

Unnamed: 0,date,text
0,2022-08-11 12:44:23+00:00,"[janeuaree, ayo, money, heist, indonesia, wow]"
1,2022-08-13 18:06:23+00:00,"[festieveal, iyhh, hendry, akskjhk, bgtttt, pr..."
2,2022-08-12 20:19:36+00:00,"[cuk, tidur, larut, jam, an, tidur, dipaksa, b..."
3,2022-08-12 03:40:46+00:00,"[pakirv, professor, gelar, kepala, tim, peramp..."
4,2022-08-10 14:16:56+00:00,"[babylipp, sbyfess, serasa, syuting, money, he..."


In [None]:
def indo_stem(Text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    result = []
    for w in Text:
        result.append(stemmer.stem(w))
        result.append(" ")
    return " ".join(result)

In [None]:
df['text'] = df['text'].apply(indo_stem)
df.head()