In [40]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd
import re

class PreprocessingText:
    def __init__(self):
        self.slang_list = [kamus.strip('\n').strip('\r') for kamus in open('./daftar_slangwords.txt')]
        self.stopword_list = [line.strip('\n')for line in open('./daftar_stopwords.txt')]
        
    # Case Folding
    def CaseFolding(self, tweets):
        tweets = tweets.lower()
        
        return tweets
    
    # Cleansing
    def Cleansing(self, tweets):
        # Menghilangkan mention, hashtag, character reference
        tweets = re.sub('[@#&][A-Za-z0-9_]+'," ", tweets)

        # Menghilangkan Tautan
        tweets = re.sub("\w+:\/\/\S+"," ", tweets)

        # Menghilangkan tanda baca
        tweets = re.sub('[()!?;,]', ' ', tweets)
        tweets = re.sub('\[.*?\]',' ', tweets)

        # Menghilangkan tanda selain huruf
        tweets = re.sub("[^a-z]"," ", tweets)

        # Menghilangkan spasi lebih dari 1
        tweets = ' '.join(tweets.split())
        
        return tweets
    
    # Replace Slangwords
    def Slangwords(self, tweets):
        for line in self.slang_list:
            slangword, unslang = line.strip().split(":")
            if slangword in tweets.split(" "):
                tweets = tweets.replace(slangword + " ", unslang + " ")
                tweets = tweets.replace(" " + slangword, " " + unslang)
                
        return tweets
    
    # Stopwords Removal
    def StopwordsRemoval(self, tweets):
        stopwords_list = list(map(lambda x:x.strip(),list(self.stopword_list)))
        tweets = tweets.split()
        tweets = [w for w in tweets if not w in stopwords_list]
        tweets = " ".join(word for word in tweets)
        
        return tweets
    
    # Tokenization
    def Tokenization(self, tweets):
        tweets = tweets.split()
        
        return tweets
    
    # Normalization (Stemming)
    def Stemming(self, tweets):
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        do = []
        for w in tweets:
            dt = stemmer.stem(w)
            do.append(dt)
        d_clean = []
        d_clean = " ".join(do)
        print(d_clean)
        
        return d_clean

In [41]:
df = pd.read_csv("Final_Sesudah_Kanjuruhan.csv", sep=';', encoding='unicode_escape')
df

Unnamed: 0,id,conversation_id,created_at,user_id,username,label,tweet
0,"1,58E+18","1,58E+18",2022-10-16 05:18:19 SE Asia Standard Time,"1,49E+18",alhusaini_asror,Negatif,"@mohmahfudmd Benar dan Betul.Ketua PSSI,mundur..."
1,"1,58E+18","1,58E+18",2022-10-16 05:13:51 SE Asia Standard Time,1442933096,metmalamminggu,Negatif,"@iIhamzada Dilarang intervensi gara"" aturan fi..."
2,"1,58E+18","1,58E+18",2022-10-16 05:06:34 SE Asia Standard Time,"7,88E+17",onedayasmine,Negatif,@gibran_tweet Mas.. jadi ketua PSSI yaaa Kalau...
3,"1,58E+18","1,58E+18",2022-10-16 05:03:44 SE Asia Standard Time,4812772283,harryunited05,Negatif,@medioclubID @PSSI mana orangnya itu2 aja lagi...
4,"1,58E+18","1,58E+18",2022-10-16 05:00:24 SE Asia Standard Time,"1,31E+18",ariflabmed,Positif,@medioclubID @PSSI Yunus Nusi akan dikenang se...
...,...,...,...,...,...,...,...
1244,"1,58E+18","1,58E+18",2022-10-07 11:47:59 SE Asia Standard Time,521254614,bodoamath,Negatif,lho kapolda sama ketua pssi nya mana?
1245,"1,58E+18","1,58E+18",2022-10-07 11:47:24 SE Asia Standard Time,"1,57E+18",crypto34209511,Negatif,@PSSI HAK DAN KEWAJIBAN PENONTON HARUS DI INGA...
1246,"1,58E+18","1,58E+18",2022-10-07 11:43:10 SE Asia Standard Time,2156012929,gbrand_9127,Negatif,@CNNIndonesia PSSI juga gk standar FIFA.
1247,"1,58E+18","1,58E+18",2022-10-07 11:41:33 SE Asia Standard Time,480858414,grrraargh,Negatif,@ainurohman Dia pikir PSSI cuma urus timnas


In [42]:
pretext = PreprocessingText()

df_casefolding = df.copy()
df_casefolding['tweet'] = df_casefolding['tweet'].apply(pretext.CaseFolding)

df_cleansing = df_casefolding.copy()
df_cleansing['tweet'] = df_cleansing['tweet'].apply(pretext.Cleansing)

df_slangwords = df_cleansing.copy()
df_slangwords['tweet'] = df_slangwords['tweet'].apply(pretext.Slangwords)

df_stopwords = df_slangwords.copy()
df_stopwords['tweet'] = df_stopwords['tweet'].apply(pretext.StopwordsRemoval)

df_token = df_stopwords.copy()
df_token['tweet'] = df_token['tweet'].apply(pretext.Tokenization)

df_stemming = df_token.copy()
df_stemming['tweet'] = df_stemming['tweet'].apply(pretext.Stemming)

dataAll = {'Raw Data': df, 'Case Folding': df_casefolding, 'Cleansing': df_cleansing, 
       'Slangwords': df_slangwords, 'StopWords': df_stopwords, 'Tokenizing': df_token,
       'Stemming': df_stemming}

writer = pd.ExcelWriter('./Data_Sesudah_Preprocessed.xlsx', engine='xlsxwriter', mode='w')

for data_sheet in dataAll.keys():
    dataAll[data_sheet].to_excel(writer, sheet_name=data_sheet, index=False)

writer.close()

print("\n\n\nData Preprocessed Saved!!!")

ketua pssi mundur sukarela bukti tanggung moral mundur sukarela hukum tegak hukum moral gerak moral republik indonesia gehumri
larang intervensi atur fifa pssi santai pura dengar pssi
mas ketua pssi menpora paragames tuju peristiwa hubung fifa menteri bumn kelas hubung menpora iwan bule indonesia sentuh hukum fifa hubung olimpiade london gagal
orang rangkap jabat
yunus nus kenang stadion standar fifa
pssi intervensi katagori ormas ormas agama henti paksa agama pribadi ormas agama bahaya makan korban kali
pensiun main sepak bola jajar urus
suruh susah suruh ketua mundur
saran ismed sofyan angkat kapolda jatim iwan bule ketua pssi
momen klb pssi mayoritas pilih ketua
thadalah not true not correct tepat contoh hirau polemik asnawi
pimpin pssi rumit ekspektasi kadang tuntut mau serba instan mantan dirut kai ignasius jonan bukti hasil pimpin kai segar level
bahas jokowi kaesang nana merry pssi asuransi hasil m boneka gain peri truk gila anxiousnya tempat ramai bahas nyata stay jugaa bahas g

In [23]:
import pandas as pd

data = pd.read_csv("Final_Sesudah_Kanjuruhan.csv", sep=';', encoding='unicode_escape')
# data = pd.read_csv("Final_Sesudah_Kanjuruhan.csv", sep=';', encoding='unicode_escape')
data

Unnamed: 0,id,conversation_id,created_at,user_id,username,label,tweet
0,"1, 58E+18","1, 58E+18",2022-10-16 05:18:19 SE Asia Standard Time,1 ...,alhusaini_asror,Negatif,"@mohmahfudmd Benar dan Betul.Ketua PSSI,mundur..."
1,"1, 58E+18","1, 58E+18",2022-10-16 05:13:51 SE Asia Standard Time,1442933096,metmalamminggu,Negatif,"@iIhamzada Dilarang intervensi gara"" aturan fi..."
2,"1, 58E+18","1, 58E+18",2022-10-16 05:06:34 SE Asia Standard Time,7 ...,onedayasmine,Negatif,@gibran_tweet Mas.. jadi ketua PSSI yaaa Kalau...
3,"1, 58E+18","1, 58E+18",2022-10-16 05:03:44 SE Asia Standard Time,4812772283,harryunited05,Negatif,@medioclubID @PSSI mana orangnya itu2 aja lagi...
4,"1, 58E+18","1, 58E+18",2022-10-16 05:00:24 SE Asia Standard Time,1 ...,ariflabmed,Positif,@medioclubID @PSSI Yunus Nusi akan dikenang se...
...,...,...,...,...,...,...,...
1233,"1, 58E+18","1, 58E+18",2022-10-07 11:47:59 SE Asia Standard Time,521254614,bodoamath,Negatif,lho kapolda sama ketua pssi nya mana? ...
1234,"1, 58E+18","1, 58E+18",2022-10-07 11:47:24 SE Asia Standard Time,1 ...,crypto34209511,Negatif,@PSSI HAK DAN KEWAJIBAN PENONTON HARUS DI INGA...
1235,"1, 58E+18","1, 58E+18",2022-10-07 11:43:10 SE Asia Standard Time,2156012929,gbrand_9127,Negatif,@CNNIndonesia PSSI juga gk standar FIFA. ...
1236,"1, 58E+18","1, 58E+18",2022-10-07 11:41:33 SE Asia Standard Time,480858414,grrraargh,Negatif,@ainurohman Dia pikir PSSI cuma urus timnas ...


In [25]:
tweets_positive = [t for t in data["label"] if t == "Positif"]
tweets_negative = [t for t in data["label"] if t == "Negatif"]

print("Hasil Sentimen Analisis")
print("Positif : ", len(tweets_positive), "({}%)".format(100*len(tweets_positive)/len(data)))
print("Negatif : ", len(tweets_negative), "({}%)".format(100*len(tweets_negative)/len(data)))
print("TOTAL : ", len(data))

Hasil Sentimen Analisis
Positif :  619 (50.0%)
Negatif :  619 (50.0%)
TOTAL :  1238
