# Sentistrength Label

Sentiment Strength Detection in Bahasa Indonesia.

Please cite this paper if you use this program:

Wahid, D. H., & Azhari, S. N. (2016). Peringkasan Sentimen Esktraktif di Twitter Menggunakan Hybrid TF-IDF dan Cosine Similarity. IJCCS (Indonesian Journal of Computing and Cybernetics Systems), 10(2), 207-218.

## Import Libary

In [None]:
import re
from collections import OrderedDict
import numpy as np

## Code

In [None]:
class sentistrength:
    def __init__(self, config=dict()):
        self.negasi = [line.replace('\n','') for line in open("/content/drive/MyDrive/RISET SENTIMENT ANALYSIS/Coding/sentistrength-id/negatingword.txt").read().splitlines()]
        self.tanya = [line.replace('\n','') for line in open("/content/drive/MyDrive/RISET SENTIMENT ANALYSIS/Coding/sentistrength-id/negatingword.txt").read().splitlines()]
        #create sentiment words dictionary
        self.sentiwords_txt = [line.replace('\n','').split(":") for line in open("/content/drive/MyDrive/RISET SENTIMENT ANALYSIS/Coding/sentistrength-id/sentiwords_id.txt").read().splitlines()]
        self.sentiwords_dict = OrderedDict()
        for term in self.sentiwords_txt:
            self.sentiwords_dict[term[0]] = int(term[1])
        #create emoticon dictionary
        self.emoticon_txt = [line.replace('\n','').split(" | ") for line in open("/content/drive/MyDrive/RISET SENTIMENT ANALYSIS/Coding/sentistrength-id/emoticon_id.txt").read().splitlines()]
        self.emoticon_dict = OrderedDict()
        for term in self.emoticon_txt:
            self.emoticon_dict[term[0]] = int(term[1])
        #create idioms dictionary
        self.idioms_txt = [line.replace('\n','').split(":") for line in open("/content/drive/MyDrive/RISET SENTIMENT ANALYSIS/Coding/sentistrength-id/idioms_id.txt").read().splitlines()]
        self.idioms_dict = OrderedDict()
        for term in self.idioms_txt:
            self.idioms_dict[term[0]] = int(term[1])
        #create boosterwords dictionary
        self.boosterwords_txt = [line.replace('\n','').split(":") for line in open("/content/drive/MyDrive/RISET SENTIMENT ANALYSIS/Coding/sentistrength-id/boosterwords_id.txt").read().splitlines()]
        self.boosterwords_dict = OrderedDict()
        for term in self.boosterwords_txt:
            self.boosterwords_dict[term[0]] = int(term[1])
        self.negation_conf = config["negation"]
        self.booster_conf = config["booster"]
        self.ungkapan_conf = config["ungkapan"]
        self.consecutive_conf = config["consecutive"]
        self.repeated_conf = config["repeated"]
        self.emoticon_conf = config["emoticon"]
        self.question_conf = config["question"]
        self.exclamation_conf = config["exclamation"]
        self.punctuation_conf = config["punctuation"]
        self.mean_conf = False

    def senti(self,term):
        try:
            return self.sentiwords_dict[term]
        except:
            return 0

    def emosikon(self,term):
        try:
            return self.emoticon_dict[term]
        except:
            return 0

    def ungkapan(self,term):
        try:
            return self.idioms_dict[term]
        except:
            return 0

    def booster(self, term):
        try:
            return self.boosterwords_dict[term]
        except:
            return 0

    def cek_negationword(self, prev_term, prev_term2):
        #jika kata sebelumnya (index-1) adalah kata negasi, negasikan nilai -+nya
        if prev_term in self.negasi or prev_term2+" "+prev_term in self.negasi:
            # print prev_term
            self.score = -abs(self.score) if self.score>0 else abs(self.score)

    def cek_boosterword(self,term):
        booster_score = self.booster(term)
        if booster_score !=0 and self.score>0: self.score += booster_score
        if booster_score !=0 and self.score<0: self.score -= booster_score

    def cek_consecutive_term(self, prev_term):
        if self.prev_score>0 and self.score >=3: self.score+=1 
        if self.prev_score<0 and self.score <=-3: self.score-=1 

    def cek_ungkapan(self, bigram,trigram, i):
        bigram = ' '.join(bigram)
        trigram = ' '.join(trigram)
        ungkapan_score = self.ungkapan(bigram)
        if ungkapan_score==0:
            ungkapan_score = self.ungkapan(trigram)
        if ungkapan_score!=0:
            self.score = ungkapan_score
            self.prev_score = 0
            self.pre_max_pos[i-1] = 1
            self.pre_max_neg[i-1] = -1
            self.max_pos = self.pre_max_pos[i-2] #if len(self.pre_max_pos)>1 else 1
            self.max_neg = self.pre_max_neg[i-2] #if len(self.pre_max_neg)>1 else -1
            self.sentence_score[i-1] = re.sub(r'\[\d\]','',self.sentence_score[i-1])

    def cek_repeated_punctuation(self, next_term):
        if re.search(r'!{2,}',next_term) and self.score >=3: self.score+=1
        if re.search(r'!{2,}',next_term) and self.score <=-3: self.score-=1

    def remove_extra_repeated_char(self, term):
        return re.sub(r'([A-Za-z])\1{2,}',r'\1',term)
    def plural_to_singular(self, term):
        return re.sub(r'([A-Za-z]+)\-\1', r'\1',term)
    def classify(self):
        result = "neutral"
        try:
            if self.mean_conf:
                mean_p = np.mean(self.mean_pos)
                mean_n = np.mean(self.mean_neg)
                # print mean_p, mean_n
                if mean_p > mean_n:
                    result = "positive"
                elif mean_p < mean_n and not self.is_tanya:
                    result = "negative"
                elif mean_p < mean_n and self.is_tanya:
                    result = "neutral"
            else:
                if abs(self.sentences_max_pos) > abs(self.sentences_max_neg):
                    result = "positive"
                elif abs(self.sentences_max_pos) < abs(self.sentences_max_neg):
                    result = "negative"
                elif abs(self.sentences_max_pos) == abs(self.sentences_max_neg):
                    result = "neutral"
        except:
            print("error ",self.sentences_max_pos, self.sentences_max_neg)
        return result
    def cek_neutral_term(self,terms,i):
        if terms[i-1] in self.neutral_term or terms[i+1] in self.neutral_term: self.score=1 

    def main(self,sentence):
        self.neutral_term = ['jika','kalau']
        sentences = sentence.split('.')
        self.sentences_max_neg = -1
        self.sentences_max_pos = 1
        self.sentences_score = []
        self.sentences_text = []
        for sentence in sentences:
            self.max_neg = -1
            self.max_pos = 1
            self.mean_neg = [1]
            self.mean_pos = [1]
            self.sentence_score=[]
            terms = sentence.split()
            # terms = re.split(r'[\s,.]',sentence)
            terms_length = len(terms)
            self.is_tanya = False
            self.sentence_text = ''
            # print self.max_pos, self.max_neg
            #SEMUA KALIMAT YANG MEMILIKI TANDA SERU MEMILIKI +ve minimal 2
            if self.exclamation_conf and re.search('!',sentence): self.max_pos = 2
            self.prev_score = 0
            self.pre_max_pos = []
            self.pre_max_neg = []
            for i,term in enumerate(terms):
                # repeated_term = ''
                is_extra_char = False
                plural = ''
                self.score = 0
                # if re.search(r'[A-Za-z\-.]+',term):
                # print term
                if re.search(r'([A-Za-z])\1{3,}',term):
                    is_extra_char = True
                    # repeated_term =term
                term = self.remove_extra_repeated_char(term)
                if re.search(r'([A-Za-z]+)\-\1',term):
                    plural = term
                    term = self.plural_to_singular(term)
                #GET SENTI SCORE#
                self.score = self.senti(term)
                # print "senti score",term, self.score

                #NEGATION HANDLER#
                if self.negation_conf and self.score !=0 and i>0:self.cek_negationword(terms[i-1],terms[i-2])
                # print  "negation score",term, self.score

                #BOOSTERWORD HANDLER#
                if self.booster_conf and self.score !=0 and i>0 and i<=(terms_length-1):self.cek_boosterword(terms[i-1])
                if self.booster_conf and self.score !=0 and i>=0 and i<(terms_length-1):self.cek_boosterword(terms[i+1])
                # print  "booster score",term, self.score

                #IDIOM/UNGKAPAN HANDLER#
                if self.ungkapan_conf and i>0 and i<=(terms_length-1):self.cek_ungkapan([terms[i-1],term],[terms[i-2],terms[i-1],term],i)
                # if self.ungkapan_conf and i>=0 and i<(terms_length-1):self.cek_ungkapan([term,terms[i+1]])
                # print  "idiom score",term, self.score

                #CONSECUTIVE SENTIMENT WORD#
                if self.consecutive_conf and i>0 and i<=(terms_length-1) and self.score !=0:self.cek_consecutive_term(terms[i-1])
                # print  "consecutive score",term, self.score

                #+1 SENTI SCORE IF REPEATED CHAR ON POSITIVE/NEGATIVE +2 IF NEUTRAL TERM
                if self.repeated_conf and is_extra_char==True and self.score>0: self.score+=1
                if self.repeated_conf and is_extra_char==True and self.score<0: self.score-=1
                if self.repeated_conf and is_extra_char==True and self.score==0: self.score=2
                # print  "repeat char score", term, self.score
                if self.punctuation_conf and i>=0 and i<(terms_length-1): self.cek_repeated_punctuation(terms[i+1])
                # CEK APAKAH TERDAPAT KATA TANYA
                if self.question_conf and (term in self.tanya or re.search(r'\?',term)):self.is_tanya = True
                # CEK neutral term 
                if self.score!=0 and i>1 and i<(terms_length-2): self.cek_neutral_term(terms,i)
                # if self.score!=0 and i>0 and i<(terms_length-4): self.cek_neutral_term(terms,i)
                if self.emoticon_conf and self.score==0: self.score = self.emosikon(term)

                self.prev_score = self.score
                if self.mean_conf and self.score>0: self.mean_pos.append(self.score)	
                if self.mean_conf and self.score<0: self.mean_neg.append(abs(self.score))
                #GET MAX SCORE +ve/-ve	
                self.max_pos= self.score if self.score > self.max_pos else self.max_pos
                self.max_neg= self.score if self.score < self.max_neg else self.max_neg
                #insert score info current term
                self.pre_max_pos.append(self.max_pos)
                self.pre_max_neg.append(self.max_neg)
                # print self.pre_max_pos, self.pre_max_neg
                if plural !='': term = plural
                self.sentence_text += ' {}'.format(term)
                if self.score != 0:term = "{} [{}]".format(term, self.score)
                self.sentence_score.append(term)

            self.sentences_text.append(self.sentence_text)
            self.sentences_score.append(" ".join(self.sentence_score))
            if self.is_tanya: 
                self.max_neg = -1
            self.sentences_max_pos = self.max_pos if self.max_pos > self.sentences_max_pos else self.sentences_max_pos
            self.sentences_max_neg = self.max_neg if self.max_neg < self.sentences_max_neg else self.sentences_max_neg
            # print self.sentences_max_pos, self.sentences_max_neg
        sentence_result = self.classify()
        # print self.sentences_text
        return {"classified_text":". ".join(self.sentences_score),"tweet_text":". ".join(self.sentences_text),"sentence_score":self.sentences_score,"max_positive":self.sentences_max_pos,"max_negative":self.sentences_max_neg,"kelas":sentence_result}

config = dict()
config["negation"] = True
config["booster"]  = True
config["ungkapan"]  = True
config["consecutive"]  = True
config["repeated"]  = True
config["emoticon"]  = True
config["question"]  = True
config["exclamation"]  = True
config["punctuation"]  = True 
senti = sentistrength(config)

## Testing

In [None]:
print(senti.main("Ini setelah vaksin booster jadi batukkkk.. mana awet sampe skrnggggg yaallah cape:'")['kelas'])

negative


## Labelling with Sentistrength

In [None]:
import pandas as pd

In [None]:
df = pd.read_excel('/content/tokopedia_sentiment.xlsx')

In [None]:
df.head()

Unnamed: 0,texts,score,labels
0,"penanganan komplain barang rusak mengecewakan,...",,negative
1,good,,neutral
2,berat appnya,,negative
3,Lebih baik ada fitur mengedit alamat pengirima...,,positive
4,Pembayaran via VA cuma bisa satu kali. Transak...,,positive


In [None]:
df = df.drop(['reviewId', 'userName', 'userImage', 'thumbsUpCount', 'at', 'replyContent', 'repliedAt'], axis=1)

In [None]:
df.rename(columns = {'reviewCreatedVersion':'labels', 'content':'texts'}, inplace = True)

In [None]:
df.count().texts

56465

In [None]:
df = df.astype({"labels": str}, errors='raise') 

In [None]:
for i in range(df.count().texts):
  # if i == 305:
    # continue
  result = senti.main(df.iloc[i]['texts'])['kelas']
  # print(result)
  df.iloc[i, df.columns.get_loc('labels')] = result
  # print(df.iloc[i]['label'])

AttributeError: ignored

In [None]:
df.head(20)

Unnamed: 0,texts,score,labels
0,"penanganan komplain barang rusak mengecewakan,...",,negative
1,good,,neutral
2,berat appnya,,negative
3,Lebih baik ada fitur mengedit alamat pengirima...,,positive
4,Pembayaran via VA cuma bisa satu kali. Transak...,,positive
5,Semakin bagus..,,positive
6,selalu memudahkan pembelian. enaknya nih bisa ...,,neutral
7,jos,,neutral
8,"Connection problem, please fix it",,neutral
9,"192 94 ,414茂73171 44u温暖 @ 呃呃实在9峨眉怎么看什么没人狂热，19 ...",,neutral


In [None]:
df.to_excel('tokopedia_sentiment_label.xlsx',index=False) 

In [None]:
df.value_counts("labels")

labels
nan         207459
neutral       1633
positive       726
negative       386
dtype: int64

# Import Data

In [None]:
#Import Libary
!pip install Sastrawi
!pip install swifter
!pip install emoji
import pandas as pd
import numpy as np
import re
import emoji
import string
import gspread
import seaborn as sns
import matplotlib.pyplot as plt
import swifter
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 4.3 MB/s 
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swifter
  Downloading swifter-1.2.0.tar.gz (658 kB)
[K     |████████████████████████████████| 658 kB 3.5 MB/s 
Collecting psutil>=5.6.6
  Downloading psutil-5.9.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 59.2 MB/s 
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 44.4 MB/s 
Collecting lock

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 4.2 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=dc9188fc793a2ed9d6dd52c1118a1a4dfa2f8a105255c11b0e36a199b5b91bc4
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_excel('/content/drive/MyDrive/Dataset/VaksinasiBoosterDatasetWithLabel.xlsx')
# df = pd.read_excel('/content/dataset.xlsx')

In [None]:
df.head()

Unnamed: 0,label,text
0,netral,hey coba himbau warganya cek seifikat vaksin d...
1,netral,mumpung netizen progresif dan berwawasan diban...
2,negative,ini setelah vaksin booster jadi batuk mana awe...
3,positive,maju loe bible gak takut gw gw dah vaksin booster
4,positive,buat sarapan besok sebelum vaksin booster


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   298 non-null    object
 1   text    298 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


# Preprocess Data

## Drop NaN Column & Drop Duplicate Data

In [None]:
df.dropna(subset = ["label"], axis=0, inplace=True)

In [None]:
df = df.drop_duplicates()

In [None]:
df['label'].value_counts()

negative    187
positive    157
neutral     140
Name: label, dtype: int64

In [None]:
df['text_real'] = df.text

## Remove Link, Hashtag, Mention

In [None]:
def cleanUpTweet(txt):
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt) #Remove mentions
    txt = re.sub(r'#[A-Za-z0-9_]+', '', txt) #Remove hashtags
    txt = re.sub(r'RT ', '', txt) #Remove retweets
    txt = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', txt) #Remove urls
    return txt

In [None]:
def stripEmoji(txt):
  txt = re.sub(emoji.get_emoji_regexp(), r'', txt)
  return txt

In [None]:
df['text'] = df['text'].apply(cleanUpTweet)
df['text'] = df['text'].apply(stripEmoji)

## Remove HTML Char

In [None]:
def removeHTMLChar(text):
    text = ' '.join(re.sub("([&][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    return text

In [None]:
df['text'] = df['text'].apply(removeHTMLChar)

## Remove Punctuation

In [None]:
def remove_punctuation(txt):
  txt = re.sub(r'[^\w\s]', ' ', txt)
  return txt
    # return text.translate(str.maketrans("","",string.punctuation))

In [None]:
df['text'] = df['text'].apply(remove_punctuation)

In [None]:
df['text'].iloc[176]

'Covid     Bikin Vaksin     Vaksin disuntikin     Covid masih ada     Vaksin disuntik lagi     Covid masih ada lagi      booster  disuntikin     Covid udah hampir beres     Muncul Hepatitis  Misterius   Tot ah'

In [None]:
df.head()

Unnamed: 0,label,text,text_real
1,negative,ini setelah vaksin booster jadi batuk mana awe...,Ini setelah vaksin booster jadi batukkkk.. man...
2,positive,maju loe bible gak takut gw gw dah vaksin booster,"RT @rockmansick: MAJU LOE BIBLE GAK TAKUT GW, ..."
4,positive,vaksin booster dulu makannya mas dijamin keman...,@Gondrongsejatix @shitlicious vaksin booster d...
9,positive,di mainland china kasus covid lagi meledak yg ...,RT @risyadazhary: Di mainland china kasus covi...
10,negative,abis vaksin booster lemes bgt eww,abis vaksin booster lemes bgt eww


## Remove Whitespace

In [None]:
def remove_whitespace_LT(text):
    return text.strip()

In [None]:
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

In [None]:
def spell(word):
    if re.search(r'([a-zA-Z])\1{2,}', word):
        rword = re.sub(r'([a-zA-Z])\1{2,}','\\1', word)
        return rword
    else:
        return word

In [None]:
df['text'] = df['text'].apply(remove_whitespace_LT)
df['text'] = df['text'].apply(remove_whitespace_multiple)
df['text'] = df['text'].apply(spell)

## To Lowecase

In [None]:
df["text"] = df["text"].apply(lambda item: str(item).lower())

In [None]:
df.to_excel("VaksinasiBoosterDatasetWithLabel.xlsx")

In [None]:
# Remove Label Netral
df = df[df.label != 'neutral']

## Remove Special Character

In [None]:
def remove_text_special(text):
    # hapus tab, new line, ans back slice
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # hapus non ASCII (emot, bahasa china dll)
    text = text.encode('ascii', 'replace').decode('ascii')
    # hapus mention, link, hashtag
    # text = text.replace("rt", "") # revisi
    text = ' '.join(re.sub("([@#&][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove URL 
    return text.replace("http://", " ").replace("https://", " ")

In [None]:
df['text'] = df.text.astype('str')                
df['text'] = df['text'].apply(remove_text_special)

In [None]:
df['text'].iloc[150]

'efek vaksin booster semalem baru kerasa sekarang mana kerjaan lagi numpuk'

# Tokenize

In [None]:
from nltk.tokenize import word_tokenize 

In [None]:
def word_tokenize_wrapper(text):
    return word_tokenize(text)
    
def tokenize(word):
  word = word.split(" ")
  return word

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
df['text_tokens'] = df['text'].apply(word_tokenize_wrapper)

# Count Word

In [None]:
from collections import defaultdict
word_count = defaultdict(int)
for tokens in df["text_tokens"]:
    for token in tokens:
       word_count[token] += 1

In [None]:
word_count_df = pd.DataFrame({"key": word_count.keys(), "count": word_count.values()})
word_count_df_sort = word_count_df.sort_values('count', axis=0, ascending=False, inplace=False)
word_count_df_sort.head()

Unnamed: 0,key,count
4,vaksin,600
5,booster,513
65,yg,106
51,dan,99
3,udah,79


# Stopword Removal

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords

In [None]:
list_stopwords = stopwords.words('indonesian')
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       'yah'])

txt_stopword = pd.read_csv("/content/drive/MyDrive/Dataset/Sentiment Analysis Booster/stopwords.txt", names= ["stopwords"], header = None)
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))
list_stopwords = set(list_stopwords)
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]
df['text_tokens_WSW'] = df['text_tokens'].apply(stopwords_removal) 

# Stemming

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

In [None]:
df_normalized = pd.DataFrame(df['label'])
df_normalized['text'] = df['text']

In [None]:
normalizad_word = df_normalized

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

df['text_normalized'] = df['text_tokens_WSW'].apply(normalized_term)


# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in df['text_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['text_tokens_stemmed'] = df['text_normalized'].swifter.apply(get_stemmed_term)

1900
------------------------
tahan : tahan
zra : zra
vaksin : vaksin
booster : booster
batuk : batuk
awet : awet
sampe : sampe
skrng : skrng
yaallah : yaallah
cape : cape
maju : maju
bible : bible
takut : takut
sarapan : sarap
besok : besok
makannya : makan
dijamin : jamin
kemana : mana
ajah : ajah
bebas : bebas
mudah : mudah
mudahan : mudah
mudik : mudik
berangkat : berangkat
suntik : suntik
terinfeksi : infeksi
aktif : aktif
berfungsi : fungsi
infeksi : infeksi
virus : virus
covid : covid
19 : 19
tunggu : tunggu
kabar : kabar
satgas : satgas
ayo : ayo
kepentingan : penting
vaksinmu : vaksin
melindungimu : lindung
melindungiku : lindung
melindungi : lindung
mari : mari
ajak : ajak
jalan2 : jalan2
pake : pake
kereta : kereta
jawa : jawa
belom : bom
ibunya : ibu
disuru : disuru
mainland : mainland
china : china
meledak : ledak
baiknya : baik
abis : abis
lemes : lemes
eww : eww
stop : stop
memperparah : parah
mutasi : mutasi
merusak : rusak
kesehatan : sehat
efek : efek
jangka : jangka


Pandas Apply:   0%|          | 0/484 [00:00<?, ?it/s]

# Merge

In [None]:
def merge_again(text):
    return ' '.join(text)

In [None]:
df['done_text'] = df['text_tokens_stemmed'].apply(merge_again)

In [None]:
for i in range(51):
  print(df['done_text'].iloc[i])

tahan zra vaksin booster
vaksin booster batuk awet sampe skrng yaallah cape
maju bible takut vaksin booster
sarap besok vaksin booster
vaksin booster makan jamin mana ajah bebas
mudah mudah mudik berangkat mudik suntik vaksin booster infeksi vaksin booster aktif fungsi infeksi virus covid 19 tunggu kabar satgas
ayo vaksin booster vaksin penting vaksin lindung lindung lindung mari vaksin
ayo vaksin booster
ajak jalan2 pake kereta jawa bom vaksin booster ibu disuru booster takut
mainland china covid ledak booster vaksin baik vaksin
abis vaksin booster lemes eww
stop vaksin booster parah mutasi virus rusak sehat efek jangka panjang timbul penykit tular
kalah lawan vaksin booster
kinn gue gebuk smpe nyakitin porsche gue takut tembak gue udh vaksin booster
vaksin booster hasil swab 
nama mamak kedetek vaksin nunjukin sbnrnya tiga udh sampe booster gatau 1 2x vaksin pas beli tiket
besok udh kosan efek vaksin booster rasa
ayo vaksin booster
canda semenjak vaksin booster ngerasa sakit
efek vak

# Modelling

In [None]:
# df = df[df['label'] != 'neutral']  

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## Setup TFIDF and CountVectorizer

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
tfid = TfidfVectorizer(
    min_df = 3,
    max_df = 0.10,
    ngram_range = (1,2)
)

In [None]:
count_vect = CountVectorizer(
    min_df = 3,
    max_df = 0.10,
    ngram_range = (1,2)
)

In [None]:
tfid_result = tfid.fit_transform(df["text"]).toarray() #
tfid_df = pd.DataFrame(tfid_result, columns = tfid.get_feature_names())
tfid_df.columns = ["word_" + str(x) for x in tfid_df.columns]
tfid_df.index = df.index

In [None]:
cvzr_result = count_vect.fit_transform(df["text"]).toarray() #
cvzr_df = pd.DataFrame(cvzr_result, columns = count_vect.get_feature_names())
cvzr_df.columns = ["word_" + str(x) for x in cvzr_df.columns]
cvzr_df.index = df.index

In [None]:
cvzr_df.columns

Index(['word_19', 'word_2x', 'word_aamiin', 'word_abis', 'word_abis booster',
       'word_abis vaksin', 'word_ada efek', 'word_ada vaksin', 'word_adalah',
       'word_agak',
       ...
       'word_yaa', 'word_yah', 'word_yang mau', 'word_yeay', 'word_yg belum',
       'word_yg bisa', 'word_yg booster', 'word_yg vaksin', 'word_yuk',
       'word_zombie'],
      dtype='object', length=521)

## Basic Model

In [None]:
# extract the labels from the train data
y = df.label.values
# use 70% for the training and 30% for the test
x_train, x_test, y_train, y_test = train_test_split(df.text.values, y, #
stratify=y,
random_state=1,
test_size=0.2, shuffle=True)

In [None]:
# initializing the countvectorizer
vectorizer = CountVectorizer()
# tokenize and make the document into a matrix
document_term_matrix = vectorizer.fit_transform(df['text']) #
# check the result
df_baru = pd.DataFrame(document_term_matrix.toarray(), columns = vectorizer.get_feature_names())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# vectorize tweets for model building
vectorizer = CountVectorizer(binary=True)
# learn a vocabulary dictionary of all tokens in the raw documents
vectorizer.fit(list(x_train) + list(x_test))
# transform documents to document-term matrix
x_train_vec = vectorizer.transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [None]:
from sklearn import svm
# classify using support vector classifier
svm = svm.SVC(kernel = 'linear', probability=True)
# fit the SVC model based on the given training data
prob = svm.fit(x_train_vec, y_train).predict_proba(x_test_vec)
# perform classification and prediction on samples in x_test
y_pred_svm = svm.predict(x_test_vec)

In [None]:
conf_mat = confusion_matrix(y_test, y_pred_svm)
print(conf_mat)
print(classification_report(y_test, y_pred_svm, digits = 4))

[[28 10]
 [ 8 23]]
              precision    recall  f1-score   support

    negative     0.7778    0.7368    0.7568        38
    positive     0.6970    0.7419    0.7188        31

    accuracy                         0.7391        69
   macro avg     0.7374    0.7394    0.7378        69
weighted avg     0.7415    0.7391    0.7397        69



In [None]:
from sklearn.metrics import accuracy_score
print("Accuracy score for SVM is: ", accuracy_score(y_test, y_pred_svm) * 100, '%')

Accuracy score for SVM is:  73.91304347826086 %


## Tuning with RBF Kernel

In [None]:
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
 
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
 
# fitting the model for grid search
grid.fit(x_train_vec, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.527 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.545 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.527 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.545 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [None]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001)


In [None]:
grid_predictions = grid.predict(x_test_vec)
 
# print classification report
print(classification_report(y_test, grid_predictions))
print("Accuracy score for SVM is: ", accuracy_score(y_test, grid_predictions) * 100, '%')

              precision    recall  f1-score   support

    negative       0.76      0.76      0.76        38
    positive       0.71      0.71      0.71        31

    accuracy                           0.74        69
   macro avg       0.74      0.74      0.74        69
weighted avg       0.74      0.74      0.74        69

Accuracy score for SVM is:  73.91304347826086 %


## Tuning with Linear Kernel

In [None]:
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear']}
 
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
 
# fitting the model for grid search
grid.fit(x_train_vec, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.764 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.855 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.636 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.818 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.691 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.764 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.855 total time=   0.0s
[CV 3/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.636 total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.818 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.691 total time=   0.0s
[CV 1/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.764 total time=   0.0s
[CV 2/5] END ..C=0.1, gamma=0.01, kernel=linear

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear']},
             verbose=3)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)

{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=0.1, gamma=1, kernel='linear')


In [None]:
grid_predictions = grid.predict(x_test_vec)

print(classification_report(y_test, grid_predictions))
print("Accuracy score for SVM is: ", accuracy_score(y_test, grid_predictions) * 100, '%')

              precision    recall  f1-score   support

    negative       0.74      0.82      0.78        38
    positive       0.74      0.65      0.69        31

    accuracy                           0.74        69
   macro avg       0.74      0.73      0.73        69
weighted avg       0.74      0.74      0.74        69

Accuracy score for SVM is:  73.91304347826086 %


## Tuning with Polynomial Kernel

In [None]:
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['poly']}
 
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
 
# fitting the model for grid search
grid.fit(x_train_vec, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.618 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.764 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.618 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.745 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.636 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.564 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.564 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.545 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.545 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.527 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.545 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.01, kernel=poly

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['poly']},
             verbose=3)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)

{'C': 1, 'gamma': 0.1, 'kernel': 'poly'}
SVC(C=1, gamma=0.1, kernel='poly')


In [None]:
grid_predictions = grid.predict(x_test_vec)

print(classification_report(y_test, grid_predictions))
print("Accuracy score for SVM is: ", accuracy_score(y_test, grid_predictions) * 100, '%')

              precision    recall  f1-score   support

    negative       0.69      0.82      0.75        38
    positive       0.71      0.55      0.62        31

    accuracy                           0.70        69
   macro avg       0.70      0.68      0.68        69
weighted avg       0.70      0.70      0.69        69

Accuracy score for SVM is:  69.56521739130434 %


## Tuning with Sigmoid Kernel

In [None]:
from sklearn.model_selection import GridSearchCV
 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['sigmoid']}
 
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
 
# fitting the model for grid search
grid.fit(x_train_vec, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 3/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 4/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 5/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.527 total time=   0.0s
[CV 1/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 2/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 3/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 4/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 5/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.527 total time=   0.0s
[CV 1/5] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.545 total time=   0.0s
[CV 2/5] END .C=0.1, gamma=0.01, kernel=sigmoid

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['sigmoid']},
             verbose=3)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.001, 'kernel': 'sigmoid'}
SVC(C=100, gamma=0.001, kernel='sigmoid')


In [None]:
grid_predictions = grid.predict(x_test_vec)

print(classification_report(y_test, grid_predictions))
print("Accuracy score for SVM is: ", accuracy_score(y_test, grid_predictions) * 100, '%')

              precision    recall  f1-score   support

    negative       0.74      0.82      0.78        38
    positive       0.74      0.65      0.69        31

    accuracy                           0.74        69
   macro avg       0.74      0.73      0.73        69
weighted avg       0.74      0.74      0.74        69

Accuracy score for SVM is:  73.91304347826086 %


## Compare with Different Tuning Model

In [None]:
kernels = ['Polynomial', 'RBF', 'Sigmoid','Linear']#A function which returns the corresponding SVC model
def getClassifier(ktype):
    if ktype == 0:
        # Polynomial kernal
        return SVC(kernel='poly', degree=8, gamma="auto")
    elif ktype == 1:
        # Radial Basis Function kernal
        return SVC(kernel='rbf', gamma="auto")
    elif ktype == 2:
        # Sigmoid kernal
        return SVC(kernel='sigmoid', gamma="auto")
    elif ktype == 3:
        # Linear kernal
        return SVC(kernel='linear', gamma="auto")

In [None]:
for i in range(4):
    svclassifier = getClassifier(i) 
    svclassifier.fit(x_train_vec, y_train)
    y_pred = svclassifier.predict(x_test_vec)
    print("Evaluation:", kernels[i], "kernel")
    print(classification_report(y_test,y_pred))

Evaluation: Polynomial kernel
              precision    recall  f1-score   support

    negative       0.55      1.00      0.71        38
    positive       0.00      0.00      0.00        31

    accuracy                           0.55        69
   macro avg       0.28      0.50      0.36        69
weighted avg       0.30      0.55      0.39        69

Evaluation: RBF kernel
              precision    recall  f1-score   support

    negative       0.55      1.00      0.71        38
    positive       0.00      0.00      0.00        31

    accuracy                           0.55        69
   macro avg       0.28      0.50      0.36        69
weighted avg       0.30      0.55      0.39        69

Evaluation: Sigmoid kernel
              precision    recall  f1-score   support

    negative       0.55      1.00      0.71        38
    positive       0.00      0.00      0.00        31

    accuracy                           0.55        69
   macro avg       0.28      0.50      0.36     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

In [None]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_train_vec, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   0.0s
[CV] END .....................C=0.1, gamma=1, k

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             verbose=2)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
SVC(C=10, gamma=0.01)


In [None]:
grid_predictions = grid.predict(x_test_vec)

print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

[[30 11  8]
 [ 6 22  9]
 [12  8 21]]
              precision    recall  f1-score   support

    negative       0.62      0.61      0.62        49
     neutral       0.54      0.59      0.56        37
    positive       0.55      0.51      0.53        41

    accuracy                           0.57       127
   macro avg       0.57      0.57      0.57       127
weighted avg       0.58      0.57      0.57       127



In [None]:
from sklearn.metrics import accuracy_score
print("Accuracy score for SVM is: ", accuracy_score(y_test, y_pred_svm) * 100, '%')

Accuracy score for SVM is:  55.90551181102362 %
