In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import json
import reprlib

In [2]:
import nltk
# Make sure the 'vader_lexicon' NLTK data package is available locally.
# When it is already present the call only reports it as up to date.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ANZ007\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Load the tweet dataset; the cells below read and rewrite its 'tweet' column.
# NOTE(review): the file name suggests the tweets were already preprocessed
# (Sastrawi stemming) and stored as token lists -- confirm against the CSV.
df = pd.read_csv('data/tweets_sastrawi.csv')

In [4]:
# NOTE(review): not referenced by any visible cell; kept in case another cell uses it.
temp_detokenize = []

# Build the detokenizer once instead of once per tweet.
_DETOKENIZER = TreebankWordDetokenizer()


def detokenize(text):
    """Turn the string form of a token list back into a plain sentence.

    Args:
        text: A string such as "['harga', 'telur', 'anjlok']". Square
            brackets, double quotes and single quotes are removed and the
            remainder is split on commas.

    Returns:
        The tokens joined into one string by TreebankWordDetokenizer.
        An input with no tokens (for example "[]" or "") yields "".
    """
    # Drop list brackets and both kinds of quote characters: "tok1, tok2, ...".
    bare = text.replace(']', '').replace('[', '').replace('"', '').replace("'", "")
    # Splitting on "," leaves the blank after each comma glued to the next
    # token; trim it so the joined text has no doubled spaces and the
    # detokenizer's punctuation rules see single-space separators.
    tokens = [piece.strip() for piece in bare.split(",")]
    # Skip empty pieces (empty list, trailing comma) so they add no stray spaces.
    return _DETOKENIZER.detokenize([tok for tok in tokens if tok])


# Replace missing tweets with '' first: a plain string cast would turn them
# into the literal word 'nan', which would then be scored like tweet text.
df['tweet'] = df['tweet'].fillna('').astype('U').apply(detokenize)

In [5]:
# Use NLTK's VADER scorer with custom lexicons, one analyzer per lexicon:
# sia1A -> InSet negative, sia1B -> InSet positive, sia2 -> swear words.
sia1A, sia1B, sia2 = SentimentIntensityAnalyzer(), SentimentIntensityAnalyzer(), SentimentIntensityAnalyzer()
# Empty the default VADER lexicon of every analyzer.
sia1A.lexicon.clear()
sia1B.lexicon.clear()
sia2.lexicon.clear()

# Read the InSet lexicon.
# InSet is split into two files, negative polarity and positive polarity;
# only the compound score is used later to label a sentence.
# The files are opened as UTF-8 explicitly so that decoding does not depend
# on the platform's default text encoding.
with open('data/lexicon/InSet/negative.json', encoding='utf-8') as f:
    data1A = f.read()
with open('data/lexicon/InSet/positive.json', encoding='utf-8') as f:
    data1B = f.read()

# Read the swear-word lexicon.
with open('data/lexicon/swear-words.json', encoding='utf-8') as f:
    data2 = f.read()

# Parse the lexicons into dictionaries.
insetNeg = json.loads(data1A)
insetPos = json.loads(data1B)
# senti = json.loads(data2)

# Load the custom entries into the emptied VADER lexicons.
# sia2 stays empty for as long as the swear-word update below is commented out.
# NOTE(review): some keys are capitalised or multi-word (e.g. 'Anda',
# 'Sri paduka'); confirm whether VADER's token lookup can ever match them.
sia1A.lexicon.update(insetNeg)
sia1B.lexicon.update(insetPos)
# sia2.lexicon.update(senti)

# Preview the first few entries of each lexicon.
print(reprlib.repr(sia1A.lexicon))
print(reprlib.repr(sia1B.lexicon))
# print(reprlib.repr(sia2.lexicon))

{'(barang) bekas': -4, '(olahraga) bokser': -5, '(tua) uzur': -3, 'Anda': -4, ...}
{'(hujan) gerimis': 1, '(warna) dadu': 3, 'Ahad': 3, 'Sri paduka': 4, ...}


In [6]:
# Score one example sentence with each analyzer and show the summed InSet compound.
sample = "kalau kamu sudah sampai sini kamu hebat ayo terus kamu pasti bisa"
neg_scores = sia1A.polarity_scores(sample)
pos_scores = sia1B.polarity_scores(sample)
print("insetNeg: ", neg_scores)
print("insetPos: ", pos_scores)
print("insetCpdSum: 'compound':", neg_scores["compound"] + pos_scores["compound"])

# sia2's lexicon was cleared and never refilled, so this shows its empty-lexicon result.
print("senti\t: ", sia2.polarity_scores(sample))

insetNeg:  {'neg': 0.526, 'neu': 0.474, 'pos': 0.0, 'compound': -0.875}
insetPos:  {'neg': 0.0, 'neu': 0.333, 'pos': 0.667, 'compound': 0.9517}
insetCpdSum: 'compound': 0.07669999999999999
senti	:  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [7]:
def is_positive_inset(tweet: str) -> bool:
    """Tell whether the combined InSet compound score of ``tweet`` is above zero.

    The compound score from the negative-lexicon analyzer (``sia1A``) is added
    to the one from the positive-lexicon analyzer (``sia1B``). A sum of
    exactly 0 counts as not positive.
    """
    negative_part = sia1A.polarity_scores(tweet)["compound"]
    positive_part = sia1B.polarity_scores(tweet)["compound"]
    return negative_part + positive_part > 0

# def is_positive_senti(tweet: str) -> bool:
#     """True if tweet has positive compound sentiment, False otherwise."""
#     return sia2.polarity_scores(tweet)["compound"] > 0

In [9]:
# Label every tweet as "Positif" or "Negatif" from its combined InSet score.
# The index is reset before labelling and the labels are assigned by position,
# so each label stays on the row it was computed from whatever index `df` has
# (the earlier version built the labels in a separate frame and relied on
# index alignment).
# NOTE: a tweet whose combined score is exactly 0 is labelled "Negatif".
df2 = df[['tweet']].reset_index(drop=True)
df2['sentimen'] = [
    "Positif" if is_positive_inset(tweet) else "Negatif"
    for tweet in df2['tweet']
]
df2

Unnamed: 0,tweet,sentimen
0,jbharga ayam rm tgk telor wyam papan rm...,Negatif
1,diri di atas kaki sendiri serap telur t...,Negatif
2,sebab harga telur anjlok versi dagang kaget,Negatif
3,turun minta masyarakat telur ayam ppkm l...,Negatif
4,ternak ayam telur aku rugi ratus juta r...,Positif
...,...,...
2874,blitar harga telur ayam anjlok ternak an...,Negatif
2875,titip sendal baca artikel nasi goreng go...,Negatif
2876,suka mooncake isi telur daerah mahal mah...,Positif
2877,harga telur anjlok ternak jateng babak bur,Negatif


In [10]:
# Report how many tweets received each sentiment label.
for label in ("Positif", "Negatif"):
    print(label + " :", len(df2[df2.sentimen == label]), " tweet")

Positif : 1649  tweet
Negatif : 1230  tweet
