In [14]:
import pandas as pd
import nltk, csv, string, random, os, pickle, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

# Download every NLTK resource this notebook requests.
for _nltk_package in (
    'stopwords',
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'omw-1.4',
):
    nltk.download(_nltk_package)
del _nltk_package  # keep the loop variable out of the notebook namespace

# Shared text-preprocessing resources used by pre_processing().
PUNCTUATIONS = string.punctuation
INDONESIAN_STOPWORDS = stopwords.words('indonesian')
STEMMER = StemmerFactory().create_stemmer()

# Module-level state filled in by the init*/training* functions below.
# Suffix 1 belongs to the Buzzer/Not model, suffix 2 to the sentiment model.
dataset1, dataset2 = [], []
list_words1, list_words2 = [], []
labeled1, labeled2 = [], []
# 0 is only a placeholder until a pipeline is trained or loaded from disk.
classifier1 = classifier2 = 0

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\derry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\derry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\derry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\derry\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\derry\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\derry\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-

In [15]:
def pre_processing(sentence):
    """Tokenize a sentence and reduce it to stemmed, purely alphabetic tokens.

    Steps, in order: NLTK word tokenization, removal of tokens found in
    INDONESIAN_STOPWORDS, stemming with STEMMER, and removal of every stem
    that is not purely alphabetic.

    The sentence is not case-folded here, and the stop-word check is an exact
    string match, so callers that want case-insensitive stop-word removal
    must lowercase the sentence before calling.

    Args:
        sentence: Text to process.

    Returns:
        list[str]: The surviving stems, in their original order.
    """
    # Build the lookup set once per call: set membership is O(1) per token,
    # whereas testing against the stop-word list scans it for every token.
    stopword_lookup = set(INDONESIAN_STOPWORDS)
    stems = (
        STEMMER.stem(token)
        for token in word_tokenize(sentence)
        if token not in stopword_lookup
    )
    # str.isalpha() is False for the empty string and for anything containing
    # punctuation, so it already rejects everything the former
    # `word not in PUNCTUATIONS` pass rejected.
    return [stem for stem in stems if stem.isalpha()]

In [16]:
def init1(csv_path="dataset_tweet_sentiment_pilkada_DKI_2017 (1) - dataset_tweet_sentiment_pilkada_DKI_2017 (1).csv"):
    """Load the tweet CSV and prepare the inputs for model 1 (Buzzer/Not).

    Populates two module-level globals:

    * ``labeled1``: one ``(tweet_text, label)`` pair per CSV row, taken from
      the 'Text Tweet' and 'Buzzer/Not' columns (text kept in original case).
    * ``list_words1``: the 300 most frequent tokens produced by
      ``pre_processing`` over the lowercased tweets.

    Both globals are rebuilt from scratch on every call, so re-running the
    cell gives the same result instead of appending to earlier contents.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the dataset file
            this notebook was written against.

    Raises:
        KeyError: If a row lacks the 'Text Tweet' or 'Buzzer/Not' column.
    """
    global list_words1, labeled1

    # newline='' is the csv module's documented way to open files so that
    # quoted fields containing line breaks are parsed correctly.
    with open(csv_path, encoding='UTF-8', newline='') as file:
        rows = list(csv.DictReader(file))

    vocabulary_tokens = []
    labeled_rows = []
    for row in rows:
        tweet = row['Text Tweet']
        vocabulary_tokens.extend(pre_processing(tweet.lower()))
        labeled_rows.append((tweet, row['Buzzer/Not']))

    # Assign only after the whole file was processed, so a failure part-way
    # through leaves the previous globals untouched.
    labeled1 = labeled_rows
    list_words1 = [word for word, _ in FreqDist(vocabulary_tokens).most_common(300)]

In [17]:
def count_and_balance_labels():
    """Downsample ``labeled1`` so each label keeps at most the rarest label's count.

    Reads the module-level ``labeled1`` list of ``(sentence, label)`` pairs,
    prints the label counts before and after balancing, and keeps the first
    ``min_count`` pairs of each label in their original order, where
    ``min_count`` is the number of rows of the least frequent label.
    ``labeled1`` itself is not modified.

    Returns:
        list[tuple]: The balanced ``(sentence, label)`` pairs; an empty list
        when ``labeled1`` is empty.
    """
    # NOTE(review): an identical definition of this function follows in the
    # next cell and overrides this one; one of the two copies can be deleted.
    label_counts = Counter(label for _, label in labeled1)
    print(f"Label counts before balancing: {label_counts}")

    # default=0 keeps an empty dataset from raising ValueError inside min().
    min_count = min(label_counts.values(), default=0)

    balanced_data = []
    # Seeded with the two expected labels so the printed summary keeps its
    # usual shape; any other label is added on first sight instead of
    # raising KeyError.
    label_counter = {'Buzzer': 0, 'Not': 0}

    for sentence, label in labeled1:
        kept = label_counter.get(label, 0)
        if kept < min_count:
            balanced_data.append((sentence, label))
            label_counter[label] = kept + 1

    print(f"Label counts after balancing: {label_counter}")
    return balanced_data

In [18]:
def count_and_balance_labels():
    """Downsample ``labeled1`` so each label keeps at most the rarest label's count.

    Reads the module-level ``labeled1`` list of ``(sentence, label)`` pairs,
    prints the label counts before and after balancing, and keeps the first
    ``min_count`` pairs of each label in their original order, where
    ``min_count`` is the number of rows of the least frequent label.
    ``labeled1`` itself is not modified.

    Returns:
        list[tuple]: The balanced ``(sentence, label)`` pairs; an empty list
        when ``labeled1`` is empty.
    """
    # NOTE(review): this cell redefines the function already defined in the
    # preceding cell; being the later definition, this is the one in effect.
    label_counts = Counter(label for _, label in labeled1)
    print(f"Label counts before balancing: {label_counts}")

    # default=0 keeps an empty dataset from raising ValueError inside min().
    min_count = min(label_counts.values(), default=0)

    balanced_data = []
    # Seeded with the two expected labels so the printed summary keeps its
    # usual shape; any other label is added on first sight instead of
    # raising KeyError.
    label_counter = {'Buzzer': 0, 'Not': 0}

    for sentence, label in labeled1:
        kept = label_counter.get(label, 0)
        if kept < min_count:
            balanced_data.append((sentence, label))
            label_counter[label] = kept + 1

    print(f"Label counts after balancing: {label_counter}")
    return balanced_data

In [19]:
def training1(balanced_data):
    """Train, evaluate and persist the Buzzer/Not classifier (model 1).

    Each sentence becomes a dict of boolean features, one per word in the
    module-level ``list_words1``, flagging whether that word occurs among the
    sentence's preprocessed tokens. The samples are split 90/10 with
    ``random_state=42``; a DictVectorizer + RBF-kernel SVC pipeline is fitted
    on the training part and accuracy, classification report and confusion
    matrix for the held-out part are printed.

    Side effects: replaces the globals ``dataset1`` (the feature/label pairs)
    and ``classifier1`` (the fitted pipeline) and writes the pipeline to
    ``model1.pickle``.

    Args:
        balanced_data: Iterable of ``(sentence, label)`` pairs. Rows whose
            label is neither "Buzzer" nor "Not" are reported and skipped.

    Raises:
        ValueError: If no row carries a usable label.
    """
    global dataset1, classifier1

    valid_labels = ("Not", "Buzzer")
    samples = []
    for sentence, label in balanced_data:
        label_text = str(label)
        if label_text not in valid_labels:
            print(f"  [!] WARNING: UNEXPECTED LABEL VALUE '{label_text}'")
            continue

        # Lowercase before preprocessing, matching how init1 built the
        # vocabulary and how the prediction cell prepares its input;
        # otherwise capitalised stop words slip through at training time only.
        tokens = set(pre_processing(sentence.lower()))
        features = {feature: feature in tokens for feature in list_words1}
        samples.append((features, label_text))

    if not samples:
        raise ValueError("training1 received no samples labelled 'Buzzer' or 'Not'")

    # Replace rather than extend the global, so re-running the cell does not
    # duplicate rows (which would also leak test rows into the training part).
    dataset1 = samples
    X, y = zip(*dataset1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    clf_pipeline = Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        ('classifier', SVC(kernel='rbf'))
    ])

    clf_pipeline.fit(X_train, y_train)
    classifier1 = clf_pipeline

    y_pred = clf_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("  MODEL TRAINING ACCURACY (Model 1): " + str(accuracy * 100) + " %")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # The context manager closes the file even if pickling fails.
    with open("model1.pickle", "wb") as file:
        pickle.dump(classifier1, file)


In [20]:
def init2(csv_path="dataset_tweet_sentiment_pilkada_DKI_2017 (1) - dataset_tweet_sentiment_pilkada_DKI_2017 (1).csv", seed=42):
    """Load the tweet CSV and prepare the inputs for model 2 (sentiment).

    Rows are read positionally: column index 3 is used as the tweet text and
    column index 1 as the label. The rows are shuffled, then two module-level
    globals are rebuilt from scratch:

    * ``labeled2``: one ``(tweet_text, label)`` pair per data row (text kept
      in original case).
    * ``list_words2``: the 300 most frequent tokens produced by
      ``pre_processing`` over the lowercased tweets.

    Args:
        csv_path: Path of the CSV file to read; its first row is treated as a
            header. Defaults to the dataset file this notebook was written
            against.
        seed: Seed for the row shuffle, so the row order (and therefore the
            later train/test split) is reproducible. Pass ``None`` for a
            different order on every call.
    """
    global list_words2, labeled2

    # newline='' is the csv module's documented way to open files so that
    # quoted fields containing line breaks are parsed correctly.
    with open(csv_path, encoding='UTF-8', newline='') as file:
        reader = csv.reader(file)
        # The first row holds the column names (init1 reads this same file
        # with DictReader), so it must not be ingested as a tweet.
        next(reader, None)
        # Drop blank or short rows that have no tweet column.
        rows = [row for row in reader if len(row) > 3]

    # A dedicated generator keeps the shuffle reproducible without touching
    # the global random state.
    random.Random(seed).shuffle(rows)

    vocabulary_tokens = []
    labeled_rows = []
    for row in rows:
        tweet = row[3]
        vocabulary_tokens.extend(pre_processing(tweet.lower()))
        labeled_rows.append((tweet, row[1]))

    # Assign only after the whole file was processed, so a failure part-way
    # through leaves the previous globals untouched.
    labeled2 = labeled_rows
    list_words2 = [word for word, _ in FreqDist(vocabulary_tokens).most_common(300)]

In [21]:
def training2():
    """Train, evaluate and persist the sentiment classifier (model 2).

    Reads the module-level ``labeled2`` list of ``(sentence, label)`` pairs.
    Each sentence becomes a dict of boolean features, one per word in the
    module-level ``list_words2``, flagging whether that word occurs among the
    sentence's preprocessed tokens. The samples are split 90/10 with
    ``random_state=42``; a DictVectorizer + RBF-kernel SVC pipeline is fitted
    on the training part and accuracy, classification report and confusion
    matrix for the held-out part are printed.

    Rows whose label is neither "negative" nor "positive" are reported and
    skipped.

    Side effects: replaces the globals ``dataset2`` (the feature/label pairs)
    and ``classifier2`` (the fitted pipeline) and writes the pipeline to
    ``model2.pickle``.

    Raises:
        ValueError: If no row carries a usable label.
    """
    global dataset2, classifier2

    valid_labels = ("negative", "positive")
    samples = []
    for sentence, label in labeled2:
        label_text = str(label)
        if label_text not in valid_labels:
            print(f"  [!] WARNING: UNEXPECTED LABEL VALUE '{label_text}'")
            continue

        # Lowercase before preprocessing, matching how init2 built the
        # vocabulary and how the prediction cell prepares its input;
        # otherwise capitalised stop words slip through at training time only.
        tokens = set(pre_processing(sentence.lower()))
        features = {feature: feature in tokens for feature in list_words2}
        samples.append((features, label_text))

    if not samples:
        raise ValueError("training2 found no samples labelled 'negative' or 'positive'")

    # Replace rather than extend the global, so re-running the cell does not
    # duplicate rows (which would also leak test rows into the training part).
    dataset2 = samples
    X, y = zip(*dataset2)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    clf_pipeline = Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        ('classifier', SVC(kernel='rbf'))
    ])

    clf_pipeline.fit(X_train, y_train)
    classifier2 = clf_pipeline

    y_pred = clf_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("  MODEL TRAINING ACCURACY (Model 2): " + str(accuracy * 100) + " %")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # The context manager closes the file even if pickling fails.
    with open("model2.pickle", "wb") as file:
        pickle.dump(classifier2, file)


In [22]:
def main():
    """Prepare both datasets and load or train both classifiers.

    For each model the dataset is initialised first (``init1`` / ``init2``).
    If the model's pickle file exists in the working directory it is loaded
    into the matching global (``classifier1`` / ``classifier2``); otherwise
    the model is trained, which also writes the pickle file.
    """
    global classifier1, classifier2

    init1()
    # Clear the console; the "cls" command is Windows-specific.
    os.system("cls")
    if os.path.isfile("model1.pickle"):
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only load model files that this notebook produced itself.
        # The context manager closes the handle, which was previously leaked.
        with open("model1.pickle", "rb") as file:
            classifier1 = pickle.load(file)
        print("  [>] LOAD MODEL 1 COMPLETE...")
    else:
        print("  [>] TRAINING MODEL 1...")
        balanced_data = count_and_balance_labels()
        training1(balanced_data)
        print("  [>] TRAINING MODEL 1 COMPLETE...")

    init2()
    if os.path.isfile("model2.pickle"):
        # NOTE(review): same pickle trust caveat as for model 1.
        with open("model2.pickle", "rb") as file:
            classifier2 = pickle.load(file)
        print("  [>] LOAD MODEL 2 COMPLETE...")
    else:
        print("  [>] TRAINING MODEL 2...")
        training2()
        print("  [>] TRAINING MODEL 2 COMPLETE...")

# Run the load-or-train workflow for both models when this cell executes.
main()

  [>] TRAINING MODEL 1...
Label counts before balancing: Counter({'Not': 559, 'Buzzer': 341})
Label counts after balancing: {'Buzzer': 341, 'Not': 341}
  MODEL TRAINING ACCURACY (Model 1): 69.56521739130434 %
              precision    recall  f1-score   support

      Buzzer       0.74      0.59      0.66        34
         Not       0.67      0.80      0.73        35

    accuracy                           0.70        69
   macro avg       0.70      0.69      0.69        69
weighted avg       0.70      0.70      0.69        69

[[20 14]
 [ 7 28]]
  [>] TRAINING MODEL 1 COMPLETE...
  [>] TRAINING MODEL 2...
  MODEL TRAINING ACCURACY (Model 2): 75.55555555555556 %
              precision    recall  f1-score   support

    negative       0.87      0.71      0.78        56
    positive       0.64      0.82      0.72        34

    accuracy                           0.76        90
   macro avg       0.75      0.77      0.75        90
weighted avg       0.78      0.76      0.76        90



In [23]:
# Predict model 1
input_sentence1 = " bajingan tengik di penjara aja udh"
words1 = pre_processing(input_sentence1.lower())
# One-row batch: a single dict flagging which vocabulary words occur in the input.
features1 = [dict((feature, feature in words1) for feature in list_words1)]
predictions1 = classifier1.predict(features1)
classification1 = predictions1[0]
print(f"Classification (Model 1): {classification1}")

# Predict model 2
input_sentence2 = "menang telak satu putaran"
words2 = pre_processing(input_sentence2.lower())
# Same feature layout as above, built from the model 2 vocabulary.
features2 = [dict((feature, feature in words2) for feature in list_words2)]
predictions2 = classifier2.predict(features2)
classification2 = predictions2[0]
print(f"Classification (Model 2): {classification2}")

Classification (Model 1): Buzzer
Classification (Model 2): positive
