# Sentiment Analysis dari Instagram comments calon Presiden 2024 sebelum dan sesudah deklrasi calon wakil presiden menggunakan Naive Bayes


## Tujuan : mengetahui perbedaan sentiment dari komentar instagram sebelum dan sesudah deklarasi calon wakil presiden


<ul>
    <li>
    Aurelius Ivan Wijaya (00000054769)
    </li>
    <li>
    Rajendra Abhinaya (00000060445)
    </li>
    <li>
    Maecyntha Irelynn Tantra (00000055038)
    </li>
    <li>
    Patricia theodora (00000054093)
    </li>
<ul>

# Data Collection

apa sih yang sebenernya kita cari?
* web scrapping algoritm untuk data primer (ivan)
* labeling (pat)
* data sekunder (mae)
* stopword library indonesia (abhi)
* cari jurnal referensi yang sudah, sebagai literature review (all, min 4 per person)

In [82]:
import pandas as pd
import numpy as np

# import dataset    
# [ Primary Dataset ]
anies_before = pd.DataFrame(pd.read_csv('./Dataset/Anies/anies_before.csv')['comments'])
anies_after = pd.DataFrame(pd.read_csv('./Dataset/Anies/anies_after.csv')['comments'])
ganjar_before = pd.DataFrame(pd.read_csv('./Dataset/Ganjar/ganjar_before.csv')['comments'])
ganjar_after = pd.DataFrame(pd.read_csv('./Dataset/Ganjar/ganjar_after.csv')['comments'])
prabowo_before = pd.DataFrame(pd.read_csv('./Dataset/Prabowo/prabowo_before.csv')['comments'])
prabowo_after = pd.DataFrame(pd.read_csv('./Dataset/Prabowo/prabowo_after.csv')['comments'])

# [ Secondary Dataset ]
instagram_cyber_comments = pd.read_csv('./Dataset/dataset_komentar_instagram_cyberbullying.csv')
tweet_tv = pd.read_csv('./Dataset/dataset_tweet_sentimen_tayangan_tv.csv')
tweet_pilkada = pd.read_csv('./Dataset/dataset_tweet_sentiment_pilkada_DKI_2017.csv')
tweet_opini_film = pd.read_csv('./Dataset/dataset_tweet_sentiment_opini_film.csv')
tweet_cellular = pd.read_csv('./Dataset/dataset_tweet_sentiment_cellular_service_provider.csv')
prastyo_sentiment = pd.read_csv('./Dataset/prastyo-sentiment_all.csv')
prastyo_sentiment_covid = pd.read_csv('./Dataset/prastyo-sentiment_posneg.csv')

In [58]:
display(prabowo_after.head())

Unnamed: 0,comments
0,"Semangat, Pak."
1,BISMILLAH PAK ❤️ 🇮🇩
2,Gasss pak🔥
3,Ini yang kumau.
4,"Polling yuk, silakan tekan tombol yg pilih Ani..."


## Data Integration

In [59]:
# 1: positive, 0: negative 

# data integration
# instagram_cyber_comments
# instagram_cyber_comments['label'] = 1
# change label name to 'comments'
instagram_cyber_comments.rename(columns={'Instagram Comment Text': 'comments'}, inplace=True)
# mapping sentiment
instagram_cyber_comments['Sentiment'] = instagram_cyber_comments['Sentiment'].map({'positive': 1, 'negative': 0})
instagram_cyber_comments['label'] = instagram_cyber_comments['Sentiment'].astype(int)
# drop unused columns
instagram_cyber_comments.drop(columns=['Id', 'Sentiment'], inplace=True)
# display(instagram_cyber_comments.head())
#change data type to string to ensure all data type is string
instagram_cyber_comments['comments'] = instagram_cyber_comments['comments'].astype(str)

# tweet_tv
# change label name to 'comments'
tweet_tv.rename(columns={'Tweet': 'comments'}, inplace=True)
# mapping sentiment
tweet_tv['Sentiment'] = tweet_tv['Sentiment'].map({'positive': 1, 'negative': 0})
tweet_tv['label'] = tweet_tv['Sentiment'].astype(int)
tweet_tv.rename(columns={'Text Tweet': 'comments'}, inplace=True)
tweet_tv.drop(columns=['Id', 'Sentiment', "Jumlah Retweet", "Acara TV"], inplace=True)
#change data type to string to ensure all data type is string
tweet_tv['comments'] = tweet_tv['comments'].astype(str)

# tweet_pilkada
# tweet_pilkada add new column 'label'
# tweet_pilkada['label'] = tweet_pilkada['Sentiment'].map({'positive': 1, 'negative': 0}).astype(int)
tweet_pilkada['Sentiment'] = tweet_pilkada['Sentiment'].map({'positive': 1, 'negative': 0}).astype(int)
# tweet_pilkada['label'] = tweet_pilkada['Sentiment'].astype(int)
tweet_pilkada = tweet_pilkada[['Sentiment', 'Text Tweet']]
tweet_pilkada.rename(columns={'Text Tweet': 'comments'}, inplace=True)
tweet_pilkada.rename(columns={'Sentiment': 'label'}, inplace=True)
#change data type to string to ensure all data type is string
tweet_pilkada['comments'] = tweet_pilkada['comments'].astype(str)

# tweet_opini_film
# change label name to 'comments'
tweet_opini_film.rename(columns={'Tweet': 'comments'}, inplace=True)
# mapping sentiment
tweet_opini_film['Sentiment'] = tweet_opini_film['Sentiment'].map({'positive': 1, 'negative': 0})
tweet_opini_film['label'] = tweet_opini_film['Sentiment'].astype(int)
tweet_opini_film.drop(columns=['Id', 'Sentiment'], inplace=True)
tweet_opini_film.rename(columns={'Text Tweet': 'comments'}, inplace=True)
#change data type to string to ensure all data type is string
tweet_opini_film['comments'] = tweet_opini_film['comments'].astype(str)

# tweet_cellular
# change label name to 'comments'
tweet_cellular.rename(columns={'Tweet': 'comments'}, inplace=True)
# mapping sentiment
tweet_cellular['Sentiment'] = tweet_cellular['Sentiment'].map({'positive': 1, 'negative': 0})
tweet_cellular['label'] = tweet_cellular['Sentiment'].astype(int)
tweet_cellular.drop(columns=['Id', 'Sentiment'], inplace=True)
tweet_cellular.rename(columns={'Text Tweet': 'comments'}, inplace=True)
# integrate all secondary dataset
secondary_dataset = pd.concat([
    instagram_cyber_comments,
    tweet_tv, 
    tweet_pilkada, 
    tweet_opini_film, 
    tweet_cellular
    ], ignore_index=True)
#change data type to string to ensure all data type is string
secondary_dataset['comments'] = secondary_dataset['comments'].astype(str)

In [60]:
display(secondary_dataset.head())

Unnamed: 0,comments,label
0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...,0
1,Geblek lo tata...cowo bgt dibela2in balikan......,0
2,Kmrn termewek2 skr lengket lg duhhh kok labil ...,0
3,"Intinya kalau kesel dengan ATT nya, gausah ke ...",0
4,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...",0


# Pre-Proccessing

## Data Cleaning

### Handle Missing Value

In [61]:
def handleMissingValue(df):
    df = df.dropna()
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    # df = df.reset_index(drop=True)  
    # set column name
    # pick only comments features
    df = df[['comments']] 
    # df.columns = ['column']
    return df
# prabowo_after = prabowo_after.dropna()

anies_before['comments'] = handleMissingValue(anies_before)
anies_after['comments'] = handleMissingValue(anies_after)
ganjar_before['comments'] = handleMissingValue(ganjar_before)
ganjar_after['comments'] = handleMissingValue(ganjar_after)
prabowo_before['comments'] = handleMissingValue(prabowo_before)
prabowo_after['comments'] = handleMissingValue(prabowo_after)
secondary_dataset['comments'] = handleMissingValue(secondary_dataset)

In [62]:
display(secondary_dataset.head())
secondary_dataset.info()
# display(anies_after.head())

Unnamed: 0,comments,label
0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...,0
1,Geblek lo tata...cowo bgt dibela2in balikan......,0
2,Kmrn termewek2 skr lengket lg duhhh kok labil ...,0
3,"Intinya kalau kesel dengan ATT nya, gausah ke ...",0
4,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  2191 non-null   object
 1   label     2200 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 25.9+ KB


### Case Folding

handle case folding to make sure all the words are in the same case (lowercase)

In [63]:
import string
# Handle case folding
# def case_folding(data):
#     datatemps = []
#     for i in range(0, len(data)):
#         try:
#             print(data['comments'][i])
#             commentTemp = data['comments'][i]
#             datatemps.append(commentTemp.str.lower())
#             #lower case
#             commentTemp = data
#         except KeyError as e:
#             # print(f"KeyError at index {i}: {e}")
#             pass # skip the row if there is no comment
#     datatemps = {'comments': datatemps}
#     return pd.DataFrame(datatemps)

def case_folding(data):
    datatemps = []
    for i in range(0, len(data)):
        try:
            commentTemp = data['comments'][i]
            if isinstance(commentTemp, str):  # Check if the value is a string
                datatemps.append(commentTemp.lower())
        except KeyError as e:
            # print(f"KeyError at index {i}: {e}")
            pass  # skip the row if there is no comment
    datatemps = {'comments': datatemps}
    return pd.DataFrame(datatemps)
    

# anies_before['comments'] =  pd.DataFrame(case_folding(anies_before))
# anies_after['comments'] =  pd.DataFrame(case_folding(anies_after))
# ganjar_before['comments'] =  pd.DataFrame(case_folding(ganjar_before))
# ganjar_after['comments'] =  pd.DataFrame(case_folding(ganjar_after))
# prabowo_before['comments'] =  pd.DataFrame(case_folding(prabowo_before))
# prabowo_after['comments'] =  pd.DataFrame(case_folding(prabowo_after))
secondary_dataset['comments'] =  pd.DataFrame(case_folding(secondary_dataset))

display(secondary_dataset.head())

Unnamed: 0,comments,label
0,<username> tolol!! gak ada hubungan nya kegug...,0
1,geblek lo tata...cowo bgt dibela2in balikan......,0
2,kmrn termewek2 skr lengket lg duhhh kok labil ...,0
3,"intinya kalau kesel dengan att nya, gausah ke ...",0
4,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...",0


### Punctuation & Number & Whitespace Removal

In [64]:
import string
import re
import pandas as pd

user =['agus'
       ,'ahok',
       'anies',
       'aniesbaswedan',
         'aniesbaswedan_',  
         'agussilvy',
         'silvy',
            'baswedan',
            'baswedan_',
            'ahy' 
       ]
def remove_punct(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = str(data['comments'][i])  # Convert to string
        comment = re.sub("@[^\s]+", "", comment)  # remove @user
        comment = re.sub(r'[^\w\s]', '', comment)  # Remove punctuation
        comment = comment.strip()  # Remove whitespace
        comment = re.sub(r'\s+', ' ', comment)  # Remove double spacing
        comment = comment.strip()  # Remove whitespace
        comment = re.sub(r'\s+[a-zA-Z]\s+', ' ', comment)  # Remove single characters
        comment = re.sub(r'\d+', '', comment)  # Remove numbers
        # remove tags
        comment = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", comment)
        # remove special characters and digits
        comment = re.sub("(\\d|\\W)+", " ", comment)
        # remove person name from user
        for u in user:
            comment = comment.replace(u, '')
        datatemps.append(comment)
    return datatemps

# Call the functions successively
# anies_before['comments'] = pd.DataFrame(remove_punct(anies_before))
# anies_after['comments'] = pd.DataFrame(remove_punct(anies_after))
# ganjar_before['comments'] = pd.DataFrame(remove_punct(ganjar_before))
# ganjar_after['comments'] = pd.DataFrame(remove_punct(ganjar_after))
# prabowo_before['comments'] = pd.DataFrame(remove_punct(prabowo_before))
# prabowo_after['comments'] = pd.DataFrame(remove_punct(prabowo_after))
secondary_dataset['comments'] = pd.DataFrame(remove_punct(secondary_dataset))

#save to csv
secondary_dataset.to_csv('./clean.csv', index=False)

In [65]:
display(secondary_dataset.head())

Unnamed: 0,comments,label
0,username tolol gak ada hubungan nya keguguran ...,0
1,geblek lo tatacowo bgt dibelain balikanhadewwn...,0
2,kmrn termewek skr lengket lg duhhh kok labil b...,0
3,intinya kalau kesel dengan att nya gausah ke a...,0
4,hadewwwww permpuan itu lgsakit jiwaknp harus d...,0


### Text Normalization / Noise Removal

* this one need research (ivan)
* slang word dataset that i used : https://github.com/nasalsabila/kamus-alay

* Contoh sebelum: "Para mahasiswa yang memperoleh nilai yang rendah dalam ujian tidak diizinkan untuk mengikuti ujian ulang."
* Contoh sesudah: "Mahasiswa yang memperoleh nilai rendah dalam ujian tidak diizinkan mengikuti ujian ulang."

In [66]:
import pandas as pd
indo_slang_word = pd.read_csv('./Dataset/TextNormalization/colloquial-indonesian-lexicon.csv')
indo_slang_word.head()

def replace_slang_word(doc,slang_word):
    for index in  range(0,len(doc)-1):
        index_slang = slang_word.slang==doc[index]
        formal = list(set(slang_word[index_slang].formal))
        if len(formal)==1:
            doc[index]=formal[0]
    return doc

# def text_normalization(data):
#     datatemps = []
#     for i in range(0, len(data)):
#         comment = data.iloc[i, 0]  # Access the 'comments' column in the DataFrame
#         comment = comment.split()
#         comment = replace_slang_word(comment,indo_slang_word)
#         comment = ' '.join(comment)
#         datatemps.append(comment)
#     return datatemps

def text_normalization(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = str(data['comments'][i])  # Convert to string
        comment = comment.split()
        comment = replace_slang_word(comment, indo_slang_word)
        comment = ' '.join(comment)
        datatemps.append(comment)
    return datatemps

# Call the functions successively
# anies_before['comments'] = pd.DataFrame(text_normalization(anies_before))
# anies_after['comments'] = pd.DataFrame(text_normalization(anies_after))
# ganjar_before['comments'] = pd.DataFrame(text_normalization(ganjar_before))
# ganjar_after['comments'] = pd.DataFrame(text_normalization(ganjar_after))
# prabowo_before['comments'] = pd.DataFrame(text_normalization(prabowo_before))
# prabowo_after['comments'] = pd.DataFrame(text_normalization(prabowo_after))
secondary_dataset['comments'] = pd.DataFrame(text_normalization(secondary_dataset))

In [67]:
display(secondary_dataset.head())

Unnamed: 0,comments,label
0,username tolol enggak ada hubungan nya kegugur...,0
1,geblek lo tatacowo banget dibelain balikanhade...,0
2,kemarin termewek sekarang lengket lagi duh kok...,0
3,intinya kalau kesel dengan att nya enggak usah...,0
4,hadewwwww permpuan itu lgsakit jiwaknp harus d...,0


### Stopwords Removal

* this one need research (abhi)

In [68]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# download nltk
nltk.download('punkt')

def remove_stopwords(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = data['comments']  # Access the 'comments' column in the DataFrame
        # print(comment)
        stop_words = set(stopwords.words('indonesian'))
        word_tokens = word_tokenize(comment[i])
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        filtered_sentence = []
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
        comment = ' '.join(filtered_sentence)
        datatemps.append(comment)
    return datatemps

# Call the functions successively
# anies_before['comments'] = pd.DataFrame(remove_stopwords(anies_before))
# anies_after['comments'] = pd.DataFrame(remove_stopwords(anies_after))
# ganjar_before['comments'] = pd.DataFrame(remove_stopwords(ganjar_before))
# ganjar_after['comments'] = pd.DataFrame(remove_stopwords(ganjar_after))
# prabowo_before['comments'] = pd.DataFrame(remove_stopwords(prabowo_before))
# prabowo_after['comments'] = pd.DataFrame(remove_stopwords(prabowo_after))
secondary_dataset['comments'] = pd.DataFrame(remove_stopwords(secondary_dataset))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aurel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [69]:
display(secondary_dataset.head())

Unnamed: 0,comments,label
0,username tolol hubungan nya keguguran pakai hi...,0
1,geblek lo tatacowo banget dibelain balikanhade...,0
2,kemarin termewek lengket duh labil banget sih ...,0
3,intinya kesel att nya anaknya kasihan perkemba...,0
4,hadewwwww permpuan lgsakit jiwaknp peran utama...,0


### Stemming / Lemmatization

* this one need research (mae)


In [70]:
# # import Sastrawi package
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# # create stemmer
# factory = StemmerFactory()
# stemmer = factory.create_stemmer()

# def stemming(data):
#     datatemps = []
#     for i in range(0, len(data)):
#         comment = data['comments']  # Access the 'comments' column in the DataFrame
#         # print(comment)
#         comment = stemmer.stem(comment[i])
#         datatemps.append(comment)
#     return datatemps

# # Call the functions successively
# secondary_dataset['comments'] = pd.DataFrame(stemming(secondary_dataset))
# display(anies_after.head())


from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Define the stemming function
def stem_text(text):
    if isinstance(text, str):
        return stemmer.stem(text)
    else:
        return text

# Convert 'comments' column to string and apply the stemming function
secondary_dataset['comments'] = secondary_dataset['comments'].astype(str).apply(stem_text)

# Display the updated DataFrame
display(secondary_dataset.head())


Unnamed: 0,comments,label
0,username tolol hubung nya gugur pakai hijab sy...,0
1,geblek lo tatacowo banget bain balikanhadewwnt...,0
2,kemarin mewek lengket duh labil banget sih mba...,0
3,inti kesel att nya anak kasihan kembang psikis...,0
4,hadewwwww permpuan lgsakit jiwaknp peran utama...,0


In [71]:
# display(secondary_dataset.head())
display(secondary_dataset.head())

Unnamed: 0,comments,label
0,username tolol hubung nya gugur pakai hijab sy...,0
1,geblek lo tatacowo banget bain balikanhadewwnt...,0
2,kemarin mewek lengket duh labil banget sih mba...,0
3,inti kesel att nya anak kasihan kembang psikis...,0
4,hadewwwww permpuan lgsakit jiwaknp peran utama...,0


## Tokenization

* this one need research (mae)

In [72]:
# def tokenization(data):
#     datatemps = []
#     for i in range(0, len(data)):
#         comment = data['comments']  # Access the 'comments' column in the DataFrame
#         # print(comment)
#         comment = word_tokenize(comment[i])
#         datatemps.append(comment)
#     return datatemps
def tokenization(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = data['comments'][i]  # Access the 'comments' column in the DataFrame
        comment = word_tokenize(comment)
        datatemps.append(comment)
    return datatemps

# # Call the functions successively
# # anies_before['comments'] = pd.DataFrame(tokenization(anies_before))
# # anies_after['comments'] = pd.DataFrame(tokenization(anies_after))
# # ganjar_before['comments'] = pd.DataFrame(tokenization(ganjar_before))
# # ganjar_after['comments'] = pd.DataFrame(tokenization(ganjar_after))
# # prabowo_before['comments'] = pd.DataFrame(tokenization(prabowo_before))
# # prabowo_after['comments'] = pd.DataFrame(tokenization(prabowo_after))
# # secondary_dataset['comments'] = pd.DataFrame(tokenization(secondary_dataset))   

# anies_before['comments'] = tokenization(anies_before)
# anies_after['comments'] = tokenization(anies_after)
# ganjar_before['comments'] = tokenization(ganjar_before)
# ganjar_after['comments'] = tokenization(ganjar_after)
# prabowo_before['comments'] = tokenization(prabowo_before)
# prabowo_after['comments'] = tokenization(prabowo_after)
# secondary_dataset['comments'] = tokenization(secondary_dataset)

from sklearn.feature_extraction.text import CountVectorizer

# # CountVectorizer
vectorizer = CountVectorizer()
secondary_dataset_vectorizer = tokenization(secondary_dataset)
# print(vectorizer.get_feature_names())
# # display(X_train.shape)
# print(secondary_dataset['comments'])
# display(secondary_dataset)
display(secondary_dataset_vectorizer)
secondary_dataset['comments'] = secondary_dataset_vectorizer

[['username',
  'tolol',
  'hubung',
  'nya',
  'gugur',
  'pakai',
  'hijab',
  'syar',
  'lo',
  'bilang',
  'bayi',
  'nya',
  'panas',
  'dalem',
  'hubung',
  'nya',
  'woyyyy',
  'otak',
  'jempol',
  'lo',
  'singkron',
  'sih',
  'ya',
  'tulis',
  'komentar'],
 ['geblek',
  'lo',
  'tatacowo',
  'banget',
  'bain',
  'balikanhadewwntar',
  'tinggal',
  'salah',
  'tuh',
  'cowopadahal',
  'kitenya',
  'oon'],
 ['kemarin',
  'mewek',
  'lengket',
  'duh',
  'labil',
  'banget',
  'sih',
  'mbak',
  'kaya',
  'abege',
  'kemarin',
  'cari',
  'sensasi',
  'biar',
  'top',
  'markotoppp',
  'ertong',
  'kualitas'],
 ['inti',
  'kesel',
  'att',
  'nya',
  'anak',
  'kasihan',
  'kembang',
  'psikis',
  'anak',
  'depan',
  'orang',
  'tolol',
  'anda',
  'anak',
  'anak',
  'katai',
  'orang',
  'benci',
  'asa',
  'benci',
  'tau',
  'batesnya',
  'nama',
  'manusia',
  'gaakan',
  'suka',
  'haters'],
 ['hadewwwww',
  'permpuan',
  'lgsakit',
  'jiwaknp',
  'peran',
  'utama',


In [73]:
display(secondary_dataset.head())


Unnamed: 0,comments,label
0,"[username, tolol, hubung, nya, gugur, pakai, h...",0
1,"[geblek, lo, tatacowo, banget, bain, balikanha...",0
2,"[kemarin, mewek, lengket, duh, labil, banget, ...",0
3,"[inti, kesel, att, nya, anak, kasihan, kembang...",0
4,"[hadewwwww, permpuan, lgsakit, jiwaknp, peran,...",0


## Synthetic Minority Oversampling Technique (SMOTE)

* this one need research (ivan)
* Smote adalah sebuah tehnik yang digunakan terhadap data yang tidak seimbang

In [74]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train_tweets_tfidf, y_train.values)
# print(X_train_smote.shape, y_train_smote.shape)

# # SMOTE on full training data
# smote = SMOTE()
# X_smote, y_smote = smote.fit_resample(X_tweets_tfidf, y.values)
# print(X_smote.shape, y_smote.shape)

# # Class Imbalance Check
# plt.pie(pd.value_counts(y_train_smote), 
#         labels=['Label 0 (Positive)', 'Label 1 (Negative)'], 
#         autopct='%0.1f%%')
# plt.axis('equal')
# plt.show()




## K-Fold Cross Validation

* this one need research (abhi)

In [75]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# naive bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Assuming 'comments' is the text data in your dataset
X = secondary_dataset['comments']
y = secondary_dataset['label']

# Join the list of strings into a single string for each document
X = [' '.join(comment) for comment in X]

# Use CountVectorizer to convert text data to a format suitable for RandomForestClassifier
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X).toarray()  # Convert to array

# Define 10-fold cross-validation test harness
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Instantiate your classifier
classifier = MultinomialNB(alpha=0.5)

# Use cross_val_score to perform K-Fold Cross Validation
cv_results = cross_val_score(classifier, X, y, cv=kfold)

# Print the results for each fold
best = 0
for i, accuracy in enumerate(cv_results):
    if accuracy > best:
        best = i
        print(f"Fold {i + 1}: Accuracy = {accuracy} <== BEST RESULT")
    else:
        pass
    print(f"Fold {i + 1}: Accuracy = {accuracy}")
# Print the mean and standard deviation of the cross-validation results
print(f"\nMean Accuracy: {cv_results.mean()}")
print(f"Standard Deviation: {cv_results.std()}")

# fold 6
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X, y)
# print(classifier.predict(vectorizer.transform(['saya suka anies'])))

Fold 1: Accuracy = 0.7909090909090909 <== BEST RESULT
Fold 1: Accuracy = 0.7909090909090909
Fold 2: Accuracy = 0.7681818181818182 <== BEST RESULT
Fold 2: Accuracy = 0.7681818181818182
Fold 3: Accuracy = 0.7727272727272727
Fold 4: Accuracy = 0.7909090909090909
Fold 5: Accuracy = 0.7772727272727272
Fold 6: Accuracy = 0.7863636363636364
Fold 7: Accuracy = 0.7454545454545455
Fold 8: Accuracy = 0.7363636363636363
Fold 9: Accuracy = 0.7363636363636363
Fold 10: Accuracy = 0.740909090909091

Mean Accuracy: 0.7645454545454545
Standard Deviation: 0.021493800759157973


In [81]:
def prediction(data):
    tokenized = []
    for i in range(0, len(data)):
        tokenized.extend(word_tokenize(data[i]))

    # Transform using the same vectorizer used during training
    text = vectorizer.transform([" ".join(tokenized)]).toarray()

    return classifier.predict(text)

# Call the function successively
text = ['dasar tolol anies']
pred = prediction(text)

print(pred)
# # predict ganjar
# # ganjar_after['comments'] = tokenization(ganjar_after)
# ganjar_after['comments'] = ganjar_after['comments'].astype(str).apply(stem_text)
# ganjar_after['comments'] = remove_stopwords(ganjar_after)
# ganjar_after['comments'] = text_normalization(ganjar_after)

[1]


In [77]:
# display(ganjar_after.head())
# # ganjar_after['comments'] = tokenization(ganjar_after)
# negative = 0
# positive = 0
# for i in range(0, len(ganjar_after)):
#     pred = prediction(ganjar_after['comments'][i])
#     print(pred, ganjar_after['comments'][i])
#     if pred == 0:
#         negative += 1
#     else:
#         positive += 1
# print(negative, positive)


Unnamed: 0,comments
0,pokok optimis berangkat banteng jatuh tuju ang...
1,pilih sulit amin gama klik 2x dukung ganjar ma...
2,kereennnn
3,pasang ganjar-mahfud md milik rekam jejak kuat...
4,gasss


[1] pokok optimis berangkat banteng jatuh tuju angkat tangan
[1] pilih sulit amin gama klik 2x dukung ganjar mahfud
[1] kereennnn
[1] pasang ganjar-mahfud md milik rekam jejak kuat pimpin kelola anggap pasang stabil politik ekonomi jaga aman tertib negeri
[1] gasss
[1] yuhu
[1] jawa indonesia orang jawa
[1] adem nih balut pasang doa ganjar mahfud
[1] 
[1] bismillah alhamdulillah
[1] merah - hijau
[1] duet mati musuh gaaasssspooolllll ganjar - mahfud md
[1] bismillah pilih indonesia maju
[1] wkwkwkkwkwkwkw kalo benaran ganjar mahfud fix milu putar iklim investasi sudah bisa tebak arah inshallah positif ganfud gassssss
[1] mantap
[1] 
[1] top global
[1] 
[1] aju cuti tanggal 13-15 feb 2024 nyoblos ganjar-mahfud
[1] alhamdulillah bahagia banget ya allah ekspektasi tegak lurus sesuai realita gass terus dukung penuh ganjar - mahfud
[1] duet 100 persen pilih
[1] sih yes
[1] mohon maaf pilih
[1] warga jateng dukung ganjar
[1] pdip ketar tir mak banteng group
[1] bismillah nggih menang
[1] ami

## Workload Distribution?

# Modeling

<!-- * kemungkinan Binomial Naive Bayes -->

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# Assuming X_train is a list of strings (or can be converted to strings)
text_data = [str(item) for item in X_train]

# Create a TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2)  # Adjust min_df as needed

# Vectorize the training set
X_train_vectorized = vectorizer.fit_transform(text_data)

# Assuming X_test is a list of strings (or can be converted to strings)
X_test = [str(item) for item in X_test]

# Train the Gaussian Naive Bayes model
gnb = GaussianNB()
gnb.fit(X_train_vectorized.toarray(), y_train)  # Convert sparse matrix to dense array

# Vectorize the test set
X_test_vectorized = vectorizer.transform(X_test)

# Make predictions on the test set
predictions = gnb.predict(X_test_vectorized.toarray())  # Convert sparse matrix to dense array

# Continue with evaluation as needed


NameError: name 'X_train' is not defined

In [None]:
# accuracy
from sklearn.metrics import accuracy_score
X_test_vectorized = vectorizer.transform(X_test)
y_pred = gnb.predict(X_test_vectorized.toarray())
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.6622222222222223


In [None]:
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# Train the model
mnb = MultinomialNB(alpha=0.8, class_prior=None, fit_prior=True)
mnb.fit(X_train_vectorized, y_train)

In [None]:
# accuracy
from sklearn.metrics import accuracy_score
y_pred = mnb.predict(X_test_vectorized.toarray())
print('Accuracy: ', accuracy_score(y_test, y_pred))

# 0.7681818181818182

Accuracy:  0.7466666666666667


In [None]:
# Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB

# Train the model
bnb = BernoulliNB(alpha=1, class_prior=None, fit_prior=True)
bnb.fit(X_train_vectorized, y_train)

In [None]:
# accuracy
from sklearn.metrics import accuracy_score
y_pred = bnb.predict(X_test_vectorized.toarray())
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.7155555555555555


### Random Forest Classifier

In [None]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# Train the model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train_vectorized, y_train)

# accuracy
from sklearn.metrics import accuracy_score
y_pred = rfc.predict(X_test_vectorized.toarray())

print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.6933333333333334


In [None]:
# SVM Classifier
from sklearn.svm import SVC

# Train the model
svm = SVC()

svm.fit(X_train_vectorized, y_train)

# accuracy

from sklearn.metrics import accuracy_score
y_pred = svm.predict(X_test_vectorized.toarray())
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.7066666666666667


# Model Evaluation

* this one need research (pat)

# Analysis the data from primary data

In [None]:
def predict_sentiment(model, vectorizer, text):
    # Vectorize the input text
    text_vectorized = vectorizer.transform([text])
    # Make predictions
    prediction = model.predict(text_vectorized.toarray())
    
    return prediction[0]

# Test the model
# print(predict_sentiment(gnb, vectorizer, text))
# anies before

def predict_batch(data):
    positive_count = 0
    negative_count = 0
    for i in range(0, len(data)):
        prediction = predict_sentiment(gnb, vectorizer, data['comments'][i])
        if prediction == 1:
            positive_count += 1
        else:
            negative_count += 1
        # print('sentiment', prediction, data['comments'][i])
    print('komentar positive: ', positive_count)
    print('komentar negative: ', negative_count)


predict_batch(ganjar_before)

ValueError: np.nan is an invalid document, expected byte or unicode string.

## Modal Comprarison

# Conclusion