# Sentiment Analysis dari Instagram comments calon Presiden 2024 sebelum dan sesudah deklrasi calon wakil presiden menggunakan Naive Bayes


## Tujuan : mengetahui perbedaan sentiment dari komentar instagram sebelum dan sesudah deklarasi calon wakil presiden


<ul>
    <li>
    Aurelius Ivan Wijaya (00000054769)
    </li>
    <li>
    Rajendra Abhinaya (00000060445)
    </li>
    <li>
    Maecyntha Irelynn Tantra (00000055038)
    </li>
    <li>
    Patricia Theodora (00000054093)
    </li>
<ul>

# Data Collection

apa sih yang sebenernya kita cari?
* web scrapping algoritm untuk data primer (ivan)
* labeling (pat)
* data sekunder (mae)
* stopword library indonesia (abhi)
* cari jurnal referensi yang sudah, sebagai literature review (all, min 4 per person)

In [2]:
import pandas as pd
import numpy as np

# import dataset
# [ Primary Dataset ]
anies_before = pd.read_csv('./Dataset/Anies/anies_before.csv')
anies_after = pd.read_csv('./Dataset/Anies/anies_after.csv')
ganjar_before = pd.read_csv('./Dataset/Ganjar/ganjar_before.csv')
ganjar_after = pd.read_csv('./Dataset/Ganjar/ganjar_after.csv')
prabowo_before = pd.read_csv('./Dataset/Prabowo/prabowo_before.csv')
prabowo_after = pd.read_csv('./Dataset/Prabowo/prabowo_after.csv')

# [ Secondary Dataset ]
instagram_cyber_comments = pd.read_csv('./Dataset/dataset_komentar_instagram_cyberbullying.csv')
tweet_tv = pd.read_csv('./Dataset/dataset_tweet_sentimen_tayangan_tv.csv')
tweet_pilkada = pd.read_csv('./Dataset/dataset_tweet_sentiment_pilkada_DKI_2017.csv')
tweet_opini_film = pd.read_csv('./Dataset/dataset_tweet_sentiment_opini_film.csv')
tweet_cellular = pd.read_csv('./Dataset/dataset_tweet_sentiment_cellular_service_provider.csv')

## Data Integration

In [3]:
# 1: positive, 0: negative 

# data integration
# instagram_cyber_comments
# instagram_cyber_comments['label'] = 1
# change label name to 'comments'
instagram_cyber_comments.rename(columns={'Instagram Comment Text': 'comments'}, inplace=True)
# mapping sentiment
instagram_cyber_comments['Sentiment'] = instagram_cyber_comments['Sentiment'].map({'positive': 1, 'negative': 0})
instagram_cyber_comments['label'] = instagram_cyber_comments['Sentiment'].astype(int)
# drop unused columns
instagram_cyber_comments.drop(columns=['Id', 'Sentiment'], inplace=True)
# display(instagram_cyber_comments.head())
#change data type to string to ensure all data type is string
instagram_cyber_comments['comments'] = instagram_cyber_comments['comments'].astype(str)

# tweet_tv
# change label name to 'comments'
tweet_tv.rename(columns={'Tweet': 'comments'}, inplace=True)
# mapping sentiment
tweet_tv['Sentiment'] = tweet_tv['Sentiment'].map({'positive': 1, 'negative': 0})
tweet_tv['label'] = tweet_tv['Sentiment'].astype(int)
tweet_tv.rename(columns={'Text Tweet': 'comments'}, inplace=True)
tweet_tv.drop(columns=['Id', 'Sentiment', "Jumlah Retweet", "Acara TV"], inplace=True)
#change data type to string to ensure all data type is string
tweet_tv['comments'] = tweet_tv['comments'].astype(str)

# tweet_pilkada
# tweet_pilkada add new column 'label'
# tweet_pilkada['label'] = tweet_pilkada['Sentiment'].map({'positive': 1, 'negative': 0}).astype(int)
tweet_pilkada['Sentiment'] = tweet_pilkada['Sentiment'].map({'positive': 1, 'negative': 0}).astype(int)
# tweet_pilkada['label'] = tweet_pilkada['Sentiment'].astype(int)
tweet_pilkada = tweet_pilkada[['Sentiment', 'Text Tweet']]
tweet_pilkada.rename(columns={'Text Tweet': 'comments'}, inplace=True)
tweet_pilkada.rename(columns={'Sentiment': 'label'}, inplace=True)
#change data type to string to ensure all data type is string
tweet_pilkada['comments'] = tweet_pilkada['comments'].astype(str)

# tweet_opini_film
# change label name to 'comments'
tweet_opini_film.rename(columns={'Tweet': 'comments'}, inplace=True)
# mapping sentiment
tweet_opini_film['Sentiment'] = tweet_opini_film['Sentiment'].map({'positive': 1, 'negative': 0})
tweet_opini_film['label'] = tweet_opini_film['Sentiment'].astype(int)
tweet_opini_film.drop(columns=['Id', 'Sentiment'], inplace=True)
tweet_opini_film.rename(columns={'Text Tweet': 'comments'}, inplace=True)
#change data type to string to ensure all data type is string
tweet_opini_film['comments'] = tweet_opini_film['comments'].astype(str)

# tweet_cellular
# change label name to 'comments'
tweet_cellular.rename(columns={'Tweet': 'comments'}, inplace=True)
# mapping sentiment
tweet_cellular['Sentiment'] = tweet_cellular['Sentiment'].map({'positive': 1, 'negative': 0})
tweet_cellular['label'] = tweet_cellular['Sentiment'].astype(int)
tweet_cellular.drop(columns=['Id', 'Sentiment'], inplace=True)
tweet_cellular.rename(columns={'Text Tweet': 'comments'}, inplace=True)
# integrate all secondary dataset
secondary_dataset = pd.concat([instagram_cyber_comments, tweet_tv, tweet_pilkada, tweet_opini_film, tweet_cellular], ignore_index=True)
#change data type to string to ensure all data type is string
secondary_dataset['comments'] = secondary_dataset['comments'].astype(str)

display(secondary_dataset.head())
# convert to csv
# secondary_dataset.to_csv('./Dataset/secondary_dataset.csv', index=False)

Unnamed: 0,comments,label
0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...,0
1,Geblek lo tata...cowo bgt dibela2in balikan......,0
2,Kmrn termewek2 skr lengket lg duhhh kok labil ...,0
3,"Intinya kalau kesel dengan ATT nya, gausah ke ...",0
4,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...",0


In [4]:
display(secondary_dataset.head())

Unnamed: 0,comments,label
0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...,0
1,Geblek lo tata...cowo bgt dibela2in balikan......,0
2,Kmrn termewek2 skr lengket lg duhhh kok labil ...,0
3,"Intinya kalau kesel dengan ATT nya, gausah ke ...",0
4,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...",0


# Pre-Proccessing

## Data Cleaning

### Handle Missing Value

In [5]:
def handleMissingValue(df):
    df = df.dropna()
    df = df.drop_duplicates()
    # df = df.reset_index(drop=True)  
    # set column name
    # pick only comments features
    df = df[['comments']] 
    # df.columns = ['column']
    return df
# prabowo_after = prabowo_after.dropna()

anies_before['comments'] = handleMissingValue(anies_before)
anies_after['comments'] = handleMissingValue(anies_after)
ganjar_before['comments'] = handleMissingValue(ganjar_before)
ganjar_after['comments'] = handleMissingValue(ganjar_after)
prabowo_before['comments'] = handleMissingValue(prabowo_before)
prabowo_after['comments'] = handleMissingValue(prabowo_after)
secondary_dataset['comments'] = handleMissingValue(secondary_dataset)

In [6]:
display(secondary_dataset.head())

secondary_dataset.info()

Unnamed: 0,comments,label
0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...,0
1,Geblek lo tata...cowo bgt dibela2in balikan......,0
2,Kmrn termewek2 skr lengket lg duhhh kok labil ...,0
3,"Intinya kalau kesel dengan ATT nya, gausah ke ...",0
4,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha...",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   comments  2191 non-null   object
 1   label     2200 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 25.9+ KB


### Case Folding

handle case folding to make sure all the words are in the same case (lowercase)

In [7]:
import string
# Handle case folding
# def case_folding(data):
#     datatemps = []
#     for i in range(0, len(data)):
#         try:
#             print(data['comments'][i])
#             commentTemp = data['comments'][i]
#             datatemps.append(commentTemp.str.lower())
#             #lower case
#             commentTemp = data
#         except KeyError as e:
#             # print(f"KeyError at index {i}: {e}")
#             pass # skip the row if there is no comment
#     datatemps = {'comments': datatemps}
#     return pd.DataFrame(datatemps)

def case_folding(data):
    datatemps = []
    for i in range(0, len(data)):
        try:
            commentTemp = data['comments'][i]
            if isinstance(commentTemp, str):  # Check if the value is a string
                datatemps.append(commentTemp.lower())
        except KeyError as e:
            # print(f"KeyError at index {i}: {e}")
            pass  # skip the row if there is no comment
    datatemps = {'comments': datatemps}
    return pd.DataFrame(datatemps)
    

anies_before['comments'] =  pd.DataFrame(case_folding(anies_before))
anies_after['comments'] =  pd.DataFrame(case_folding(anies_after))
ganjar_before['comments'] =  pd.DataFrame(case_folding(ganjar_before))
ganjar_after['comments'] =  pd.DataFrame(case_folding(ganjar_after))
prabowo_before['comments'] =  pd.DataFrame(case_folding(prabowo_before))
prabowo_after['comments'] =  pd.DataFrame(case_folding(prabowo_after))
secondary_dataset['comments'] =  pd.DataFrame(case_folding(secondary_dataset))

display(anies_after.head())

Unnamed: 0,index,comments
0,0,"tombol tolak muhaimin, gak ush takut kehilanga..."
1,0,"@alfarouqxoumar amit2 milih muhaimin, sama aja..."
2,1,@akbar_brox capres abadi
3,2,@alfarouqxoumar memangnya ada apa dengan suara...
4,3,"@alfarouqxoumar kasian ahy, ditinggal begitu aja."


### Punctuation & Number & Whitespace Removal

In [8]:
import string
import re
import pandas as pd

def remove_punct(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = str(data.iloc[i, 0])  # Convert to string
        comment = re.sub("@[^\s]+", "", comment)  # remove @user
        comment = re.sub(r'[^\w\s]', '', comment)  # Remove punctuation
        comment = comment.strip()  # Remove whitespace
        comment = re.sub(r'\s+', ' ', comment)  # Remove double spacing
        comment = comment.strip()  # Remove whitespace
        comment = re.sub(r'\s+[a-zA-Z]\s+', ' ', comment)  # Remove single characters
        comment = re.sub(r'\d+', '', comment)  # Remove numbers
        # remove tags
        comment = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", comment)
        # remove special characters and digits
        comment = re.sub("(\\d|\\W)+", " ", comment)
        
        datatemps.append(comment)
    return datatemps

# Call the functions successively
anies_before['comments'] = pd.DataFrame(remove_punct(anies_before))
anies_after['comments'] = pd.DataFrame(remove_punct(anies_after))
ganjar_before['comments'] = pd.DataFrame(remove_punct(ganjar_before))
ganjar_after['comments'] = pd.DataFrame(remove_punct(ganjar_after))
prabowo_before['comments'] = pd.DataFrame(remove_punct(prabowo_before))
prabowo_after['comments'] = pd.DataFrame(remove_punct(prabowo_after))
secondary_dataset['comments'] = pd.DataFrame(remove_punct(secondary_dataset))

In [9]:
display(anies_after.head())

Unnamed: 0,index,comments
0,0,
1,0,
2,1,
3,2,
4,3,


### Text Normalization / Noise Removal

* this one need research (ivan)
* slang word dataset that i used : https://github.com/nasalsabila/kamus-alay

* Contoh sebelum: "Para mahasiswa yang memperoleh nilai yang rendah dalam ujian tidak diizinkan untuk mengikuti ujian ulang."
* Contoh sesudah: "Mahasiswa yang memperoleh nilai rendah dalam ujian tidak diizinkan mengikuti ujian ulang."

In [10]:
import pandas as pd
indo_slang_word = pd.read_csv('./Dataset/TextNormalization/colloquial-indonesian-lexicon.csv')
indo_slang_word.head()

def replace_slang_word(doc,slang_word):
    for index in  range(0,len(doc)-1):
        index_slang = slang_word.slang==doc[index]
        formal = list(set(slang_word[index_slang].formal))
        if len(formal)==1:
            doc[index]=formal[0]
    return doc

# def text_normalization(data):
#     datatemps = []
#     for i in range(0, len(data)):
#         comment = data.iloc[i, 0]  # Access the 'comments' column in the DataFrame
#         comment = comment.split()
#         comment = replace_slang_word(comment,indo_slang_word)
#         comment = ' '.join(comment)
#         datatemps.append(comment)
#     return datatemps

def text_normalization(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = str(data.iloc[i, 0])  # Convert to string
        comment = comment.split()
        comment = replace_slang_word(comment, indo_slang_word)
        comment = ' '.join(comment)
        datatemps.append(comment)
    return datatemps

# Call the functions successively
anies_before['comments'] = pd.DataFrame(text_normalization(anies_before))
anies_after['comments'] = pd.DataFrame(text_normalization(anies_after))
ganjar_before['comments'] = pd.DataFrame(text_normalization(ganjar_before))
ganjar_after['comments'] = pd.DataFrame(text_normalization(ganjar_after))
prabowo_before['comments'] = pd.DataFrame(text_normalization(prabowo_before))
prabowo_after['comments'] = pd.DataFrame(text_normalization(prabowo_after))
secondary_dataset['comments'] = pd.DataFrame(text_normalization(secondary_dataset))

In [11]:
display(anies_after.head())

Unnamed: 0,index,comments
0,0,0
1,0,0
2,1,1
3,2,2
4,3,3


### Stopwords Removal

* Stopwords: Stopwords are common words found in many languages such as prepositions, pronouns, etc that do not add much information

Stopword examples in English: I, What, An, The, So

Stopword examples in Indonesian: Saya, Dan, Akan, Pada, Jadi

* Stopwords removal is the process of removing stopwords from the text in a dataset. This is done to help reduce the amount of words in the dataset which will make training the model faster. As stopwords do not contain any important information, their removal does not negatively impact the model that is being trained.

https://towardsdatascience.com/text-pre-processing-stop-words-removal-using-different-libraries-f20bac19929a
https://yunusmuhammad007.medium.com/basic-text-preprocessing-menggunakan-nltk-86ba3e65a1dc

In [38]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# download nltk
nltk.download('punkt')
nltk.download('stopwords')

def remove_stopwords(data):
    datatemps = []
    stop_words = set(stopwords.words('indonesian'))
    for i in range(0, len(data)):
        comment = data['comments']  # Access the 'comments' column in the DataFrame
        # print(comment)
        word_tokens = word_tokenize(comment[i])
        #filtered_sentence = [w for w in word_tokens if not w in stop_words]
        filtered_sentence = []
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
        comment = ' '.join(filtered_sentence)
        datatemps.append(comment)
    return datatemps

# Call the functions successively
anies_before['comments'] = pd.DataFrame(remove_stopwords(anies_before))
anies_after['comments'] = pd.DataFrame(remove_stopwords(anies_after))
ganjar_before['comments'] = pd.DataFrame(remove_stopwords(ganjar_before))
ganjar_after['comments'] = pd.DataFrame(remove_stopwords(ganjar_after))
prabowo_before['comments'] = pd.DataFrame(remove_stopwords(prabowo_before))
prabowo_after['comments'] = pd.DataFrame(remove_stopwords(prabowo_after))
secondary_dataset['comments'] = pd.DataFrame(remove_stopwords(secondary_dataset))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
display(anies_after.head())

Unnamed: 0,index,comments
0,0,0
1,0,0
2,1,1
3,2,2
4,3,3


### Stemming / Lemmatization

* this one need research (mae)


In [975]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = data['comments']  # Access the 'comments' column in the DataFrame
        # print(comment)
        comment = stemmer.stem(comment[i])
        datatemps.append(comment)
    return datatemps

# Call the functions successively
anies_before['comments'] = pd.DataFrame(stemming(anies_before))
anies_after['comments'] = pd.DataFrame(stemming(anies_after))
ganjar_before['comments'] = pd.DataFrame(stemming(ganjar_before))
ganjar_after['comments'] = pd.DataFrame(stemming(ganjar_after))
prabowo_before['comments'] = pd.DataFrame(stemming(prabowo_before))
prabowo_after['comments'] = pd.DataFrame(stemming(prabowo_after))
secondary_dataset['comments'] = pd.DataFrame(stemming(secondary_dataset))
display(anies_after.head())

Unnamed: 0,index,comments
0,0,0
1,0,0
2,1,1
3,2,2
4,3,3


In [978]:
# display(secondary_dataset.head())
display(prabowo_after.head())

Unnamed: 0,index,comments
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


## Tokenization

* this one need research (mae)

In [985]:
# def tokenization(data):
#     datatemps = []
#     for i in range(0, len(data)):
#         comment = data['comments']  # Access the 'comments' column in the DataFrame
#         # print(comment)
#         comment = word_tokenize(comment[i])
#         datatemps.append(comment)
#     return datatemps
def tokenization(data):
    datatemps = []
    for i in range(0, len(data)):
        comment = data['comments'][i]  # Access the 'comments' column in the DataFrame
        comment = word_tokenize(comment)
        datatemps.append(comment)
    return datatemps

# Call the functions successively
# anies_before['comments'] = pd.DataFrame(tokenization(anies_before))
# anies_after['comments'] = pd.DataFrame(tokenization(anies_after))
# ganjar_before['comments'] = pd.DataFrame(tokenization(ganjar_before))
# ganjar_after['comments'] = pd.DataFrame(tokenization(ganjar_after))
# prabowo_before['comments'] = pd.DataFrame(tokenization(prabowo_before))
# prabowo_after['comments'] = pd.DataFrame(tokenization(prabowo_after))
# secondary_dataset['comments'] = pd.DataFrame(tokenization(secondary_dataset))   

anies_before['comments'] = tokenization(anies_before)
anies_after['comments'] = tokenization(anies_after)
ganjar_before['comments'] = tokenization(ganjar_before)
ganjar_after['comments'] = tokenization(ganjar_after)
prabowo_before['comments'] = tokenization(prabowo_before)
prabowo_after['comments'] = tokenization(prabowo_after)
secondary_dataset['comments'] = tokenization(secondary_dataset)


In [25]:
display(secondary_dataset.head())

Unnamed: 0,comments,label
0,username tolol hubungan nya keguguran pakai hi...,0
1,geblek lo tatacowo banget dibelain balikanhade...,0
2,kemarin termewek lengket duh labil banget sih ...,0
3,intinya kesel att nya anaknya kasihan perkemba...,0
4,hadewwwww permpuan lgsakit jiwaknp peran utama...,0


## Synthetic Minority Oversampling Technique (SMOTE)

* this one need research (ivan)
* Smote adalah sebuah tehnik yang digunakan terhadap data yang tidak seimbang

In [None]:
# from imblearn.over_sampling import SMOTE

# smote = SMOTE()
# X_train_smote, y_train_smote = smote.fit_resample(X_train_tweets_tfidf, y_train.values)
# print(X_train_smote.shape, y_train_smote.shape)

# # SMOTE on full training data
# smote = SMOTE()
# X_smote, y_smote = smote.fit_resample(X_tweets_tfidf, y.values)
# print(X_smote.shape, y_smote.shape)

# # Class Imbalance Check
# plt.pie(pd.value_counts(y_train_smote), 
#         labels=['Label 0 (Positive)', 'Label 1 (Negative)'], 
#         autopct='%0.1f%%')
# plt.axis('equal')
# plt.show()

## K-Fold Cross Validation

* this one need research (abhi)

In [993]:
# for now we will use train test split
from sklearn.model_selection import train_test_split

# split data into training and validation set
# for secondary dataset
X_train, X_test, y_train, y_test = train_test_split(secondary_dataset['comments'], secondary_dataset['label'], test_size=0.2, random_state=42)

display(X_train.head())
display(y_train.head())

display(X_test.head())
display(y_test.head())

1656    [anies, pasrah, hasil, pilkada, dki, jakarta, ...
752     [kecewa, mata, najwa, malam, mutu, narasi, naj...
892     [surabaya, ahy, kalah, garagara, antasari, ya,...
1041               [tulus, senang, insyaallah, jalan, bu]
1179    [ahok, nista, agama, nista, orang, aku, agama,...
Name: comments, dtype: object

1656    1
752     0
892     0
1041    1
1179    0
Name: label, dtype: int32

1451    [janji, aniessandi, lho, janji, mas, mohon, ka...
1334    [allah, orang, subhanallah, dkijakarta, ahokdj...
1761    [bilang, film, jelek, daur, ulang, gagal, imdb...
1735    [kalo, nol, bikin, film, perempuan, seksi, bum...
1576    [orang, tiga, pilkadadki, cikini, menteng, htt...
Name: comments, dtype: object

1451    0
1334    1
1761    0
1735    0
1576    1
Name: label, dtype: int32

## Workload Distribution?

# Modeling

<!-- * kemungkinan Binomial Naive Bayes -->

In [995]:
# # Gaussian Naive Bayes
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import GaussianNB

# vectorizer = CountVectorizer()
# X_train_vectorized = vectorizer.fit_transform(X_train)
# X_test_vectorized = vectorizer.transform(X_test)

# # Train the model
# gnb = GaussianNB()
# # display(secondary_dataset)
# gnb.fit(X_train_vectorized,y_train)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

# Assuming X_train is a list of lists containing tokenized comments
# Convert each list of tokens into a string by joining the tokens
X_train_str = [' '.join(tokens) for tokens in X_train]

# Use CountVectorizer to vectorize the comments
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_str)

# Assuming X_test is a list of lists containing tokenized comments
# Convert each list of tokens into a string by joining the tokens
X_test_str = [' '.join(tokens) for tokens in X_test]

# Vectorize the test set
X_test_vectorized = vectorizer.transform(X_test_str)

# Train the Gaussian Naive Bayes model
gnb = GaussianNB()
gnb.fit(X_train_vectorized.toarray(), y_train)

# Make predictions
# predictions = gnb.predict(X_test_vectorized.toarray())


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [1001]:
# accuracy
from sklearn.metrics import accuracy_score
y_pred = gnb.predict(X_test_vectorized.toarray())
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.7113636363636363


In [1004]:
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# Train the model
mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)

In [1006]:
# accuracy
from sklearn.metrics import accuracy_score
y_pred = mnb.predict(X_test_vectorized.toarray())
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.7681818181818182


In [1007]:
# Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB

# Train the model
bnb = BernoulliNB()
bnb.fit(X_train_vectorized, y_train)

In [1008]:
# accuracy
from sklearn.metrics import accuracy_score
y_pred = bnb.predict(X_test_vectorized.toarray())
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.7659090909090909


# Model Evaluation

* this one need research (pat)

# Analysis the data from primary data

In [1010]:
def predict_sentiment(model, vectorizer, text):
    # Vectorize the input text
    text_vectorized = vectorizer.transform([text])
    
    # Make predictions
    prediction = model.predict(text_vectorized.toarray())
    
    return prediction[0]

# Test the model
# print(predict_sentiment(gnb, vectorizer, text))
# anies before
anies_before_count_positif = 0
anies_before_count_negatif = 0
display(anies_before.head())
# for i in range(0, len(anies_before)):
#     prediction = predict_sentiment(gnb, vectorizer, anies_before['comments'][i])
#     if prediction == 1:
#         anies_before_count_positif += 1
#     else:
#         anies_before_count_negatif += 1
# print('anies before positif: ', anies_before_count_positif)
# print('anies before negatif: ', anies_before_count_negatif)

Unnamed: 0,index,comments
0,0,[0]
1,1,[1]
2,2,[2]
3,3,[3]
4,4,[4]


## Modal Comprarison

# Conclusion