In [2]:
import pandas as pd
import numpy as np

import regex
import nltk
nltk.download("stopwords")
nltk.download("punkt")

from nltk.tokenize import word_tokenize # tokenisasi
from nltk.corpus import stopwords #stopwords
from nltk.stem.porter import PorterStemmer #stemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [33]:
# Load Data
data_movie = pd.read_excel("movie.xlsx")
data_movie.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
# Sentiment Value Count
data_movie['sentiment'].value_counts()

positive    501
negative    497
Name: sentiment, dtype: int64

In [5]:
# Shape of Data
data_movie.shape

(998, 2)

In [6]:
# missing value
data_movie.isna().sum()

review       0
sentiment    0
dtype: int64

CLEANING DATA

In [34]:
def cleanText(text):
    # Remove HTML tags
    text = regex.sub(r"<[^<]+?>", "", text)
    
    # Remove Special chars
    text = regex.sub(r"[^a-zA-Z0-9\s]", "", text)
    
    # LowerCase
    text = text.lower()
    
    return text

data_movie["review"] = data_movie["review"].apply(cleanText)
data_movie.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
5,probably my alltime favorite movie a story of ...,positive
6,i sure would like to see a resurrection of a u...,positive
7,this show was an amazing fresh innovative ide...,negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


StopWords Removal / Stemming

In [14]:
stopword_list = stopwords.words('english')
print(stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [47]:
# Get list of stopwords from nltk
stopword_list = stopwords.words('english')

# Tokenize and Remove StopWords
def remove_stopwords(text):
    # strip() = menghapus spasi string diawal dan akhir
    tokens = [token.strip() for token in word_tokenize(text)] #pemecahan kata
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

# Applying the function to remove stopwords
data_movie['review'] = data_movie['review'].apply(remove_stopwords)
data_movie.head(10)

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
5,probably alltime favorite movie story selfless...,positive
6,sure would like see resurrection dated seahunt...,positive
7,show amazing fresh innovative idea 70s first a...,negative
8,encouraged positive comments film looking forw...,negative
9,like original gut wrenching laughter like movi...,positive


In [48]:
# Stemmer : mengembalikan kata ke bentuk dasar
def simple_stemmer(text):
    ps = PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

# Applying the function to stemm words
data_movie['review'] = data_movie['review'].apply(simple_stemmer)
data_movie.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive
5,probabl alltim favorit movi stori selfless sac...,positive
6,sure would like see resurrect date seahunt ser...,positive
7,show amaz fresh innov idea 70 first air first ...,negative
8,encourag posit comment film look forward watch...,negative
9,like origin gut wrench laughter like movi youn...,positive


In [84]:
# x data review, y data target
X = data_movie['review']
y = data_movie['sentiment']

In [None]:
# Initializing the CountVectorizer
vect = CountVectorizer()

# Vectorizing the text
X = vect.fit_transform(X).astype(np.int8)
print('Vocabulary: ')
print(vect.vocabulary_)
print(X)

In [62]:
# Splitting training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [86]:
def classifier_testing(classification, X_train, X_test, y_train, y_test):
    # Training the Classifier
    classification.fit(X_train, y_train)

    # Getting Predictions
    y_pred = classification.predict(X_test)

    # Accuracy Score
    classification_accuracy_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score:\n", classification_accuracy_score, "\n")

    # Classification Report
    class_rep = classification_report(y_test, y_pred)
    print("Classification Report:\n", class_rep, "\n")

    # Confusion Matrix
    conf_mtx = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_mtx, "\n")

In [88]:
# Initializing NaiveBayes-MultinomialNB Classifier
MNB = MultinomialNB()
# Training / Testing
classifier_testing(MNB, X_train, X_test, y_train, y_test)

Accuracy Score:
 0.81 

Classification Report:
               precision    recall  f1-score   support

    negative       0.76      0.90      0.83       100
    positive       0.88      0.72      0.79       100

    accuracy                           0.81       200
   macro avg       0.82      0.81      0.81       200
weighted avg       0.82      0.81      0.81       200
 

Confusion Matrix:
 [[90 10]
 [28 72]] 

