In [20]:
import pandas as pd

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

In [9]:
dataset = pd.read_csv('./dataset_sentiment.csv')
dataset.head(2)

Unnamed: 0,content,score,sentiment_score,sentiment
0,aplikasi bagus,5,-16,negative
1,beli token listrik udah sukses,3,0,neutral


In [13]:
tfidf_vectorizer = TfidfVectorizer()
bow_vectorizer = CountVectorizer()

In [14]:
tfidf_matrix = tfidf_vectorizer.fit_transform(dataset.content)
bow_matrix = bow_vectorizer.fit_transform(dataset.content)

In [24]:
tfidf_vectorizer.get_feature_names_out()

array(['aaa', 'aagar', 'aamiin', ..., 'zi', 'zonk', 'zozz'], dtype=object)

In [38]:
ranforest = RandomForestClassifier()
logistic = LogisticRegression()
tree = DecisionTreeClassifier(criterion='entropy')

In [39]:
tfidf_features = pd.DataFrame(tfidf_matrix.toarray(),
                            columns=tfidf_vectorizer.get_feature_names_out())
bow_features = pd.DataFrame(bow_matrix.toarray(),
                            columns=bow_vectorizer.get_feature_names_out())

In [40]:
tfidf_x_train, tfidf_x_test, tfidf_y_train, tfidf_y_test = train_test_split(tfidf_features, dataset.sentiment,
                                                                            test_size=.15, random_state=1915)
bow_x_train, bow_x_test, bow_y_train, bow_y_test = train_test_split(bow_features, dataset.sentiment,
                                                                    test_size=.15, random_state=1915)

In [41]:
ranforest.fit(tfidf_x_train, tfidf_y_train)
print(f'RandomForest with TFIDF test: {accuracy_score(tfidf_y_test, ranforest.predict(tfidf_x_test))}',
      f'RandomForest with TFIDF train: {accuracy_score(tfidf_y_train, ranforest.predict(tfidf_x_train))}',
      sep='\n')
ranforest.fit(bow_x_train, bow_y_train)
print(f'RandomForest with BOW test: {accuracy_score(bow_y_test, ranforest.predict(bow_x_test))}',
      f'RandomForest with BOW train: {accuracy_score(bow_y_train, ranforest.predict(bow_x_train))}',
      sep='\n', end='\n\n')

logistic.fit(tfidf_x_train, tfidf_y_train)
print(f'LogisticRegression with TFIDF test: {accuracy_score(tfidf_y_test, logistic.predict(tfidf_x_test))}',
      f'LogisticRegression with TFIDF train: {accuracy_score(tfidf_y_train, logistic.predict(tfidf_x_train))}',
      sep='\n')
logistic.fit(bow_x_train, bow_y_train)
print(f'LogisticRegression with BOW test: {accuracy_score(bow_y_test, logistic.predict(bow_x_test))}',
      f'LogisticRegression with BOW train: {accuracy_score(bow_y_train, logistic.predict(bow_x_train))}',
      sep='\n', end='\n\n')

tree.fit(bow_x_train, bow_y_train)
print(f'DecisionTree with BOW test: {accuracy_score(bow_y_test, tree.predict(bow_x_test))}',
      f'DecisionTree with BOW train: {accuracy_score(bow_y_train, tree.predict(bow_x_train))}',
      sep='\n', end='\n\n')

RandomForest with TFIDF test: 0.8853556485355648
RandomForest with TFIDF train: 1.0
RandomForest with BOW test: 0.8803347280334728
RandomForest with BOW train: 1.0

LogisticRegression with TFIDF test: 0.8870292887029289
LogisticRegression with TFIDF train: 0.9518250332495937
LogisticRegression with BOW test: 0.8945606694560669
LogisticRegression with BOW train: 0.9683759420718191

DecisionTree with BOW test: 0.8661087866108786
DecisionTree with BOW train: 1.0



In [52]:
test_text = "aplikasi keren banget"
test_text = tfidf_vectorizer.transform([test_text])
print(ranforest.predict(test_text))

['positive']


