In [1]:
import pandas as pd

# Membaca file CSV
df_imdb = pd.read_csv('IMDB Dataset.csv')

# Membaca file Excel
df_indo = pd.read_excel('Indonlu_Sentiment.xlsx')

# Menampilkan kedua dataset
print("Dataset dari CSV (IMdb Review):")
print(df_imdb)

print("\nDataset dari Excel (Ulasan Produk):")
print(df_indo)


Dataset dari CSV (IMdb Review):
                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]

Dataset dari Excel (Ulasan Produk):
                                                   Tweet     Label
0      warung ini dimiliki oleh pe

In [2]:
# Import untuk preprocessing
import re
import string
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Fungsi membersihkan teks
def clean_text(text):
    text = text.lower()  # Konversi teks menjadi lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Menghapus teks dalam tanda []
    text = re.sub(r'\w*\d\w*', '', text)  # Menghapus angka
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Menghapus tanda baca
    text = re.sub(r'\n', '', text)  # Menghapus newline
    text = re.sub(r'\s+', ' ', text)  # Menghapus spasi ganda
    return text

# Fungsi menghapus stopwords
def remove_stopwords(text, language='english'):
    stop_words = set(stopwords.words(language))
    return ' '.join([word for word in text.split() if word not in stop_words])

# Load dataset
df_imdb = pd.read_csv('IMDB Dataset.csv')
df_indo = pd.read_excel('Indonlu_Sentiment.xlsx')

# Preprocessing dataset IMDB
df_imdb['cleaned_text'] = df_imdb['review'].apply(clean_text)
df_imdb['cleaned_text'] = df_imdb['cleaned_text'].apply(remove_stopwords, language='english')

# Preprocessing dataset bahasa Indonesia
df_indo['cleaned_text'] = df_indo['Tweet'].apply(clean_text)
df_indo['cleaned_text'] = df_indo['cleaned_text'].apply(remove_stopwords, language='indonesian')

# Menampilkan hasil pembersihan
print("Dataset IMDB setelah pembersihan:")
print(df_imdb[['review', 'cleaned_text']].head())

print("\nDataset Bahasa Indonesia setelah pembersihan:")
print(df_indo[['Tweet', 'cleaned_text']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Risma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset IMDB setelah pembersihan:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                        cleaned_text  
0  one reviewers mentioned watching oz episode yo...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  

Dataset Bahasa Indonesia setelah pembersihan:
                                               Tweet  \
0  warung ini dimiliki oleh pengusaha pabrik tahu...   
1  mohon ulama lurus dan k212 mmbri hujjah partai...   
2  lokasi strategis di jalan sumatera bandung . t...   
3  betapa bahagia nya diri i

In [3]:
# Import untuk vektorisasi
from sklearn.feature_extraction.text import TfidfVectorizer

# Vektorisasi dataset IMDB
vectorizer_imdb = TfidfVectorizer(max_features=500, max_df=0.95, min_df=5)  # Membuat instance baru untuk dataset IMDB
X_imdb = vectorizer_imdb.fit_transform(df_imdb['cleaned_text'])  # Vektorisasi
terms_imdb = vectorizer_imdb.get_feature_names_out()  # Mendapatkan nama fitur

# Menampilkan beberapa hasil vektorisasi IMDB
print("Vektorisasi IMDB (beberapa baris):")
print(X_imdb[0:5, :].toarray())  # Menampilkan 5 baris pertama
print("Terms IMDB:")
print(terms_imdb[:10])  # Menampilkan 10 istilah pertama

# Vektorisasi dataset Bahasa Indonesia
vectorizer_indo = TfidfVectorizer(max_features=500, max_df=0.95, min_df=5)  # Membuat instance baru untuk dataset Bahasa Indonesia
X_indo = vectorizer_indo.fit_transform(df_indo['cleaned_text'])  # Vektorisasi
terms_indo = vectorizer_indo.get_feature_names_out()  # Mendapatkan nama fitur

# Menampilkan beberapa hasil vektorisasi Bahasa Indonesia
print("\nVektorisasi Bahasa Indonesia (beberapa baris):")
print(X_indo[0:5, :].toarray())  # Menampilkan 5 baris pertama
print("Terms Bahasa Indonesia:")
print(terms_indo[:10])  # Menampilkan 10 istilah pertama


Vektorisasi IMDB (beberapa baris):
[[0.         0.         0.         ... 0.11423438 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.16255777 0.        ]
 [0.         0.         0.         ... 0.         0.         0.17315243]
 [0.         0.         0.         ... 0.         0.         0.        ]]
Terms IMDB:
['able' 'absolutely' 'across' 'act' 'acting' 'action' 'actor' 'actors'
 'actress' 'actually']

Vektorisasi Bahasa Indonesia (beberapa baris):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Terms Bahasa Indonesia:
['acara' 'agama' 'ahok' 'air' 'akses' 'ala' 'alam' 'ambil' 'an' 'anak']


In [5]:
# Definisikan y_imdb dan y_indo sebagai label dari masing-masing dataset
y_imdb = df_imdb['sentiment']  # Kolom label untuk dataset IMDB
y_indo = df_indo['Label']  # Kolom label untuk dataset Bahasa Indonesia

# Setelah itu, lakukan pembagian train-test
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(X_imdb, y_imdb, test_size=0.2, random_state=42)
X_train_indo, X_test_indo, y_train_indo, y_test_indo = train_test_split(X_indo, y_indo, test_size=0.2, random_state=42)

# Menampilkan bentuk data latih dan uji
print(f'Train Shape IMDB: {X_train_imdb.shape}, Test Shape IMDB: {X_test_imdb.shape}')
print(f'Train Shape Indonesian: {X_train_indo.shape}, Test Shape Indonesian: {X_test_indo.shape}')


Train Shape IMDB: (40000, 500), Test Shape IMDB: (10000, 500)
Train Shape Indonesian: (10208, 500), Test Shape Indonesian: (2552, 500)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Inisialisasi model Logistic Regression
lr_model = LogisticRegression(max_iter=200)

# Melatih model dengan data latih IMDB
lr_model.fit(X_train_imdb, y_train_imdb)

# Memprediksi data uji IMDB
y_pred_lr_imdb = lr_model.predict(X_test_imdb)

# Menghitung akurasi IMDB
print("Akurasi Logistic Regression (IMDB):", accuracy_score(y_test_imdb, y_pred_lr_imdb))
print(classification_report(y_test_imdb, y_pred_lr_imdb))

# Melatih model dengan data latih Indo
lr_model.fit(X_train_indo, y_train_indo)

# Memprediksi data uji Indo
y_pred_lr_indo = lr_model.predict(X_test_indo)

# Menghitung akurasi Indo
print("Akurasi Logistic Regression (Indo):", accuracy_score(y_test_indo, y_pred_lr_indo))
print(classification_report(y_test_indo, y_pred_lr_indo))


Akurasi Logistic Regression (IMDB): 0.843
              precision    recall  f1-score   support

    negative       0.85      0.83      0.84      4961
    positive       0.84      0.85      0.85      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

Akurasi Logistic Regression (Indo): 0.7942789968652038
              precision    recall  f1-score   support

    negative       0.70      0.75      0.73       849
     neutral       0.68      0.38      0.49       257
    positive       0.86      0.89      0.88      1446

    accuracy                           0.79      2552
   macro avg       0.75      0.68      0.70      2552
weighted avg       0.79      0.79      0.79      2552



In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Inisialisasi model Naive Bayes
nb_model = MultinomialNB()

# Melatih model dengan data latih IMDB
nb_model.fit(X_train_imdb, y_train_imdb)

# Memprediksi data uji IMDB
y_pred_nb_imdb = nb_model.predict(X_test_imdb)

# Menghitung akurasi IMDB
print("Akurasi Naive Bayes (IMDB):", accuracy_score(y_test_imdb, y_pred_nb_imdb))
print(classification_report(y_test_imdb, y_pred_nb_imdb))

# Melatih model dengan data latih Indo
nb_model.fit(X_train_indo, y_train_indo)

# Memprediksi data uji Indo
y_pred_nb_indo = nb_model.predict(X_test_indo)

# Menghitung akurasi Indo
print("Akurasi Naive Bayes (Indo):", accuracy_score(y_test_indo, y_pred_nb_indo))
print(classification_report(y_test_indo, y_pred_nb_indo))


Akurasi Naive Bayes (IMDB): 0.8178
              precision    recall  f1-score   support

    negative       0.82      0.80      0.81      4961
    positive       0.81      0.83      0.82      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

Akurasi Naive Bayes (Indo): 0.7539184952978056
              precision    recall  f1-score   support

    negative       0.75      0.61      0.67       849
     neutral       0.67      0.29      0.41       257
    positive       0.76      0.92      0.83      1446

    accuracy                           0.75      2552
   macro avg       0.73      0.61      0.64      2552
weighted avg       0.75      0.75      0.74      2552



In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Inisialisasi model Decision Tree
dt_model = DecisionTreeClassifier()

# Melatih model dengan data latih IMDB
dt_model.fit(X_train_imdb, y_train_imdb)

# Memprediksi data uji IMDB
y_pred_dt_imdb = dt_model.predict(X_test_imdb)

# Menghitung akurasi IMDB
print("Akurasi Decision Tree (IMDB):", accuracy_score(y_test_imdb, y_pred_dt_imdb))
print(classification_report(y_test_imdb, y_pred_dt_imdb))

# Melatih model dengan data latih Indo
dt_model.fit(X_train_indo, y_train_indo)

# Memprediksi data uji Indo
y_pred_dt_indo = dt_model.predict(X_test_indo)

# Menghitung akurasi Indo
print("Akurasi Decision Tree (Indo):", accuracy_score(y_test_indo, y_pred_dt_indo))
print(classification_report(y_test_indo, y_pred_dt_indo))


Akurasi Decision Tree (IMDB): 0.7065
              precision    recall  f1-score   support

    negative       0.70      0.71      0.71      4961
    positive       0.71      0.70      0.71      5039

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000

Akurasi Decision Tree (Indo): 0.7433385579937304
              precision    recall  f1-score   support

    negative       0.63      0.71      0.67       849
     neutral       0.53      0.44      0.48       257
    positive       0.85      0.82      0.84      1446

    accuracy                           0.74      2552
   macro avg       0.67      0.65      0.66      2552
weighted avg       0.75      0.74      0.74      2552



In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Inisialisasi model Random Forest
rf_model = RandomForestClassifier(n_estimators=100)

# Melatih model dengan data latih IMDB
rf_model.fit(X_train_imdb, y_train_imdb)

# Memprediksi data uji IMDB
y_pred_rf_imdb = rf_model.predict(X_test_imdb)

# Menghitung akurasi IMDB
print("Akurasi Random Forest (IMDB):", accuracy_score(y_test_imdb, y_pred_rf_imdb))
print(classification_report(y_test_imdb, y_pred_rf_imdb))

# Melatih model dengan data latih Indo
rf_model.fit(X_train_indo, y_train_indo)

# Memprediksi data uji Indo
y_pred_rf_indo = rf_model.predict(X_test_indo)

# Menghitung akurasi Indo
print("Akurasi Random Forest (Indo):", accuracy_score(y_test_indo, y_pred_rf_indo))
print(classification_report(y_test_indo, y_pred_rf_indo))


Akurasi Random Forest (IMDB): 0.8156
              precision    recall  f1-score   support

    negative       0.82      0.81      0.81      4961
    positive       0.82      0.82      0.82      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

Akurasi Random Forest (Indo): 0.792319749216301
              precision    recall  f1-score   support

    negative       0.71      0.72      0.71       849
     neutral       0.65      0.44      0.53       257
    positive       0.86      0.90      0.88      1446

    accuracy                           0.79      2552
   macro avg       0.74      0.69      0.71      2552
weighted avg       0.79      0.79      0.79      2552



In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Inisialisasi model SVM
svm_model = SVC(kernel='linear')

# Melatih model dengan data latih IMDB
svm_model.fit(X_train_imdb, y_train_imdb)

# Memprediksi data uji IMDB
y_pred_svm_imdb = svm_model.predict(X_test_imdb)

# Menghitung akurasi IMDB
print("Akurasi SVM (IMDB):", accuracy_score(y_test_imdb, y_pred_svm_imdb))
print(classification_report(y_test_imdb, y_pred_svm_imdb))

# Melatih model dengan data latih Indo
svm_model.fit(X_train_indo, y_train_indo)

# Memprediksi data uji Indo
y_pred_svm_indo = svm_model.predict(X_test_indo)

# Menghitung akurasi Indo
print("Akurasi SVM (Indo):", accuracy_score(y_test_indo, y_pred_svm_indo))
print(classification_report(y_test_indo, y_pred_svm_indo))


Akurasi SVM (IMDB): 0.8427
              precision    recall  f1-score   support

    negative       0.85      0.83      0.84      4961
    positive       0.84      0.85      0.85      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

Akurasi SVM (Indo): 0.79858934169279
              precision    recall  f1-score   support

    negative       0.70      0.78      0.73       849
     neutral       0.68      0.39      0.50       257
    positive       0.88      0.88      0.88      1446

    accuracy                           0.80      2552
   macro avg       0.75      0.68      0.70      2552
weighted avg       0.80      0.80      0.79      2552



In [23]:
from tabulate import tabulate

# Membuat dictionary untuk menyimpan hasil akurasi
results_dict = {
    "Model": [
        "Logistic Regression (IMDB)", "Logistic Regression (Indo)",
        "Naive Bayes (IMDB)", "Naive Bayes (Indo)",
        "Decision Tree (IMDB)", "Decision Tree (Indo)",
        "Random Forest (IMDB)", "Random Forest (Indo)",
        "SVM (IMDB)", "SVM (Indo)"
    ],
    "Accuracy": [
        accuracy_score(y_test_imdb, y_pred_lr_imdb),
        accuracy_score(y_test_indo, y_pred_lr_indo),
        accuracy_score(y_test_imdb, y_pred_nb_imdb),
        accuracy_score(y_test_indo, y_pred_nb_indo),
        accuracy_score(y_test_imdb, y_pred_dt_imdb),
        accuracy_score(y_test_indo, y_pred_dt_indo),
        accuracy_score(y_test_imdb, y_pred_rf_imdb),
        accuracy_score(y_test_indo, y_pred_rf_indo),
        accuracy_score(y_test_imdb, y_pred_svm_imdb),
        accuracy_score(y_test_indo, y_pred_svm_indo)
    ]
}

# Membuat DataFrame dari dictionary
results_df = pd.DataFrame(results_dict)

# Menampilkan hasil evaluasi dalam tabel
print("Hasil Evaluasi Semua Model:")
print(tabulate(results_df, headers='keys', tablefmt='fancy_grid', showindex=False))

Hasil Evaluasi Semua Model:
╒════════════════════════════╤════════════╕
│ Model                      │   Accuracy │
╞════════════════════════════╪════════════╡
│ Logistic Regression (IMDB) │   0.843    │
├────────────────────────────┼────────────┤
│ Logistic Regression (Indo) │   0.794279 │
├────────────────────────────┼────────────┤
│ Naive Bayes (IMDB)         │   0.8178   │
├────────────────────────────┼────────────┤
│ Naive Bayes (Indo)         │   0.753918 │
├────────────────────────────┼────────────┤
│ Decision Tree (IMDB)       │   0.7065   │
├────────────────────────────┼────────────┤
│ Decision Tree (Indo)       │   0.743339 │
├────────────────────────────┼────────────┤
│ Random Forest (IMDB)       │   0.8156   │
├────────────────────────────┼────────────┤
│ Random Forest (Indo)       │   0.79232  │
├────────────────────────────┼────────────┤
│ SVM (IMDB)                 │   0.8427   │
├────────────────────────────┼────────────┤
│ SVM (Indo)                 │   0.798589 │
╘═══

In [32]:
# Fungsi untuk memprediksi sentimen
def predict_sentiment(text, model, vectorizer):
    cleaned_text = clean_text(text)
    text_vector = vectorizer.transform([cleaned_text]) 
    prediction = model.predict(text_vector) 
    return prediction[0]

# Teks baru untuk prediksi
new_text = "Saya sangat puas dengan produk ini!"

# Melakukan prediksi dengan berbagai model untuk dataset IMDB
print("Prediksi Sentimen Dataset IMDB:")
print("Naive Bayes:", predict_sentiment(new_text, nb_model, vectorizer_imdb))
print("Decision Tree:", predict_sentiment(new_text, dt_model, vectorizer_imdb))
print("Logistic Regression:", predict_sentiment(new_text, lr_model, vectorizer_imdb))
print("Random Forest:", predict_sentiment(new_text, rf_model, vectorizer_imdb))
print("SVM:", predict_sentiment(new_text, svm_model, vectorizer_imdb))

# Melakukan prediksi dengan berbagai model untuk dataset Bahasa Indonesia
print("\nPrediksi Sentimen Dataset Bahasa Indonesia:")
print("Naive Bayes:", predict_sentiment(new_text, nb_model, vectorizer_indo))
print("Decision Tree:", predict_sentiment(new_text, dt_model, vectorizer_indo))
print("Logistic Regression:", predict_sentiment(new_text, lr_model, vectorizer_indo))
print("Random Forest:", predict_sentiment(new_text, rf_model, vectorizer_indo))
print("SVM:", predict_sentiment(new_text, svm_model, vectorizer_indo))


Prediksi Sentimen Dataset IMDB:
Naive Bayes: positive
Decision Tree: negative
Logistic Regression: negative
Random Forest: negative
SVM: negative

Prediksi Sentimen Dataset Bahasa Indonesia:
Naive Bayes: positive
Decision Tree: positive
Logistic Regression: positive
Random Forest: positive
SVM: positive
