In [11]:
import pandas as pd 
import numpy as np

# Load only the sentiment label and the stemmed-token column from the
# preprocessing output.
#
# The columns are renamed by their source name instead of by position: the
# resulting frame keeps the sheet's own column order, not the order listed in
# `usecols`, so a positional `.columns = [...]` assignment would silently swap
# label and text if the sheet layout ever changed.
reviews_data = pd.read_excel(
    "reviews_Preprocessing.xlsx",
    usecols=["Label", "reviews_tokens_stemmed"],
).rename(columns={"Label": "label", "reviews_tokens_stemmed": "reviews"})

reviews_data.head()

Unnamed: 0,label,reviews
0,1,"['kemeja', 'bagus', 'banget', 'mau', 'nang ken..."
1,0,"['jahit', 'rapi cuman', 'benang', 'jahit', 'je..."
2,0,"['sesuai', 'harga', 'tipis', 'oke', 'warna', '..."
3,1,"['gila', 'sih', 'bagus', 'worth', 'it', 'lembu..."
4,0,"['kain', 'bagus', 'halus', 'buka', 'kotor', 'y..."


In [12]:
import ast

def join_text_list(texts):
    """Turn a token list into a single space-separated string.

    Parameters
    ----------
    texts : str | list | tuple | None | float
        Usually the string representation of a Python list of tokens
        (e.g. ``"['kain', 'bagus']"``), which is parsed with
        ``ast.literal_eval``. An already-parsed sequence of tokens is accepted
        as-is. ``None`` and ``NaN`` (an empty spreadsheet cell) are treated as
        "no tokens".

    Returns
    -------
    str
        The tokens joined by single spaces; ``""`` when there are no tokens.

    Raises
    ------
    ValueError, SyntaxError
        If ``texts`` is a string that is not a valid Python literal.
    """
    if isinstance(texts, str):
        # literal_eval only evaluates literals, so it is safe on file content.
        texts = ast.literal_eval(texts)
    elif texts is None or (isinstance(texts, float) and texts != texts):
        # `x != x` is true only for NaN, which is how an empty cell is read in.
        return ""
    # str() keeps the join from failing on non-string tokens such as numbers.
    return ' '.join(str(token) for token in texts)
# Add a column that holds every review as one space-separated string.
reviews_data["reviews_join"] = reviews_data["reviews"].map(join_text_list)

# Preview the first rows of the new column.
reviews_data["reviews_join"].head()

0    kemeja bagus banget mau nang kenapa tidak dari...
1                  jahit rapi cuman benang jahit jelek
2    sesuai harga tipis oke warna abu kalau di foto...
3    gila sih bagus worth it lembut baju kirain tip...
4           kain bagus halus buka kotor ya warna putih
Name: reviews_join, dtype: object

In [13]:
# Show the first ten rows to compare the raw token-list strings in `reviews`
# with the joined text in `reviews_join`.
reviews_data.head(10)

Unnamed: 0,label,reviews,reviews_join
0,1,"['kemeja', 'bagus', 'banget', 'mau', 'nang ken...",kemeja bagus banget mau nang kenapa tidak dari...
1,0,"['jahit', 'rapi cuman', 'benang', 'jahit', 'je...",jahit rapi cuman benang jahit jelek
2,0,"['sesuai', 'harga', 'tipis', 'oke', 'warna', '...",sesuai harga tipis oke warna abu kalau di foto...
3,1,"['gila', 'sih', 'bagus', 'worth', 'it', 'lembu...",gila sih bagus worth it lembut baju kirain tip...
4,0,"['kain', 'bagus', 'halus', 'buka', 'kotor', 'y...",kain bagus halus buka kotor ya warna putih
5,1,"['bagus', 'deh', 'harga', 'segitu', 'kain', 't...",bagus deh harga segitu kain tidak tipis banget...
6,0,"['harga', 'mahal', 'bahan', 'tipis', 'banget',...",harga mahal bahan tipis banget kirim warna rea...
7,1,"['puas', 'banget', 'takut', 'jelek', 'foto', '...",puas banget takut jelek foto yearbook pas data...
8,0,"['kecewa', 'sih', 'karena', 'pesan', 'warna', ...",kecewa sih karena pesan warna fuschia datengny...
9,0,"['maaf', 'beri', 'bintang', 'tidak', 'sesuai',...",maaf beri bintang tidak sesuai gambar gambar t...


In [14]:
# Separate the target column from the joined review text used as model input.
label, text = reviews_data["label"], reviews_data["reviews_join"]

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable.
train_data, test_data, train_labels, test_labels = train_test_split(
    text,
    label,
    test_size=0.2,
    random_state=42,
)

# Class balance of the training split (label 1 vs. label 0), as counts and as fractions.
total_count = len(train_labels)
positive_count = train_labels.eq(1).sum()
negative_count = train_labels.eq(0).sum()
positive_ratio = positive_count / total_count
negative_ratio = negative_count / total_count

In [17]:
# Term-frequency (TF) vectors for the training set: the vocabulary is learned
# from the training reviews and each review is turned into raw term counts.
cvect = CountVectorizer()
TF_vector_train = cvect.fit_transform(train_data)

# L1-normalise every row so that each review's term frequencies sum to 1.
normalized_TF_vector_train = normalize(
    TF_vector_train,
    norm='l1',
    axis=1,
)

In [18]:
# Term-frequency (TF) vectors for the test set, built with the CountVectorizer
# that was already fitted on the training set (no refitting on test data).
TF_vector_test = cvect.transform(test_data)

# L1-normalise every row of the test matrix, same as for the training matrix.
normalized_TF_vector_test = normalize(
    TF_vector_test,
    norm='l1',
    axis=1,
)

In [19]:
from sklearn.feature_selection import SelectPercentile, chi2

# Share of features to keep after selection (50%).
percent = 50

# Number of features that this percentage corresponds to for the training vocabulary.
# NOTE: `k` is informational only in this cell; the selector below is driven by `percent`.
k = int(percent / 100 * normalized_TF_vector_train.shape[1])

# Score the features with the chi-square test on the training set and keep
# the top `percent` percent of them.
selector = SelectPercentile(score_func=chi2, percentile=percent)
selector.fit(normalized_TF_vector_train, train_labels)
tf_mat_train_selected = selector.transform(normalized_TF_vector_train)

# Apply the same fitted feature selection to the test set.
tf_mat_test_selected = selector.transform(normalized_TF_vector_test)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# `negative_ratio` and `positive_ratio` were already computed from
# `train_labels` right after the train/test split, so they are reused here
# instead of being recomputed by a copy-pasted block.

# Build the Multinomial Naive Bayes model with explicit class priors.
# `class_prior` is positional and follows the sorted class labels, hence
# [prior of label 0, prior of label 1].
model = MultinomialNB(class_prior=[negative_ratio, positive_ratio])

# Train the model on the feature-selected training data.
model.fit(tf_mat_train_selected, train_labels)

# Predict labels for the feature-selected test data.
predictions = model.predict(tf_mat_test_selected)

# Accuracy
accuracy = accuracy_score(test_labels, predictions)
print("Akurasi:", accuracy)

# Precision (for the positive class, label 1)
precision = precision_score(test_labels, predictions)
print("Precision:", precision)

# Recall (for the positive class, label 1)
recall = recall_score(test_labels, predictions)
print("Recall:", recall)

# F1-score (for the positive class, label 1)
f1 = f1_score(test_labels, predictions)
print("F1-score:", f1)

Akurasi: 0.7844311377245509
Precision: 0.9795918367346939
Recall: 0.5783132530120482
F1-score: 0.7272727272727273
