<a href="https://colab.research.google.com/github/Damanikfanii/ecommerce-sentiment-analysis/blob/main/Sefty_Fani_Damanik_AI_Portfolio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# E-commerce Sentiment Classification Project

import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# 1. DATASET

from google.colab import files
uploaded = files.upload()

data = []
with open("train.ft.txt", "r", encoding="utf-8") as f:
    for line in f:
        label, text = line.strip().split(" ", 1)
        label = int(label.replace("__label__", "")) - 1  # label: 0 = negatif, 1 = positif
        data.append((text, label))

# Ubah ke DataFrame
import pandas as pd
df = pd.DataFrame(data, columns=["review", "label"])

df = df.sample(5000, random_state=42)

# 2. PREPROCESSING
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['clean_review'] = df['review'].apply(clean_text)

# 3A. TF-IDF + LOGISTIC REGRESSION
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['clean_review'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print("\n=== Logistic Regression ===")
print(classification_report(y_test, y_pred_lr))

# 3B. TF-IDF + SVM
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("\n=== SVM ===")
print(classification_report(y_test, y_pred_svm))


# 3C. LSTM + Word Embedding
MAX_WORDS = 1000
MAX_LEN = 20

# Tokenisasi dan padding
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_review'])
X_seq = tokenizer.texts_to_sequences(df['clean_review'])
X_pad = pad_sequences(X_seq, maxlen=MAX_LEN)

X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_pad, y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS, output_dim=32, input_length=MAX_LEN))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=2, verbose=0)

loss, acc = model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print("\n=== LSTM ===")
print(f"Test Accuracy: {acc:.2f}")

# 4. PREDIKSI MANUAL
def predict_sentiment(text):
    cleaned = clean_text(text)
    # Logistic Regression prediction
    tfidf_vec = tfidf.transform([cleaned])
    pred_lr = lr.predict(tfidf_vec)[0]

    # LSTM prediction
    seq = tokenizer.texts_to_sequences([cleaned])
    pad_seq = pad_sequences(seq, maxlen=MAX_LEN)
    pred_lstm = np.argmax(model.predict(pad_seq), axis=-1)[0]

    label_map = {0: "Negatif", 1: "Positif", 2: "Netral"}

    print("\n[Prediksi Kalimat:", text, "]")
    print("Logistic Regression:", label_map[pred_lr])
    print("LSTM:", label_map[pred_lstm])

# Contoh penggunaan
predict_sentiment("Saya sangat senang dengan layanan yang cepat dan ramah")
predict_sentiment("Pengalaman saya sangat buruk dan tidak menyenangkan")


Saving train.ft.txt to train.ft.txt

=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       508
           1       0.85      0.84      0.84       492

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000


=== SVM ===
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       508
           1       0.86      0.85      0.85       492

    accuracy                           0.85      1000
   macro avg       0.86      0.85      0.85      1000
weighted avg       0.86      0.85      0.85      1000






=== LSTM ===
Test Accuracy: 0.70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step

[Prediksi Kalimat: Saya sangat senang dengan layanan yang cepat dan ramah ]
Logistic Regression: Positif
LSTM: Negatif
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step

[Prediksi Kalimat: Pengalaman saya sangat buruk dan tidak menyenangkan ]
Logistic Regression: Positif
LSTM: Negatif
