In [None]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
import emoji
import re
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [8]:
# Initialize stemmer and lemmatizer
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

# Use Gensim stopwords for English
stop_words_gensim = STOPWORDS

# Slang word normalization dictionary
slangwords = {
    "@": "at", "lol": "laughing out loud", "omg": "oh my god", "idk": "i don't know", 
    "btw": "by the way", "tbh": "to be honest", "smh": "shaking my head", "brb": "be right back",
    "bff": "best friends forever", "gtg": "got to go", "fyi": "for your information", "np": "no problem",
    "wtf": "what the heck", "yolo": "you only live once", "fomo": "fear of missing out", "lmao": "laughing my ass off",
    "tmi": "too much information", "srsly": "seriously", "wut": "what", "bbl": "be back later", "thx": "thanks",
    "gr8": "great", "nvm": "never mind", "cu": "see you", "g2g": "got to go", "y": "why", "lmk": "let me know",
    "wyd": "what are you doing", "gimme": "give me", "gonna": "going to", "wanna": "want to", "gotta": "got to",
    "kinda": "kind of", "asap": "as soon as possible", "bday": "birthday", "bby": "baby", "cuz": "because", 
    "dr": "doctor", "b4": "before", "u": "you", "ur": "your", "pls": "please", "ty": "thank you"
}

In [9]:
# Preprocessing functions
def cleaningText(text):
    """Remove emojis, mentions, hashtags, URLs, numbers, and non-alphanumeric characters"""
    text = str(text)
    text = emoji.replace_emoji(text, replace='')  # Remove emojis
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)     # Remove mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)     # Remove hashtags
    text = re.sub(r'RT[\s]+', '', text)            # Remove RT
    text = re.sub(r"http\S+", '', text)            # Remove URLs
    text = re.sub(r'[0-9]+', '', text)             # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)            # Remove non-alphanumeric characters
    text = text.replace('\n', ' ')
    return text.strip()

def casefoldingText(text):
    """Convert to lowercase"""
    return text.lower()

def tokenizingText(text):
    """Tokenize text"""
    return word_tokenize(text)

def filteringText(tokens):
    """Remove stopwords"""
    return [word for word in tokens if word not in stop_words_gensim]

def stemmingText(tokens):
    """Apply stemming"""
    return [stemmer.stem(word) for word in tokens]

def lemmatizingText(tokens):
    """Apply lemmatization"""
    return [lemmatizer.lemmatize(word) for word in tokens]

def fix_slangwords(text):
    """Normalize slang words"""
    words = text.split()
    fixed_words = [slangwords[word.lower()] if word.lower() in slangwords else word for word in words]
    return ' '.join(fixed_words)

def toSentence(tokens):
    """Join tokens back to sentence"""
    return ' '.join(tokens)

def preprocess(text):
    """Full preprocessing pipeline"""
    text = cleaningText(text)          # 1. Cleaning
    text = casefoldingText(text)       # 2. Case Folding
    text = fix_slangwords(text)        # 3. Slang Normalization
    tokens = tokenizingText(text)      # 4. Tokenizing
    tokens = filteringText(tokens)     # 5. Stopword removal
    tokens = stemmingText(tokens)      # 6. Stemming
    tokens = lemmatizingText(tokens)   # 7. Lemmatization
    return toSentence(tokens)          # 8. Join back into a sentence


In [10]:
# Load dataset
df = pd.read_csv('output_sentimen.csv')
df['clean_ulasan'] = df['ulasan'].apply(preprocess)  # Apply preprocessing

# Labeling based on text (manual example)
def label_sentiment_based_on_text(text):
    """Label sentiment based on text content"""
    if "good" in text or "great" in text or "love" in text:
        return 'positif'
    elif "bad" in text or "poor" in text or "hate" in text:
        return 'negatif'
    else:
        return 'netral'

df['sentimen'] = df['clean_ulasan'].apply(label_sentiment_based_on_text)

# Encode the labels (positif, negatif, netral) to numeric
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['sentimen'])

# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=15000)
X = tfidf.fit_transform(df['clean_ulasan'])

# Split data (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVM Model
model = SVC(kernel='linear')  # SVM dengan kernel linear
model.fit(X_train, y_train)

# Prediction
predictions = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, predictions)
print(f"Akurasi SVM: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, predictions, target_names=label_encoder.classes_))


Akurasi SVM: 99.25%

Classification Report:
              precision    recall  f1-score   support

     negatif       0.95      0.89      0.92       134
      netral       0.99      1.00      1.00      4010
     positif       1.00      0.98      0.99      1856

    accuracy                           0.99      6000
   macro avg       0.98      0.96      0.97      6000
weighted avg       0.99      0.99      0.99      6000



In [11]:

# Helper untuk Word2Vec
def get_w2v_embeddings(texts, model, vector_size):
    embeddings = []
    for tokens in texts:
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(vector_size))
    return np.array(embeddings)

# Token ulang untuk Word2Vec
df['tokens'] = df['clean_ulasan'].apply(tokenizingText)

# Train Word2Vec
w2v_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# ====================== SKEMA 1 ======================
# SVM + TF-IDF + 80/20
X_tfidf = tfidf.fit_transform(df['clean_ulasan'])
y_encoded = label_encoder.fit_transform(df['sentimen'])
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)
model_svm = SVC(kernel='linear')
model_svm.fit(X_train1, y_train1)
y_pred1 = model_svm.predict(X_test1)
print("\n🔹 Skema 1: SVM + TF-IDF + 80/20")
print(f"Akurasi: {accuracy_score(y_test1, y_pred1) * 100:.2f}%")
print(classification_report(y_test1, y_pred1, target_names=label_encoder.classes_))



🔹 Skema 1: SVM + TF-IDF + 80/20
Akurasi: 99.25%
              precision    recall  f1-score   support

     negatif       0.95      0.89      0.92       134
      netral       0.99      1.00      1.00      4010
     positif       1.00      0.98      0.99      1856

    accuracy                           0.99      6000
   macro avg       0.98      0.96      0.97      6000
weighted avg       0.99      0.99      0.99      6000



In [17]:
# ====================== SKEMA 2 ======================
from sklearn.neural_network import MLPClassifier

tfidf_vectorizer = TfidfVectorizer(max_features=15000)
X_tfidf_mlp = tfidf_vectorizer.fit_transform(df['clean_ulasan'])
X_train5, X_test5, y_train5, y_test5 = train_test_split(X_tfidf_mlp, y_encoded, test_size=0.2, random_state=42)

model_mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
model_mlp.fit(X_train5, y_train5)
y_pred5 = model_mlp.predict(X_test5)

print("\n🔹 Skema 2: MLP + TF-IDF + 80/20")
print(f"Akurasi: {accuracy_score(y_test5, y_pred5) * 100:.2f}%")
print(classification_report(y_test5, y_pred5, target_names=label_encoder.classes_))





🔹 Skema 2: MLP + TF-IDF + 80/20
Akurasi: 98.23%
              precision    recall  f1-score   support

     negatif       0.97      0.78      0.87       134
      netral       0.98      1.00      0.99      4010
     positif       0.99      0.96      0.98      1856

    accuracy                           0.98      6000
   macro avg       0.98      0.91      0.94      6000
weighted avg       0.98      0.98      0.98      6000



In [13]:
# ====================== SKEMA 3 ======================
# RF + TF-IDF + 70/30
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_tfidf, y_encoded, test_size=0.3, random_state=42)
model_rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf_tfidf.fit(X_train3, y_train3)
y_pred3 = model_rf_tfidf.predict(X_test3)
print("\n🔹 Skema 3: RF + TF-IDF + 70/30")
print(f"Akurasi: {accuracy_score(y_test3, y_pred3) * 100:.2f}%")
print(classification_report(y_test3, y_pred3, target_names=label_encoder.classes_))


🔹 Skema 3: RF + TF-IDF + 70/30
Akurasi: 99.03%
              precision    recall  f1-score   support

     negatif       1.00      0.75      0.86       206
      netral       0.99      1.00      0.99      6003
     positif       1.00      0.99      0.99      2791

    accuracy                           0.99      9000
   macro avg       0.99      0.91      0.95      9000
weighted avg       0.99      0.99      0.99      9000



In [18]:
!pip freeze > requirements.txt