# Analisis Sentimen Komentar YouTube
Template Notebook untuk submission analisis sentimen

## 1. Import Libraries

In [1]:
!pip install numpy==1.26.0 scipy==1.11.2 gensim==4.3.1 scikit-learn==1.3.0 --quiet

from gensim.models import Word2Vec
import pandas as pd
import torch
import numpy as np
import re
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

## 2. Load Dataset

In [2]:
# Ganti path sesuai lokasi file
df = pd.read_csv('youtube_comments.csv')
df.head()

Unnamed: 0,comment_id,author_name,author_channel_url,comment_text,like_count,published_at,updated_at
0,UgyzYuIJOPhE-NwjzDR4AaABAg,@miftahudin4769,http://www.youtube.com/@miftahudin4769,Haduh maaf bg tlt 6thn,0,2025-04-13T19:21:53Z,2025-04-13T19:21:53Z
1,UgzhQD2OX9x09du7TGx4AaABAg,@AlfinNufus-z3k,http://www.youtube.com/@AlfinNufus-z3k,23 juta 😮 6 tahun kemudian jadi 2 juta😂,0,2025-04-10T02:33:13Z,2025-04-10T02:33:13Z
2,UgyWOQ9L7Op1wlQseKR4AaABAg,@AlfinNufus-z3k,http://www.youtube.com/@AlfinNufus-z3k,Mana nih yang 2025😮,0,2025-04-10T02:30:02Z,2025-04-10T02:30:02Z
3,Ugyi5Onmf0ktu0QXKQZ4AaABAg,@Nurma-gs5dl,http://www.youtube.com/@Nurma-gs5dl,Lah Sekarang Kalo Gak Salah Yang Seken Paling ...,0,2025-04-04T10:52:45Z,2025-04-04T10:52:45Z
4,UgxSL6JOTy9kyJAmGz54AaABAg,@NanaUraa,http://www.youtube.com/@NanaUraa,gw terakhir,0,2025-03-27T20:50:26Z,2025-03-27T20:50:26Z


## 3. Preprocessing
- Lowercase
- Hapus URL, angka, tanda baca, emoji
- Stopword removal

In [3]:
basic_stopwords = {...}  # definisikan stopword list Bahasa Indonesia

def simple_preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\x00-\x7F]", "", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in basic_stopwords]
    return ' '.join(tokens)

df['clean_text'] = df['comment_text'].astype(str).apply(simple_preprocess)
df[['comment_text', 'clean_text']].head()

Unnamed: 0,comment_text,clean_text
0,Haduh maaf bg tlt 6thn,haduh maaf bg tlt thn
1,23 juta 😮 6 tahun kemudian jadi 2 juta😂,juta tahun kemudian jadi juta
2,Mana nih yang 2025😮,mana nih yang
3,Lah Sekarang Kalo Gak Salah Yang Seken Paling ...,lah sekarang kalo gak salah yang seken paling ...
4,gw terakhir,gw terakhir


## 4. Labeling
- Kelas: positif, netral, negatif
- Metode: keyword-based

In [4]:
MODEL_ID = "w11wo/indonesian-roberta-base-sentiment-classifier"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model     = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
model.eval()

def dl_labeling(text):
    # 1. Jika teks kosong atau hanya whitespace, langsung netral
    if not isinstance(text, str) or text.strip() == "":
        return "netral"

    # 2. Tokenisasi
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    # Pastikan LongTensor
    inputs["input_ids"] = inputs["input_ids"].long()

    # 3. Jika setelah tokenisasi seq_len == 0, netral
    if inputs["input_ids"].size(1) == 0:
        return "netral"

    # 4. Inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits  = outputs.logits

    # 5. Ambil prediksi
    pred_id  = torch.argmax(logits, dim=1).item()
    hf_label = model.config.id2label[pred_id].upper()

    # 6. Map ke Bahasa Indonesia
    label_map = {
        "POSITIVE": "positif",
        "NEGATIVE": "negatif",
        "NEUTRAL" : "netral"
    }
    return label_map.get(hf_label, "netral")

# Terapkan
df["label"] = df["clean_text"].apply(dl_labeling)
print(df["label"].value_counts())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


label
negatif    6114
netral     4181
positif    3245
Name: count, dtype: int64


## 5. Feature Extraction (TF-IDF)

In [5]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_text'])
y = df['label']

## 6. Train-Test Split (80/20)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape)

(10832, 5000) (2708, 5000)


## 7. Evaluasi Tiap Skema
Menampilkan akurasi dan classification report

In [14]:
# Definisikan ulang X dan y
X = df['clean_text']  # atau kolom teks asli kamu
y = df['label']

# Definisikan skema
schemes = [
    {
        'name': 'SVM + TF-IDF (80/20)',
        'model': SVC(),
        'feature': 'tfidf',
        'test_size': 0.2
    },
    {
        'name': 'RF + Word2Vec (80/20)',
        'model': RandomForestClassifier(),
        'feature': 'w2v',
        'test_size': 0.2
    },
    {
        'name': 'RF + TF-IDF (70/30)',
        'model': RandomForestClassifier(),
        'feature': 'tfidf',
        'test_size': 0.3
    }
]

results = []

for scheme in schemes:
    # split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=scheme['test_size'], random_state=42, stratify=y
    )

    # ekstraksi fitur
    if scheme['feature'] == 'tfidf':
        vec = TfidfVectorizer(max_features=5000)
        X_train_vec = vec.fit_transform(X_train)
        X_test_vec  = vec.transform(X_test)

    elif scheme['feature'] == 'w2v':
        # latih Word2Vec pada korpus train
        tokenized = [doc.split() for doc in X_train]
        w2v_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=2, workers=4)
        # fungsi rata-rata vektor per dokumen
        def doc2vec(doc):
            words = doc.split()
            vecs = [w2v_model.wv[w] for w in words if w in w2v_model.wv]
            return np.mean(vecs, axis=0) if vecs else np.zeros(w2v_model.vector_size)
        # transform
        X_train_vec = np.vstack([doc2vec(d) for d in X_train])
        X_test_vec  = np.vstack([doc2vec(d) for d in X_test])

    # training
    clf = scheme['model']
    clf.fit(X_train_vec, y_train)

    # evaluasi
    train_acc = accuracy_score(y_train, clf.predict(X_train_vec))
    test_acc  = accuracy_score(y_test,  clf.predict(X_test_vec))

    results.append({
      'Skema': scheme['name'],
      'Train Acc.': round(train_acc * 100, 2),
      'Test Acc.': round(test_acc * 100, 2),
      'Model': clf,          # SIMPAN model
      'Vectorizer': vec      # SIMPAN vectorizer
})
# Tampilkan hasil
results_df = pd.DataFrame(results)
print(results_df)

                   Skema  Train Acc.  Test Acc.  \
0   SVM + TF-IDF (80/20)       91.05      67.21   
1  RF + Word2Vec (80/20)       97.00      56.91   
2    RF + TF-IDF (70/30)       96.75      62.75   

                                               Model  \
0                                              SVC()   
1  (DecisionTreeClassifier(max_features='sqrt', r...   
2  (DecisionTreeClassifier(max_features='sqrt', r...   

                           Vectorizer  
0  TfidfVectorizer(max_features=5000)  
1  TfidfVectorizer(max_features=5000)  
2  TfidfVectorizer(max_features=5000)  


## 8. Inference Example
Coba prediksi komentar baru

In [15]:
# Cari model dan vectorizer dari skema yang kamu mau
chosen = next(item for item in results if item['Skema'] == 'SVM + TF-IDF (80/20)')

# Load model dan vectorizer
model = chosen['Model']
vectorizer = chosen['Vectorizer']

# Prediksi komentar baru
example = "videonya keren!"
clean = simple_preprocess(example)
vec = vectorizer.transform([clean])
pred = model.predict(vec)

print(pred)


['positif']
