<a href="https://colab.research.google.com/github/DhaniAAA/Scrapping-Ulasan/blob/main/Sentiment%20Onnx.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
"""
Sentiment Analysis Model Training Script - Google Colab Version

Script untuk training model sentiment analysis dan konversi ke ONNX.
Optimized untuk Google Colab dengan UI upload/download.

Cara pakai di Google Colab:
1. Upload notebook ini
2. Run semua cells
3. Upload CSV dataset saat diminta
4. Download model ONNX yang dihasilkan

Author: Sentiment Analysis Team
"""

# ============================================
# CELL 1: Install Dependencies
# ============================================
import onnxruntime as rt
from skl2onnx.common.data_types import StringTensorType
from skl2onnx import convert_sklearn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import io
from google.colab import files
from pathlib import Path
from datetime import datetime
import re
import json
import numpy as np
import pandas as pd
print("📦 Installing dependencies...")
print("=" * 80)

# Install required packages
#!pip install -q pandas scikit-learn skl2onnx onnxruntime numpy

print("✅ Dependencies installed!")
print()

# ============================================
# CELL 2: Import Libraries
# ============================================
print("📚 Importing libraries...")


print("✅ Libraries imported!")
print()

# ============================================
# CELL 3: Helper Functions
# ============================================


def preprocess_text(text):
    """Preprocessing teks untuk training"""
    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def load_and_prepare_data(df):
    """Load dan prepare dataset"""
    print("📊 Preparing dataset...")

    # Check required columns
    if 'text' not in df.columns or 'sentiment' not in df.columns:
        raise ValueError("Dataset harus memiliki kolom 'text' dan 'sentiment'")

    # Preprocess text
    df['text_clean'] = df['text'].apply(preprocess_text)

    # Remove empty texts
    df = df[df['text_clean'].str.len() > 0]

    # Normalize sentiment labels
    df['sentiment'] = df['sentiment'].str.lower()

    # Standardize labels
    label_map = {
        'positive': 'pos',
        'neutral': 'neu',
        'negative': 'neg'
    }
    df['sentiment'] = df['sentiment'].replace(label_map)

    # Filter valid labels
    valid_labels = {'pos', 'neu', 'neg'}
    df = df[df['sentiment'].isin(valid_labels)]

    print(f"✅ Dataset loaded: {len(df)} samples")
    print("\n📈 Class distribution:")
    print(df['sentiment'].value_counts())
    print("\n📊 Class percentages:")
    print(df['sentiment'].value_counts(normalize=True) * 100)
    print()

    return df


def create_pipeline(max_features=5000, ngram_range=(1, 2)):
    """Create scikit-learn pipeline"""
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=2,
            max_df=0.8,
            sublinear_tf=True,
            strip_accents=None,  # Must be None for ONNX compatibility
            lowercase=True,
            token_pattern=r'\b\w+\b'
        )),
        ('clf', LogisticRegression(
            max_iter=1000,
            C=1.0,
            class_weight='balanced',
            solver='lbfgs',
            random_state=42,
            n_jobs=-1
        ))
    ])
    return pipeline


def train_model(df, test_size=0.2):
    """Train model"""
    print("🎯 Training model...")
    print("=" * 80)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        df['text_clean'],
        df['sentiment'],
        test_size=test_size,
        random_state=42,
        stratify=df['sentiment']
    )

    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    # Create and train pipeline
    pipeline = create_pipeline()
    pipeline.fit(X_train, y_train)

    # Cross-validation
    print("\n🔄 Cross-validation...")
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, n_jobs=-1)
    print(f"CV scores: {cv_scores}")
    print(f"Mean CV: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Test evaluation
    print("\n📊 Test evaluation...")
    y_pred = pipeline.predict(X_test)
    test_score = accuracy_score(y_test, y_pred)
    print(f"Test accuracy: {test_score:.4f}")

    print("\n📋 Classification Report:")
    print(classification_report(y_test, y_pred))

    print("🔢 Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print()

    return pipeline, X_test, y_test, y_pred, cv_scores


def convert_to_onnx(pipeline):
    """Convert to ONNX"""
    print("🔄 Converting to ONNX...")

    initial_type = [('input_text', StringTensorType([None, 1]))]

    onnx_model = convert_sklearn(
        pipeline,
        initial_types=initial_type,
        target_opset=15,
        options={id(pipeline): {'zipmap': False}}
    )

    # Save to bytes
    onnx_bytes = onnx_model.SerializeToString()

    print(f"✅ ONNX model created ({len(onnx_bytes) / (1024*1024):.2f} MB)")
    return onnx_bytes


def test_onnx_model(onnx_bytes, label_map):
    """Test ONNX model"""
    print("\n🧪 Testing ONNX model...")
    print("=" * 80)

    # Load model from bytes
    sess = rt.InferenceSession(onnx_bytes)

    # Test samples
    test_texts = [
        "Produk bagus sekali! Sangat puas 😍",
        "Pengiriman lambat, sangat kecewa",
        "Biasa saja, tidak istimewa",
        "Amazing product! Highly recommended",
        "Terrible service, waste of money",
        "It's okay, nothing special"
    ]

    for text in test_texts:
        text_clean = preprocess_text(text)
        input_data = np.array([[text_clean]])
        outputs = sess.run(None, {'input_text': input_data})

        probs = outputs[1][0]
        pred_idx = np.argmax(probs)
        pred_label = label_map['index_to_label'][pred_idx]
        confidence = probs[pred_idx]

        print(f"📝 Text: {text}")
        print(f"   Prediction: {pred_label.upper()} ({confidence:.2%})")
        print()


print("✅ Helper functions defined!")
print()

# ============================================
# CELL 4: Upload Dataset
# ============================================
print("=" * 80)
print("📤 UPLOAD YOUR DATASET")
print("=" * 80)
print("Format CSV dengan kolom: 'text' dan 'sentiment'")
print("Sentiment values: pos/neu/neg atau positive/neutral/negative")
print()

# Upload file
uploaded = files.upload()

# Get filename
filename = list(uploaded.keys())[0]
print(f"\n✅ File uploaded: {filename}")

# Load dataset
df = pd.read_csv(io.BytesIO(uploaded[filename]))
print(f"📊 Loaded {len(df)} rows")
print()

# Show sample
print("📋 Sample data:")
print(df.head())
print()

# ============================================
# CELL 5: Prepare Data
# ============================================
df = load_and_prepare_data(df)

# ============================================
# CELL 6: Train Model
# ============================================
pipeline, X_test, y_test, y_pred, cv_scores = train_model(df)

# ============================================
# CELL 7: Convert to ONNX
# ============================================
onnx_bytes = convert_to_onnx(pipeline)

# ============================================
# CELL 8: Create Label Map
# ============================================
print("📝 Creating label map...")
label_map = {
    'index_to_label': list(pipeline.classes_)
}
label_map_json = json.dumps(label_map, indent=2)
print(f"✅ Label map: {label_map}")
print()

# ============================================
# CELL 9: Test ONNX Model
# ============================================
test_onnx_model(onnx_bytes, label_map)

# ============================================
# CELL 10: Save Metrics
# ============================================
print("💾 Saving metrics...")
test_score = accuracy_score(y_test, y_pred)
metrics = {
    'accuracy': float(test_score),
    'cv_mean': float(cv_scores.mean()),
    'cv_std': float(cv_scores.std()),
    'training_date': datetime.now().isoformat(),
    'n_samples': len(df),
    'class_distribution': df['sentiment'].value_counts().to_dict()
}
metrics_json = json.dumps(metrics, indent=2)
print("✅ Metrics:")
print(metrics_json)
print()

# ============================================
# CELL 11: Download Files
# ============================================
print("=" * 80)
print("📥 DOWNLOAD MODEL FILES")
print("=" * 80)

# Save ONNX model
with open('sentiment_model.onnx', 'wb') as f:
    f.write(onnx_bytes)
print("✅ sentiment_model.onnx created")

# Save label map
with open('label_map.json', 'w') as f:
    f.write(label_map_json)
print("✅ label_map.json created")

# Save metrics
with open('model_metrics.json', 'w') as f:
    f.write(metrics_json)
print("✅ model_metrics.json created")

print("\n📦 Downloading files...")
files.download('sentiment_model.onnx')
files.download('label_map.json')
files.download('model_metrics.json')

print("\n" + "=" * 80)
print("🎉 TRAINING COMPLETE!")
print("=" * 80)
print("\n📋 Summary:")
print(f"   • Test Accuracy: {test_score:.2%}")
print(f"   • CV Score: {cv_scores.mean():.2%} (+/- {cv_scores.std() * 2:.2%})")
print(f"   • Training Samples: {len(df)}")
print(f"   • Model Size: {len(onnx_bytes) / (1024*1024):.2f} MB")
print("\n📝 Next steps:")
print("   1. Upload sentiment_model.onnx to Supabase Storage")
print("   2. Upload label_map.json to Supabase Storage")
print("   3. Update .env.local with model URLs")
print("   4. Test in your application")
print("\n✨ Happy analyzing!")


📦 Installing dependencies...
✅ Dependencies installed!

📚 Importing libraries...
✅ Libraries imported!

✅ Helper functions defined!

📤 UPLOAD YOUR DATASET
Format CSV dengan kolom: 'text' dan 'sentiment'
Sentiment values: pos/neu/neg atau positive/neutral/negative



Saving dataset_ulasan_sentimen.csv to dataset_ulasan_sentimen.csv

✅ File uploaded: dataset_ulasan_sentimen.csv
📊 Loaded 47990 rows

📋 Sample data:
                                                text sentiment
0                                               good  positive
1                                              masuk  negative
2          dana kok ngambil daget yt eror mohon baik  negative
3  alhamdulillah akun dana cicilnyanunggu2 sekian...  positive
4                                              sidik   neutral

📊 Preparing dataset...
✅ Dataset loaded: 47990 samples

📈 Class distribution:
sentiment
neg    30140
pos    12335
neu     5515
Name: count, dtype: int64

📊 Class percentages:
sentiment
neg    62.804751
pos    25.703272
neu    11.491977
Name: proportion, dtype: float64

🎯 Training model...
Training samples: 38392
Test samples: 9598

🔄 Cross-validation...
CV scores: [0.90701914 0.90311238 0.9084397  0.90440219 0.90830946]
Mean CV: 0.9063 (+/- 0.0043)

📊 Test evaluation..

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🎉 TRAINING COMPLETE!

📋 Summary:
   • Test Accuracy: 91.03%
   • CV Score: 90.63% (+/- 0.43%)
   • Training Samples: 47990
   • Model Size: 0.20 MB

📝 Next steps:
   1. Upload sentiment_model.onnx to Supabase Storage
   2. Upload label_map.json to Supabase Storage
   3. Update .env.local with model URLs
   4. Test in your application

✨ Happy analyzing!
