In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Load the datasets
train_data = pd.read_csv('train.csv')        # training data (has labels)
test_data = pd.read_csv('test_cleaned.csv')  # test data (only id and text)
# NOTE(review): the recorded sample output suggests train.csv holds raw tweets
# (with @mentions) while test_cleaned.csv is already cleaned — confirm that
# both files went through the same text preprocessing.

print("Training data columns:", train_data.columns.tolist())
print("Test data columns:", test_data.columns.tolist())

# 2. Split the training data into raw text features and the target label.
# pandas reads empty CSV fields as NaN; a NaN document makes TfidfVectorizer
# raise and cannot be sliced for the preview below, so missing text becomes ''.
X_train_raw = train_data['text'].fillna('').astype(str)  # still raw text
y_train = train_data['label']                            # target labels

# 3. The test data only provides an id and the text to classify.
test_ids = test_data['id']
X_test_raw = test_data['text'].fillna('').astype(str)    # still raw text

# Preview the first document of each set (truncated).
print(f"\nContoh text data:")
print(f"Training: {X_train_raw.iloc[0][:60]}...")
print(f"Test: {X_test_raw.iloc[0][:10]}...")

# 4. Key preprocessing step: convert text into numeric TF-IDF features
vectorizer = TfidfVectorizer(
    max_features=350,      # cap the number of features
    ngram_range=(1, 2),    # use unigrams and bigrams
    min_df=2,              # ignore terms that appear in fewer than 2 documents
    max_df=0.8,            # ignore terms that appear in more than 80% of documents
    lowercase=True,        # convert to lowercase
    sublinear_tf=True      # use sublinear tf scaling
)

# Fit on the training text only, then apply the same fitted transformation
# to both datasets so they share one feature space.
print("\n🔄 Converting text to numerical features...")
X_train = vectorizer.fit_transform(X_train_raw)  # fit + convert training text to numbers
X_test = vectorizer.transform(X_test_raw)        # convert test text with the fitted vectorizer

print(f"✅ Conversion successful!")
print(f"X_train shape: {X_train.shape}")  # (samples, features)
print(f"X_test shape: {X_test.shape}")    # (samples, features)

# 5. Train the decision tree model
print("\n🤖 Training Decision Tree model...")
dt_model = DecisionTreeClassifier(
    random_state=42,        # fixed seed for a reproducible fit
    max_depth=75,           # limit the tree depth
    min_samples_split=70,   # minimum samples required to split a node
    min_samples_leaf=10     # minimum samples required in a leaf
)

# Fit on the TF-IDF features and the training labels.
dt_model.fit(X_train, y_train)
print("✅ Model trained successfully!")

# 6. Predict on the test data
print("\n🔮 Making predictions...")
predictions = dt_model.predict(X_test)          # predicted class label per row
probabilities = dt_model.predict_proba(X_test)  # per-class probabilities per row

# 7. Collect the predictions in a DataFrame (in memory only; nothing is
# written to disk in this cell)
results = pd.DataFrame({
    'id': test_ids,
    'text': X_test_raw,
    'predicted_label': predictions,
    'confidence': np.max(probabilities, axis=1)  # highest class probability per row
})

# 8. Show the results
print(f"\n📋 Hasil Prediksi:")
print(results.head(20))

# Count how many test rows were assigned to each label.
print(f"\n📊 Distribusi Prediksi:")
print(results['predicted_label'].value_counts())

print(f"\nℹ️  Jumlah fitur yang digunakan: {X_train.shape[1]}")
print(f"ℹ️  Classes yang diprediksi: {dt_model.classes_}")

Training data columns: ['text', 'label']
Test data columns: ['id', 'text']

Contoh text data:
Training: @hyalfay @NatharElyas @BosPurwa @NatharElyas @BosPurwa Prabo...
Test: gua beda g...

üîÑ Converting text to numerical features...
‚úÖ Conversion successful!
X_train shape: (5000, 350)
X_test shape: (5000, 350)

ü§ñ Training Decision Tree model...
‚úÖ Model trained successfully!

üîÆ Making predictions...

üìã Hasil Prediksi:
    id                                               text predicted_label  \
0    0  gua beda gapernah download shope tagih data bocor       reformasi   
1    1  bayang kiai dakwah ilmuwan tel kontribusi bang...         harmoni   
2    2  pimpin visi bikin indonesia kuat global desa k...      pemerataan   
3    3  masyarakat terap strategi investasi didik tera...        ideologi   
4    4  aman data pribadi rakyat sik durung maksim gay...       reformasi   
5    5  omon omon doang prose tuh lapor jokowi gibran ...         harmoni   
6    6  ayo tangkap kader 