In [None]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [None]:
pip install transformers[torch] accelerate -U

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, balanced_accuracy_score
from datasets import load_metric
from transformers import BertTokenizer, AutoModel


# Load datasets
small_train_data = pd.read_csv('/content/stopword_besar.csv')
main_test_data = pd.read_csv('/content/5000.csv')



In [None]:
small_train_data

Unnamed: 0,text,label
0,kunjungan prabowo meresmikan menyerahkan proye...,Sumber Daya Alam
1,anies tepuk tangan meriah rektor mewajibkan ma...,Politik
2,emng bener sih pendukung yg goblok jg dg pendu...,Demografi
3,anies bersikap kritis kinerja prabowo dianggap...,Politik
4,anies baswedan harap asn tni polri pegang sump...,Politik
...,...,...
4569,ngeliat debat kemaren pas prabowo kicep kekira...,Politik
4570,masyarakat prabowo gibran memiliki visi sejala...,Politik
4571,imo both are irrational but yg irrational tbh ...,Ekonomi
4572,look at that ganjar sdh berkecimpung lgislatif...,Pertahanan dan Keamanan


In [None]:
main_test_data

Unnamed: 0,text,label
0,kunjungan prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,anies dapat tepuk tangan meriah saat jadi rekt...,Politik
2,emng bener sih pendukung ada yg goblok begitu ...,Demografi
3,sewaktu anies bersikap kritis ke kinerja pak p...,Politik
4,anies baswedan harap asn termasuk tni dan polr...,Politik
...,...,...
4995,ngeliat debat kemaren pas prabowo kicep kekira...,Politik
4996,masyarakat yakin bahwa prabowo gibran memiliki...,Politik
4997,imo both are irrational but yg satu jauh lebih...,Ekonomi
4998,look at that pak ganjar anda sdh berkecimpung ...,Pertahanan dan Keamanan


In [None]:
# Ensure correct data types
small_train_data['text'] = small_train_data['text'].astype(str)
small_train_data['label'] = small_train_data['label'].astype(str)

main_test_data['text'] = main_test_data['text'].astype(str)
main_test_data['label'] = main_test_data['label'].astype(str)

# Define manual label mapping
label_mapping = {
    "Politik": 0,
    "Sosial Budaya": 1,
    "Ideologi": 2,
    "Pertahanan dan Keamanan": 3,
    "Ekonomi": 4,
    "Sumber Daya Alam": 5,
    "Demografi": 6,
    "Geografi": 7
}

small_train_data['label'] = small_train_data['label'].map(label_mapping)
main_test_data['label'] = main_test_data['label'].map(label_mapping)

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(small_train_data)
test_dataset = Dataset.from_pandas(main_test_data)

In [None]:
# Load IndoBERT tokenizer

tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")

# Tokenize datasets with padding and truncation
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

model = AutoModelForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", num_labels=8)
# Load IndoBERT model


# Define training arguments
# Define training arguments
def compute_metrics(p):
    metric = load_metric("accuracy")
    preds = p.predictions.argmax(-1)
    return metric.compute(predictions=preds, references=p.label_ids)

training_args = TrainingArguments(
    output_dir='./results',                 # Directory to save the model checkpoints
    overwrite_output_dir=True,              # Overwrite the content of the output directory
    num_train_epochs=10,                     # Increase the number of epochs for better performance
    per_device_train_batch_size=16,         # Training batch size per device
    per_device_eval_batch_size=32,          # Evaluation batch size per device
    warmup_steps=500,                       # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                      # Weight decay for regularization
    logging_dir='./logs',                   # Directory for storing logs
    logging_steps=50,                       # Log every 50 steps
    evaluation_strategy="steps",            # Evaluate during training at each logging step
    eval_steps=100,                         # Evaluate every 100 steps
    save_steps=500,                         # Save checkpoint every 500 steps
    save_total_limit=2,                     # Limit the total amount of checkpoints, delete the older ones
    learning_rate=5e-5,                     # Learning rate for optimizer
    gradient_accumulation_steps=2,          # Number of updates steps to accumulate the gradients for, before performing a backward/update pass
    fp16=True,                              # Enable mixed precision training (faster training with lower memory usage)
    load_best_model_at_end=True,            # Load the best model found during evaluation at the end of training
    metric_for_best_model="accuracy",       # Metric to use to compare model performance
    label_smoothing_factor=0.1,             # Label smoothing to improve generalization
    dataloader_num_workers=4,               # Number of subprocesses to use for data loading
    report_to="none",                      # Integrate with Weights & Biases for experiment tracking\
    run_name="training_run",                # Name of the W&B run
    logging_first_step=True,                # Log the first global_step
    logging_strategy="epoch",               # Log metrics at the end of each epoch
    disable_tqdm=False                      # Enable/disable tqdm progress bars
)

# Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

Map:   0%|          | 0/4574 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


Step,Training Loss,Validation Loss,Accuracy
100,2.0116,1.445979,0.5936
200,1.5097,1.177526,0.6754
300,1.1078,1.042146,0.7448
400,1.1078,0.93107,0.8052
500,0.9477,0.835985,0.8426
600,0.8104,0.838196,0.8354
700,0.8104,0.839507,0.85
800,0.6773,0.748732,0.8908
900,0.5961,0.714278,0.9036
1000,0.5961,0.7483,0.891


  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y\
The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


TrainOutput(global_step=1430, training_loss=0.771186846953172, metrics={'train_runtime': 524.3967, 'train_samples_per_second': 87.224, 'train_steps_per_second': 2.727, 'total_flos': 3008837000110080.0, 'train_loss': 0.771186846953172, 'epoch': 10.0})

In [None]:
# Evaluate the model
results = trainer.evaluate()
print(f"Evaluation results: {results}")

# Predict on the test dataset
predictions = trainer.predict(test_dataset)

# Convert predictions to labels
predicted_labels = predictions.predictions.argmax(-1)

# Add predictions to the test dataset
test_dataset = test_dataset.add_column("predicted_label", predicted_labels)

# Extract true labels
true_labels = test_dataset['label']

# Calculate and print classification report
report = classification_report(true_labels, predicted_labels, target_names=label_mapping.keys())
print(report)

# Calculate balanced accuracy for each category and overall
balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)
print(f"Balanced Accuracy: {balanced_accuracy * 100:.2f}%")

# Save predictions
test_dataset.to_csv('predicted_test_data_2.csv', index=False)

  self.pid = os.fork()


  self.pid = os.fork()


Evaluation results: {'eval_loss': 0.7483000755310059, 'eval_accuracy': 0.891, 'eval_runtime': 9.7416, 'eval_samples_per_second': 513.264, 'eval_steps_per_second': 16.117, 'epoch': 10.0}


  self.pid = os.fork()
  self.pid = os.fork()


                         precision    recall  f1-score   support

                Politik       0.97      0.89      0.92      2972
          Sosial Budaya       0.69      0.96      0.80       587
               Ideologi       0.81      0.83      0.82       400
Pertahanan dan Keamanan       0.92      0.93      0.93       400
                Ekonomi       0.87      0.91      0.89       367
       Sumber Daya Alam       0.83      0.89      0.86       192
              Demografi       0.86      0.61      0.72        62
               Geografi       0.88      0.70      0.78        20

               accuracy                           0.89      5000
              macro avg       0.85      0.84      0.84      5000
           weighted avg       0.90      0.89      0.89      5000

Balanced Accuracy: 83.97%


Creating CSV from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

7552216

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import BertTokenizer, AutoModel


# Menyimpan model yang telah dilatih
trainer.save_model("./saved_model_final_large")
tokenizer.save_pretrained("./saved_model_final_large")

# Memuat model dan tokenizer yang telah disimpan
model = AutoModelForSequenceClassification.from_pretrained("./saved_model_final_large")
tokenizer = BertTokenizer.from_pretrained("./saved_model_final_large")

Hyper-Tuning

In [None]:
#dataset baru
small_train_data = pd.read_csv('/content/stopword_kecil.csv')
main_test_data = pd.read_csv('/content/5000.csv')


In [None]:
small_train_data

Unnamed: 0,text,label
0,pendapatan apbd jumbo indonesia mencapai rp,Ekonomi
1,pasangan duet capres cawapres ganjar pranowo m...,Ekonomi
2,prabowo gibran terapkan program melancarkan ek...,Ekonomi
3,jaua suparna anies s nya fakultas ekonomi s ny...,Ekonomi
4,capres nomor urut anies baswedan pergeseran su...,Ekonomi
...,...,...
1227,peta politiknya lihat anies prab cek jg anies ...,Geografi
1228,n reii ganjar mahfud titip dipikirkan gagasann...,Geografi
1229,oiya kak km cek pul,Geografi
1230,kampung ku aeange deket bukit kamoung petani y...,Geografi


In [None]:
main_test_data

Unnamed: 0,text,label
0,kunjungan prabowo ini untuk meresmikan dan men...,Sumber Daya Alam
1,anies dapat tepuk tangan meriah saat jadi rekt...,Politik
2,emng bener sih pendukung ada yg goblok begitu ...,Demografi
3,sewaktu anies bersikap kritis ke kinerja pak p...,Politik
4,anies baswedan harap asn termasuk tni dan polr...,Politik
...,...,...
4995,ngeliat debat kemaren pas prabowo kicep kekira...,Politik
4996,masyarakat yakin bahwa prabowo gibran memiliki...,Politik
4997,imo both are irrational but yg satu jauh lebih...,Ekonomi
4998,look at that pak ganjar anda sdh berkecimpung ...,Pertahanan dan Keamanan


In [None]:
# Ensure correct data types
small_train_data['text'] = small_train_data['text'].astype(str)
small_train_data['label'] = small_train_data['label'].astype(str)

main_test_data['text'] = main_test_data['text'].astype(str)
main_test_data['label'] = main_test_data['label'].astype(str)

# Define manual label mapping
label_mapping = {
    "Politik": 0,
    "Sosial Budaya": 1,
    "Ideologi": 2,
    "Pertahanan dan Keamanan": 3,
    "Ekonomi": 4,
    "Sumber Daya Alam": 5,
    "Demografi": 6,
    "Geografi": 7
}

small_train_data['label'] = small_train_data['label'].map(label_mapping)
main_test_data['label'] = main_test_data['label'].map(label_mapping)

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(small_train_data)
test_dataset = Dataset.from_pandas(main_test_data)

In [None]:
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1232
})

In [None]:
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})

In [None]:

# Tokenize datasets with padding and truncation
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])



# Define training arguments
# Define training arguments
def compute_metrics(p):
    metric = load_metric("accuracy")
    preds = p.predictions.argmax(-1)
    return metric.compute(predictions=preds, references=p.label_ids)

training_args_tuning = TrainingArguments(
    output_dir='./results_tuning',
    overwrite_output_dir=True,
    num_train_epochs=5,                    # Ubah jumlah epoch yang lebih sedikit
    per_device_train_batch_size=8,         # Ubah ukuran batch menjadi lebih kecil
    per_device_eval_batch_size=16,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs_tuning',
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_steps=100,
    save_total_limit=1,
    learning_rate=3e-5,                    # Ubah laju pembelajaran jika perlu
    report_to="none",
    disable_tqdm=False
)
# Initialize Trainer

trainer_tuning = Trainer(
    model=model,
    args=training_args_tuning,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer_tuning.train()

Map:   0%|          | 0/1232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.1779,0.411674,0.888
2,0.2196,0.426433,0.8924
3,0.12,0.505055,0.8838
4,0.0479,0.43019,0.9064
5,0.0617,0.46394,0.9024


TrainOutput(global_step=770, training_loss=0.11670777789958112, metrics={'train_runtime': 180.8456, 'train_samples_per_second': 34.062, 'train_steps_per_second': 4.258, 'total_flos': 405212853534720.0, 'train_loss': 0.11670777789958112, 'epoch': 5.0})

In [None]:
# Evaluate the model
results = trainer_tuning.evaluate()
print(f"Evaluation results: {results}")

# Predict on the test dataset
predictions = trainer_tuning.predict(test_dataset)

# Convert predictions to labels
predicted_labels = predictions.predictions.argmax(-1)

# Add predictions to the test dataset
test_dataset = test_dataset.add_column("predicted_label", predicted_labels)

# Extract true labels
true_labels = test_dataset['label']

# Calculate and print classification report
report = classification_report(true_labels, predicted_labels, target_names=label_mapping.keys())
print(report)

# Calculate balanced accuracy for each category and overall
balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)
print(f"Balanced Accuracy: {balanced_accuracy * 100:.2f}%")

# Save predictions
test_dataset.to_csv('predicted_test_data_tuning_3.csv', index=False)

Evaluation results: {'eval_loss': 0.46394023299217224, 'eval_accuracy': 0.9024, 'eval_runtime': 10.9414, 'eval_samples_per_second': 456.978, 'eval_steps_per_second': 28.607, 'epoch': 5.0}
                         precision    recall  f1-score   support

                Politik       0.98      0.89      0.93      2972
          Sosial Budaya       0.83      0.94      0.88       587
               Ideologi       0.84      0.88      0.86       400
Pertahanan dan Keamanan       0.84      0.97      0.90       400
                Ekonomi       0.87      0.85      0.86       367
       Sumber Daya Alam       0.65      0.96      0.78       192
              Demografi       0.66      0.92      0.77        62
               Geografi       0.90      0.90      0.90        20

               accuracy                           0.90      5000
              macro avg       0.82      0.91      0.86      5000
           weighted avg       0.91      0.90      0.90      5000

Balanced Accuracy: 91.39%


Creating CSV from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

7552216

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Menyimpan model yang telah dilatih
trainer_tuning.save_model("./saved_model_final_large_2")
tokenizer.save_pretrained("./saved_model_final__large_2")


('./saved_model_final__large_2/tokenizer_config.json',
 './saved_model_final__large_2/special_tokens_map.json',
 './saved_model_final__large_2/vocab.txt',
 './saved_model_final__large_2/added_tokens.json')

In [None]:

# Memuat model dan tokenizer yang telah disimpan
model = AutoModelForSequenceClassification.from_pretrained("./saved_model_final_large_2")
tokenizer = AutoTokenizer.from_pretrained("./saved_model_final_large_2")

In [None]:
import pandas as pd
import torch
# Misalkan dataset Anda disimpan dalam DataFrame baru yang disebut 'dataset'
dataset = pd.read_csv('/content/dataTest.csv')

# Tokenisasi teks
tokenized_texts = tokenizer(dataset['Text'].tolist(), padding='max_length', truncation=True, max_length=128, return_tensors='pt')

# Prediksi label
with torch.no_grad():
    outputs = model(**tokenized_texts)
    predictions = outputs.logits.argmax(dim=-1)

# Mengganti kolom 'Text' dengan hasil prediksi label (dalam angka)
dataset['Label'] = predictions.tolist()

# Reverse mapping dari angka kembali ke label teks
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
dataset['Label'] = dataset['Label'].map(reverse_label_mapping)

# Menyimpan dataset yang telah diupdate
dataset.to_csv('Prediksi_BDC_3.csv', index=False)
