In [None]:
# %pip install transformers torch

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import TFBertForQuestionAnswering, BertTokenizerFast
import tensorflow as tf

# Load dataset
df = pd.read_csv('../data/final_dataset.csv')

# Define a function to find context based on the user's question
def find_context_for_question(question, dataset):
    for _, row in dataset.iterrows():
        if row['question'].strip().lower() == question.strip().lower():
            return row['context']
    return None

# Define the model and tokenizer
model_name = "Rifky/Indobert-QA"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = TFBertForQuestionAnswering.from_pretrained(model_name)

# Function to answer a question using the model and found context
def answer_question(question):
    context = find_context_for_question(question, df)
    if context is None:
        return "Pertanyaan tidak ditemukan dalam dataset."
    
    inputs = tokenizer(question, context, return_tensors="tf")
    outputs = model(inputs)
    
    answer_start = tf.argmax(outputs.start_logits, axis=1).numpy()[0]
    answer_end = tf.argmax(outputs.end_logits, axis=1).numpy()[0] + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    
    return answer

# Example usage
user_question = "Apa itu ROI?"
answer = answer_question(user_question)
print(f"Q: {user_question}\nA: {answer}")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForQuestionAnswering: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


Q: Apa itu ROI?
A: rasio keuntungan atau kerugian yang dihasilkan dari investasi relatif terhadap biaya investasi


In [3]:
from transformers import pipeline

# Inisialisasi pipeline untuk question answering
qa_pipeline = pipeline(
    "question-answering",
    model="Rifky/Indobert-QA",
    tokenizer="Rifky/Indobert-QA"
)

# Contoh penggunaan pipeline
context = """
Pangeran Harya Dipanegara (atau biasa dikenal dengan nama Pangeran Diponegoro, 
lahir di Ngayogyakarta Hadiningrat, 11 November 1785 – meninggal di Makassar, 
Hindia Belanda, 8 Januari 1855 pada umur 69 tahun) adalah salah seorang pahlawan 
nasional Republik Indonesia, yang memimpin Perang Diponegoro atau Perang Jawa selama 
periode tahun 1825 hingga 1830 melawan pemerintah Hindia Belanda. Sejarah mencatat, 
Perang Diponegoro atau Perang Jawa dikenal sebagai perang yang menelan korban terbanyak 
dalam sejarah Indonesia, yakni 8.000 korban serdadu Hindia Belanda, 7.000 pribumi, 
dan 200 ribu orang Jawa serta kerugian materi 25 juta Gulden.
"""
question = "kapan pangeran diponegoro meninggal?"

# Melakukan penjawaban pertanyaan
result = qa_pipeline({
    'context': context,
    'question': question
})

# Menampilkan hasil
print(result)


{'score': 0.9597133994102478, 'start': 176, 'end': 190, 'answer': '8 Januari 1855'}


In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained GPT-2 model and tokenizer
model_name = 'openai-community/gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Define the prompt
prompt = "My name is Merve and my favorite"

# Encode the prompt into tokens
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text from the model
output = model.generate(input_ids, max_length=100, num_return_sequences=1)

# Decode the generated text back into string
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)




config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Hari ini cuacanya sangat cerah dan saya merasa.

"I am a woman who has been married for over a year. I am a woman who has been married for over a year. I am a woman who has been married for over a year. I am a woman who has been married for over a year. I am a woman who has been married for over a year. I am a woman who has been married for over a year. I am


In [1]:
# import tensorflow as tf
# from transformers import BertTokenizerFast, TFBertForQuestionAnswering, create_optimizer, DefaultDataCollator
# from datasets import load_dataset, Dataset
# from transformers import TrainingArguments
# import matplotlib.pyplot as plt
# from sklearn.metrics import f1_score
# import nltk
# import re

# nltk.download('punkt')

# # Define model and tokenizer
# model_name = "indobenchmark/indobert-large-p2"  # An advanced model for Indonesian language
# tokenizer = BertTokenizerFast.from_pretrained(model_name)
# model = TFBertForQuestionAnswering.from_pretrained(model_name)

# # Text cleaning function
# def clean_text(text):
#     text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
#     return text.strip()

# # Tokenize dataset
# def preprocess_function(examples):
#     questions = [clean_text(q) for q in examples["question"]]
#     contexts = [clean_text(c) for c in examples["context"]]
#     inputs = tokenizer(
#         questions,
#         contexts,
#         max_length=256,
#         truncation="only_second",
#         return_offsets_mapping=True,
#         padding="max_length",
#     )
#     offset_mapping = inputs.pop("offset_mapping")
#     answers = examples["answer"]
#     start_positions = []
#     end_positions = []

#     for i, (answer, offset) in enumerate(zip(answers, offset_mapping)):
#         start_char = examples["answer_start"][i]
#         end_char = start_char + len(answer)

#         sequence_ids = inputs.sequence_ids(i)

#         # Find the start and end of the context
#         idx = 0
#         while sequence_ids[idx] != 1:
#             idx += 1
#         context_start = idx
#         while sequence_ids[idx] == 1:
#             idx += 1
#         context_end = idx - 1

#         if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
#             start_positions.append(0)
#             end_positions.append(0)
#         else:
#             start_idx = context_start
#             while start_idx <= context_end and offset[start_idx][0] <= start_char:
#                 start_idx += 1
#             start_positions.append(start_idx - 1)

#             end_idx = context_start
#             while end_idx <= context_end and offset[end_idx][1] < end_char:
#                 end_idx += 1
#             end_positions.append(end_idx - 1)

#     inputs["start_positions"] = start_positions
#     inputs["end_positions"] = end_positions
#     return inputs

# # Load dataset
# dataset = load_dataset('id_cord19', 'QA')  # An Indonesian financial QA dataset

# # Apply preprocessing
# tokenized_datasets = dataset.map(preprocess_function, batched=True)

# # Split the tokenized dataset into train and validation sets
# train_test_split = tokenized_datasets['train'].train_test_split(test_size=0.1)
# train_dataset = train_test_split['train']
# val_dataset = train_test_split['test']

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="../models/fine_tuned_model",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir='../logs',
#     logging_steps=10,
# )

# data_collator = DefaultDataCollator(return_tensors="tf")

# def create_tf_dataset(tokenized_datasets):
#     def generator():
#         for example in tokenized_datasets:
#             yield (
#                 {
#                     'input_ids': example['input_ids'],
#                     'attention_mask': example['attention_mask']
#                 },
#                 (
#                     example['start_positions'],
#                     example['end_positions']
#                 )
#             )

#     return tf.data.Dataset.from_generator(
#         generator,
#         output_signature=(
#             {
#                 'input_ids': tf.TensorSpec(shape=(256,), dtype=tf.int32),
#                 'attention_mask': tf.TensorSpec(shape=(256,), dtype=tf.int32)
#             },
#             (
#                 tf.TensorSpec(shape=(), dtype=tf.int32),
#                 tf.TensorSpec(shape=(), dtype=tf.int32)
#             )
#         )
#     ).batch(4)

# train_tf_dataset = create_tf_dataset(train_dataset)
# val_tf_dataset = create_tf_dataset(val_dataset)

# # Loss and accuracy functions
# def custom_loss(y_true, y_pred):
#     y_true_start, y_true_end = y_true
#     y_pred_start, y_pred_end = y_pred

#     start_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_start, y_pred_start, from_logits=True)
#     end_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_end, y_pred_end, from_logits=True)
    
#     return (start_loss + end_loss) / 2

# def compute_start_logits_accuracy(y_true, y_pred):
#     return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

# def compute_end_logits_accuracy(y_true, y_pred):
#     return tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)

# # Create optimizer and compile model
# num_train_steps = len(train_tf_dataset) * training_args.num_train_epochs
# optimizer, lr_schedule = create_optimizer(
#     init_lr=training_args.learning_rate,
#     num_warmup_steps=0,
#     num_train_steps=num_train_steps,
#     weight_decay_rate=training_args.weight_decay,
# )

# model.compile(optimizer=optimizer,
#               loss=custom_loss,
#               metrics={
#                   'start_positions': compute_start_logits_accuracy,
#                   'end_positions': compute_end_logits_accuracy
#               })

# # Callback to log and plot metrics
# class LossAccuracyF1Logger(tf.keras.callbacks.Callback):
#     def __init__(self, validation_data):
#         super(LossAccuracyF1Logger, self).__init__()
#         self.validation_data = validation_data
#         self.epoch_loss = []
#         self.epoch_start_accuracy = []
#         self.epoch_end_accuracy = []
#         self.epoch_f1 = []

#     def on_epoch_end(self, epoch, logs=None):
#         self.epoch_loss.append(logs['loss'])
#         self.epoch_start_accuracy.append(logs['start_positions_sparse_categorical_accuracy'])
#         self.epoch_end_accuracy.append(logs['end_positions_sparse_categorical_accuracy'])

#         # Calculate F1 score on validation data
#         predictions_val, true_labels_val = self._predict(self.validation_data)
#         f1_val = self.compute_f1_score(predictions_val, true_labels_val)
#         self.epoch_f1.append(f1_val)

#         # Plot the metrics
#         self.plot()

#     def _predict(self, dataset):
#         predictions = []
#         true_labels = []
#         for batch in dataset:
#             inputs = {'input_ids': batch[0]['input_ids'], 'attention_mask': batch[0]['attention_mask']}
#             true_labels.extend(batch[1][0].numpy())  # start_positions
#             true_labels.extend(batch[1][1].numpy())  # end_positions
#             start_logits, end_logits = model.predict(inputs)
#             pred_start = tf.argmax(start_logits, axis=-1).numpy()
#             pred_end = tf.argmax(end_logits, axis=-1).numpy()
#             predictions.extend(pred_start)
#             predictions.extend(pred_end)
#         return predictions, true_labels

#     def compute_f1_score(self, predictions, true_labels):
#         # Calculate F1 score
#         f1_val = f1_score(true_labels, predictions, average='weighted')
#         return f1_val

#     def plot(self):
#         # Plot the metrics
#         plt.figure(figsize=(18, 5))
#         plt.subplot(1, 3, 1)
#         plt.plot(self.epoch_loss, label='Loss')
#         plt.xlabel('Epoch')
#         plt.ylabel('Loss')
#         plt.legend()
#         plt.title('Training Loss')

#         plt.subplot(1, 3, 2)
#         plt.plot(self.epoch_start_accuracy, label='Start Accuracy')
#         plt.plot(self.epoch_end_accuracy, label='End Accuracy')
#         plt.xlabel('Epoch')
#         plt.ylabel('Accuracy')
#         plt.legend()
#         plt.title('Training Accuracy')

#         plt.subplot(1, 3, 3)
#         plt.plot(range(1, len(self.epoch_f1) + 1), self.epoch_f1, label='F1 Score')
#         plt.xlabel('Epoch')
#         plt.ylabel('F1 Score')
#         plt.legend()
#         plt.title('Validation F1 Score')

#         plt.show()

# logger = LossAccuracyF1Logger(validation_data=val_tf_dataset)

# tf.keras.mixed_precision.set_global_policy('mixed_float16')
# tf.config.run_functions_eagerly(True)

# try:
#     model.fit(train_tf_dataset, epochs=training_args.num_train_epochs, callbacks=[logger])
# except Exception as e:
#     print(f"An error occurred during training: {e}")
#     raise





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alifs\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/1.47G [00:00<?, ?B/s]




All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at indobenchmark/indobert-large-p2 and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetNotFoundError: Dataset 'id_cord19' doesn't exist on the Hub or cannot be accessed. If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/id_cord19 to ask for access.