# Install requirements

In [None]:
!pip install Datasets
!pip install transformers
!pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import datasets

### Use for debugging only

In [None]:
# Debug aids: ask TensorFlow to run functions eagerly and turn on tf.data
# debug mode. Per the heading above, run this cell only while debugging and
# skip it for regular training runs.
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

In [None]:
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize the ``"text"`` field of a batch to a fixed length.

    Args:
        examples: Mapping that holds the raw strings under the ``"text"`` key.
        tokenizer: Callable invoked with the texts and the encoding options
            assembled below.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
        "return_tensors": "np",
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

In [None]:
def get_data(train_path, test_size=0.2, random_state=42):
    """Load the labelled JSON Lines corpus and split it into train/test frames.

    Args:
        train_path: Path to a JSON Lines file whose records provide at least
            the ``text`` and ``label`` fields.
        test_size: Share of the rows held out for the test frame; forwarded
            to ``train_test_split``. Defaults to the previous fixed 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to the previous fixed 42.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames that contain only the
        ``text`` and ``label`` columns.

    Raises:
        KeyError: If the loaded data has no ``text`` or ``label`` column.
    """
    full_df = pd.read_json(train_path, lines=True)
    full_df = full_df[["text", "label"]]
    # Stratify on the label so both frames keep the original class balance.
    train_df, test_df = train_test_split(
        full_df,
        test_size=test_size,
        stratify=full_df["label"],
        random_state=random_state,
    )
    return train_df, test_df


In [None]:
# Location of the JSON Lines training file that is handed to get_data().
file_path = 'data/subtaskA_train_monolingual.jsonl'

In [None]:
# Read the corpus and split it into train/test DataFrames (see get_data).
train_df, test_df = get_data(file_path)

In [None]:
# Report the shape of each split, one per line.
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)

In [None]:
# Smoke-test setting: keep just the first 100 rows of BOTH the train and the
# test frame so the rest of the notebook runs quickly.
train_df, test_df = train_df.head(100), test_df.head(100)

In [None]:
# Training labels as a NumPy array.
labels = train_df["label"].to_numpy()

In [None]:
# Class index -> readable name, plus the inverse lookup derived from it.
id2label = {
    0: "human",
    1: "machine",
}
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Convert the pandas DataFrames to Hugging Face Datasets.
# The frames carry the shuffled row index left behind by train_test_split;
# preserve_index=False keeps it from being stored as an extra
# `__index_level_0__` column next to `text` and `label`.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Show the summary of the converted training Dataset.
print(train_dataset)

In [None]:
# Bundle both splits under the "train" / "test" keys of one DatasetDict.
dataset = datasets.DatasetDict(train=train_dataset, test=test_dataset)

In [None]:
# Show the summary of the combined train/test DatasetDict.
print(dataset)

In [None]:
# Display the first training example as a quick sanity check.
dataset["train"][0]

In [None]:
# Load the tokenizer of the `bert-base-cased` checkpoint.
# NOTE(review): the model loaded later must come from the same checkpoint,
# otherwise the token ids produced here do not match its vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Demo: encode a pair of sentences to inspect the tokenizer output.
tokenizer("Hello, this is a sentence!", "And this sentence goes with it.")

In [None]:
# Tokenize every split in batches; `fn_kwargs` forwards the tokenizer to
# preprocess_function.
# To list the columns the tokenizer adds, compare
# set(encoded_dataset["train"].features) with set(dataset["train"].features).
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
)

In [None]:
# Inspect the feature type recorded for the `label` column after encoding.
encoded_dataset["train"].features["label"]

In [None]:
# Load the classification model from the SAME checkpoint as the tokenizer
# ("bert-base-cased"). The cased and uncased BERT checkpoints use different
# vocabularies, so feeding cased token ids to the uncased weights would look
# up the wrong embeddings.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Batched, shuffled tf.data pipeline over the encoded training split.
tf_train_dataset = model.prepare_tf_dataset(
    encoded_dataset["train"],
    tokenizer=tokenizer,
    batch_size=8,
    shuffle=True,
)

In [None]:
# Batched, unshuffled tf.data pipeline over the encoded TEST split.
# (Previously this was built from the "train" split, so any evaluation on it
# would have measured performance on the training data.)
tf_test_dataset = model.prepare_tf_dataset(
    encoded_dataset["test"],
    batch_size=8,
    shuffle=False,
    tokenizer=tokenizer,
)

In [None]:
# Compile with an Adam optimizer built from 3e-5 and track accuracy.
# No `loss` is passed on purpose. NOTE(review): this presumably relies on the
# Transformers TF model supplying its own loss — confirm against the
# installed library version.
model.compile(optimizer=Adam(3e-5), metrics=['accuracy'])  # No loss argument!

In [None]:
# Train on the batched training set; no `epochs` argument is given, so the
# Keras default applies.
# NOTE(review): tf_test_dataset is built earlier but is not passed here
# (e.g. as validation data) in the visible code — confirm whether that is
# intended.
model.fit(tf_train_dataset)