import os, time
# Set OMP_NUM_THREADS to '8' before the remaining imports below are executed
# (presumably so OpenMP-backed libraries see the value when they load -- TODO confirm).
os.environ['OMP_NUM_THREADS'] = '8'

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.profiler import profile, record_function, ProfilerActivity
from datasets import load_dataset

# Only the first five rows of the train and test splits are requested.
# `imdb` is indexed as imdb[0] (train slice) and imdb[1] (test slice) below.
_imdb_slices = ["train[:5]", "test[:5]"]
imdb = load_dataset("imdb", split=_imdb_slices)

# Tokenizer for the same "bert-base-uncased" checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Args:
        examples: Object supporting ``examples["text"]``; in this file it is
            supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

def _tokenize_split(split):
    """Run ``preprocess_function`` over one dataset split in batched mode."""
    return split.map(preprocess_function, batched=True)


# imdb[0] is the train slice, imdb[1] the test slice.
tokenized_imdb_tr = _tokenize_split(imdb[0])
tokenized_imdb_ts = _tokenize_split(imdb[1])

# Collator that pads batches using the tokenizer loaded above.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

# Metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-item iterable; the first item holds per-class scores
            (reduced with ``argmax`` over axis 1), the second the reference
            labels.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(
        predictions=predicted_ids,
        references=references,
    )

# Label-name <-> class-id maps; id2label is the exact inverse of label2id.
label2id = {"NEGATIVE": 0, "POSITIVE": 1}
id2label = {class_id: name for name, class_id in label2id.items()}

# Sequence-classification model with one output per entry in the label maps.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Training configuration: one epoch, batch size 1 for both train and eval.
# push_to_hub is not enabled here.
training_args = TrainingArguments(
    output_dir="my_awesome_model2",
    # Schedule and batch sizes.
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Optimizer hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# Wire the model, data, and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: tokenized train/test slices, padded per batch by the collator.
    train_dataset=tokenized_imdb_tr,
    eval_dataset=tokenized_imdb_ts,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Accuracy is computed on each evaluation pass.
    compute_metrics=compute_metrics,
)
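# Environment setup notes:
#   python -m venv venv_10_3
#   pip install torch==2.0 transformers pandas datasets scikit-learn evaluate
# Disabled traceback capture (re-enabling it also requires `import torch`,
# since only names from torch.profiler are imported in this file):
# tb0 = torch._C._profiler.gather_traceback(python=True, script=True, cpp=True)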

# Profile the whole training run, recording call stacks, memory usage and
# operator input shapes. The results are read from `prof` after the block.
with profile(
    with_stack=True,
    profile_memory=True,
    record_shapes=True,
) as prof:
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can. Both readings are taken inside the `with`
    # block so the profiler's own exit work is not counted.
    start_time = time.perf_counter()
    trainer.train()
    end_time = time.perf_counter()
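# Disabled (needs the commented-out tb0 capture): symbolized_tracebacks = torch._C._profiler.symbolize_tracebacks([tb0])
# Example invocation, redirecting stdout and stderr to a file: python test2.py &> pereval_tr_def.txt
# Disabled: print(symbolized_tracebacks[0])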




# Elapsed seconds between the two timestamps taken around trainer.train().
total_time = end_time - start_time

# Aggregated profiler statistics, rendered as a text table.
print(prof.key_averages().table())

# Heading and value go on separate lines.
print("Training Time Taken:", total_time, sep="\n")

# Remove the thread-count override set at the top of the script.
os.environ.pop('OMP_NUM_THREADS')


# NOTE: the profiling results are printed above via prof.key_averages().table().