In [27]:
!pip install datasets evaluate transformers[sentencepiece]



In [28]:
from datasets import load_dataset
import pandas as pd
df_train = pd.read_csv('train.csv', engine='python')
df_trained = load_dataset('csv', data_files = 'train.csv')
df_tested = load_dataset('csv', data_files = 'test.csv')

In [29]:
df_train_shuff=df_trained['train'].shuffle().select(range(1000))
df_test_shuff=df_tested['train'].shuffle().select(range(100))

In [30]:
df_train_shuff[:3]

{'qa_id': [8309, 175, 4414],
 'question_title': ["Why don't spinning tops fall over?",
  'Access control Service and Azure Websites Could not load System.IdentityModel.Services',
  'How to measure a crooked room?'],
 'question_body': ['One topic which was covered in university, but which I never understood, is how a spinning top "magically" resists the force of gravity. The conservation of energy explanations make sense, but I don\'t believe that they provide as much insight as a mechanical explanation would.\n\nThe hyperphysics link Cedric provided looks similar to a diagram that I saw in my physics textbook. This diagram illustrates precession nicely, but doesn\'t explain why the top doesn\'t fall. Since the angular acceleration is always tangential, I would expect that the top should spiral outwards until it falls to the ground. However, the diagram seems to indicate that the top should be precessing in a circle, not a spiral. Another reason I am not satisfied with this explanation 

In [31]:
def lowercase_condition_answer(example):
  return {"answer": example["answer"].lower()}

In [32]:
def lowercase_condition_title(example):
  return {"question_title": example["question_title"].lower()}

In [33]:
def lowercase_condition_body(example):
  return {"question_body": example["question_body"].lower()}

In [34]:
df_trained.map(lowercase_condition_answer)
df_trained.map(lowercase_condition_title)
df_trained.map(lowercase_condition_body)
df_tested.map(lowercase_condition_answer)
df_tested.map(lowercase_condition_title)
df_tested.map(lowercase_condition_body)

DatasetDict({
    train: Dataset({
        features: ['qa_id', 'question_title', 'question_body', 'question_user_name', 'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 'url', 'category', 'host'],
        num_rows: 476
    })
})

In [35]:
def compute_body_length(example):
    return {"question_length": len(example["question_body"].split())}

In [36]:
df_trained = df_trained.map(compute_body_length)

df_trained["train"][0]
df_tested = df_tested.map(compute_body_length)

df_trained["train"][0]

{'qa_id': 0,
 'question_title': 'What am I losing when using extension tubes instead of a macro lens?',
 'question_body': 'After playing around with macro photography on-the-cheap (read: reversed lens, rev. lens mounted on a straight lens, passive extension tubes), I would like to get further with this. The problems with the techniques I used is that focus is manual and aperture control is problematic at best. This limited my setup to still subjects (read: dead insects) Now, as spring is approaching, I want to be able to shoot live insects. I believe that for this, autofocus and settable aperture will be of great help.\n\nSo, one obvious but expensive option is a macro lens (say, EF 100mm Macro) However, I am not really interested in yet another prime lens. An alternative is the electrical extension tubes.\n\nExcept for maximum focusing distance, what am I losing when using tubes (coupled with a fine lens, say EF70-200/2.8) instead of a macro lens?\n',
 'question_user_name': 'ysap',
 '

In [37]:
df_trained["train"].sort("question_length")
df_tested["train"].sort("question_length")

Dataset({
    features: ['qa_id', 'question_title', 'question_body', 'question_user_name', 'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 'url', 'category', 'host', 'question_length'],
    num_rows: 476
})

In [38]:
df_trained_filtered = df_trained.filter(lambda x: x["question_length"] > 30)
print(df_trained_filtered.num_rows)
df_tested_filtered = df_tested.filter(lambda x: x["question_length"] > 30)
print(df_tested_filtered.num_rows)

{'train': 5654}
{'train': 445}


In [39]:
import html
df_trained_filt = df_trained_filtered.map(lambda x: {"answer": html.unescape(x["answer"])})
df_trained_filt = df_trained_filt.map(lambda x: {"question_body": html.unescape(x["question_body"])})
df_trained_filt = df_trained_filt.map(lambda x: {"question_title": html.unescape(x["question_title"])})
df_tested_filt = df_tested_filtered.map(lambda x: {"answer": html.unescape(x["answer"])})
df_tested_filt = df_tested_filt.map(lambda x: {"question_body": html.unescape(x["question_body"])})
df_tested_filt = df_tested_filt.map(lambda x: {"question_title": html.unescape(x["question_title"])})

In [40]:
new_df_trained = df_trained_filt.map(
    lambda x: {"answer": [html.unescape(o) for o in x["answer"]]}, batched=True
)
new_df_tested = df_tested_filt.map(
    lambda x: {"answer": [html.unescape(o) for o in x["answer"]]}, batched=True
)

In [41]:
from transformers import AutoTokenizer, DistilBertTokenizer, TFDistilBertForQuestionAnswering
slow_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad", use_fast=False)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")


def tokenize_function(examples):
    return tokenizer(examples["answer"], truncation=True)
def slow_tokenize_function(examples):
    return slow_tokenizer(examples["answer"], truncation=True)


tokenized_dataset = df_trained_filt.map(slow_tokenize_function, batched=True, num_proc=8)

All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [42]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["answer"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    """
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    """
    return result

In [43]:
tokenized_dataset = df_trained_filt.map(tokenize_and_split, batched=True)
#ids = tokenizer.convert_tokens_to_ids(tokenized_dataset)
tokenized_dataset_test = df_tested_filt.map(tokenize_and_split, batched=True)

In [44]:
result = tokenize_and_split(df_trained["train"][0])
#tokenizer.convert_ids_to_tokens(result["qa_id"])
#[len(inp) for inp in result[ids]]
result_test = tokenize_and_split(df_tested["train"][0])
#[len(inp) for inp in result_test["qa_id"]]

In [45]:
tokenized_dataset.set_format("pandas")
tokenized_dataset_test.set_format("pandas")

In [46]:
tokenized_dataset_clean = tokenized_dataset["train"].train_test_split(train_size=0.8, seed=42)
tokenized_dataset_test_clean = tokenized_dataset_test["train"].train_test_split(train_size=0.8, seed=42)

In [47]:
tokenized_dataset_clean.save_to_disk("tokenized_dataset")
tokenized_dataset_test_clean.save_to_disk("tokenized_dataset_test")

Saving the dataset (0/1 shards):   0%|          | 0/4523 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1131 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/356 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/89 [00:00<?, ? examples/s]

In [None]:
import numpy as np
import tensorflow as tf
from transformers import DataCollatorWithPadding
tokenized_dataset = tokenizer(
    df_trained_filt["train"]["question_title"],
    df_trained_filt["train"]["question_body"],
    df_trained_filt["train"]["answer"],
    padding=True,
    truncation=True,
)
tokenized_dataset_answer = tokenizer(tokenized_dataset_clean["answer"])
tokenized_dataset_body = tokenizer(tokenized_dataset_clean["question_body"])

def tokenize_function(example):
    return tokenizer(example["question_title"], example["question_body"],  example["answer"],truncation=False)
tokenized_datasets = tokenized_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["question_title_", "question_body_", "answer_"]}
[len(x) for x in samples["qa_id"]]

batch = data_collator(samples)

{k: v.shape for k, v in batch.items()}

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["question_title", "question_body", "answer"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
tf_test_dataset = tokenized_dataset_test["train"].to_tf_dataset(
    columns=["question_title", "question_body", "answer"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay
import tensorflow as tf

batch_size = 8
num_epochs = 3
# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_steps = len(tf_train_dataset) * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)
from tensorflow.keras.optimizers import Adam

opt = Adam(learning_rate=lr_scheduler)

In [None]:

from tensorflow.keras.losses import SparseCategoricalCrossentropy

checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = TFDistilBertForQuestionAnswering.from_pretrained(checkpoint, num_labels=3)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
model.fit(tf_train_dataset, validation_data=tf_test_dataset, epochs=3)




"""
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
labels = tf.convert_to_tensor([1, 1])
model.train_on_batch(batch, labels)


model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_test_dataset,
)
"""


In [None]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay

batch_size = 8
num_epochs = 3
# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_steps = len(tf_train_dataset) * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)
from tensorflow.keras.optimizers import Adam

opt = Adam(learning_rate=lr_scheduler)

In [None]:
import tensorflow as tf

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

In [None]:
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)

In [None]:
preds = model.predict(tf_validation_dataset)["logits"]

In [None]:
class_preds = np.argmax(preds, axis=1)
print(preds.shape, class_preds.shape)

In [None]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=class_preds, references=df_tested_filt["train"]["label"])

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
import tensorflow as tf

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="tf")
outputs = model(**inputs)

answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)
