<a href="https://colab.research.google.com/github/FlorianShepherd/ml/blob/main/nlp_beginner_tutorial_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import files

# Bind the result instead of leaving `files.upload()` as the cell's last
# expression: the call returns a {filename: file-bytes} dict, so echoing it
# would write the contents of the uploaded kaggle.json (the API key) into the
# notebook's saved output.
uploaded = files.upload()
sorted(uploaded)  # display only the names of the uploaded files


In [None]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Fetch the "nlp-getting-started" competition archive with the Kaggle CLI.
# The extraction cell below expects it at /content/nlp-getting-started.zip.
!kaggle competitions download -c nlp-getting-started

In [None]:
!mkdir Dataset
!cp /content/nlp-getting-started.zip /content/Dataset/nlp-getting-started.zip
!unzip -q /content/Dataset/nlp-getting-started.zip -d /content/Dataset
!rm /content/Dataset/nlp-getting-started.zip


In [None]:
! pip install --upgrade pip
! pip install --no-cache-dir transformers
! pip install sentencepiece
! pip install accelerate
! pip install datasets

In [None]:
# Read the two competition splits that were extracted into /content/Dataset.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Dataset/train.csv", "/content/Dataset/test.csv")
)

In [None]:
# The model is later created with a single output (num_labels=1), so the
# label column is stored as float.
train_df = train_df.assign(target=train_df["target"].astype(float))

# Fill each column's missing entries with that column's most frequent value
# (first row of DataFrame.mode()).
mode = train_df.mode().iloc[0]
train_df = train_df.fillna(mode)

# Per-column count of remaining missing values.
train_df.isna().sum()

In [None]:
# Inspect a single training row (positional index 3) after the fill step above.
train_df.iloc[3]

In [None]:
# Build the model input: keyword, location and tweet text joined into one
# string, each introduced by a fixed field marker.
train_df["input"] = (
    "TEXT1: " + train_df["keyword"]
    + "; TEXT2: " + train_df["location"]
    + "; TEXT3: " + train_df["text"]
)
# Disabled variant without the keyword field:
# train_df["input"] = "TEXT2: " + train_df["location"] + "; TEXT3: " + train_df["text"]

In [None]:
# Show the raw tweet text column for comparison with the composed "input".
train_df["text"]

In [None]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
# Checkpoint name; used for the tokenizer here and for the model created in
# the training-setup cell.
model_nm = 'microsoft/deberta-v3-small'
# Alternative checkpoint, currently disabled:
#model_nm = 'microsoft/DeBERTa-v3-base'
# use_fast=False asks AutoTokenizer for the non-"fast" tokenizer implementation.
tokz = AutoTokenizer.from_pretrained(model_nm, use_fast=False)

In [None]:
# Sanity check: show how the first composed input string is split into tokens.
tokz.tokenize(train_df.iloc[0]["input"])

In [None]:
def tok_func(x):
    """Run the notebook-level tokenizer ``tokz`` on the "input" field of ``x``.

    ``x`` is whatever ``Dataset.map`` passes in (a row, or a batch of rows
    when ``batched=True``); the tokenizer's output is returned unchanged.
    """
    text = x["input"]
    return tokz(text)

In [None]:
from datasets import Dataset,DatasetDict

# Wrap the prepared training DataFrame in a `datasets.Dataset`.
ds = Dataset.from_pandas(train_df)

In [None]:
# Display the dataset summary (its repr).
ds

In [None]:
# Apply tok_func over the whole dataset; with batched=True the function is
# called on batches of rows rather than one row at a time.
tok_ds = ds.map(tok_func, batched=True)

In [None]:
# Display the tokenized dataset summary (its repr).
tok_ds

In [None]:
# Look at one tokenized example: the input string next to its token ids.
row = tok_ds[0]
row['input'], row['input_ids']

In [None]:
# Rename the target column to `labels` for training.
# Guarded so the cell is safe to run more than once: after the first run the
# `target` column no longer exists and an unconditional rename would raise.
if 'target' in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({'target':'labels'})

In [None]:
# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split reproducible. The result has "train" and "test" entries.
dds = tok_ds.train_test_split(test_size=0.25, seed=1337)

In [None]:
# Fill each column's missing entries with that column's most frequent
# value, mirroring the treatment of the training frame.
mode = test_df.mode().iloc[0]
test_df = test_df.fillna(mode)

# Same field layout as the training inputs.
test_df["input"] = (
    "TEXT1: " + test_df["keyword"]
    + "; TEXT2: " + test_df["location"]
    + "; TEXT3: " + test_df["text"]
)
# Disabled variant without the keyword field:
# test_df["input"] = "TEXT2: " + test_df["location"] + "; TEXT3: " + test_df["text"]

# The original cell evaluated `test_df.isna().sum()` mid-cell, where the result
# is silently discarded. Make the intended check explicit: a missing piece
# turns the concatenated string into NaN, which the tokenizer cannot handle.
assert not test_df["input"].isna().any(), "test inputs still contain missing values"

eval_ds = Dataset.from_pandas(test_df).map(tok_func, batched=True)

In [None]:
# Display the tokenized test dataset summary (its repr).
eval_ds

In [None]:
def _as_vector(values):
    """Return ``values`` as a flat 1-D float array."""
    return np.asarray(values, dtype=float).reshape(-1)


def mse(x, y):
    """Mean squared error between predictions ``x`` and targets ``y``.

    Both inputs are flattened first. The single-output model yields
    predictions of shape (N, 1) while labels have shape (N,); without
    flattening, ``x - y`` broadcasts to an (N, N) matrix and the mean is
    taken over every prediction/label pair instead of matching pairs.
    """
    x, y = _as_vector(x), _as_vector(y)
    return np.mean((x - y) ** 2)


def mse_d(eval_pred):
    """Trainer ``compute_metrics`` adapter: unpack (predictions, labels) -> {'mse': value}."""
    return {'mse': mse(*eval_pred)}


def acc(x, y):
    """Fraction of predictions ``x`` on the same side of 0.5 as targets ``y``.

    Inputs are flattened for the same broadcasting reason as in ``mse``.
    Predictions are binarised with ``>= 0.5``, the same rule used to build
    the submission, so out-of-range outputs (e.g. 1.7 or -0.6) count as
    class 1 / class 0 rather than contributing negative values.
    """
    x, y = _as_vector(x), _as_vector(y)
    return np.mean((x >= 0.5) == (y >= 0.5))


def acc_d(eval_pred):
    """Trainer ``compute_metrics`` adapter: unpack (predictions, labels) -> {'acc': value}."""
    return {'acc': acc(*eval_pred)}


# Smoke test: a single wrong prediction gives accuracy 0.
acc(0, 1)

In [None]:
from transformers import TrainingArguments,Trainer


In [None]:
import inspect

# Hyper-parameters.
bs = 128
epochs = 5
lr = 8e-5

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old spelling. Because transformers is installed
# unpinned, choose whichever keyword the installed version accepts.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_kw = 'eval_strategy' if 'eval_strategy' in _args_params else 'evaluation_strategy'

# fp16=True enables mixed-precision training and requires a GPU runtime.
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none',
    **{_eval_kw: "epoch"})

# num_labels=1 gives the model a single output, trained here against the float target.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

# Same situation for Trainer: `tokenizer=` was superseded by `processing_class=`.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tok_kw = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'

trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  compute_metrics=acc_d, **{_tok_kw: tokz})

In [None]:
# Run fine-tuning; the trailing `;` suppresses the echoed return value.
trainer.train();

In [None]:
# Raw model outputs for the competition test set, as float64.
raw_preds = trainer.predict(eval_ds).predictions.astype(float)

# Binarise at 0.5: values >= 0.5 become 1.0, values < 0.5 become 0.0.
# Anything that satisfies neither comparison (NaN) is left untouched.
preds = np.where(raw_preds >= 0.5, 1., np.where(raw_preds < 0.5, 0., raw_preds))

In [None]:
import datasets

# `preds` is two-dimensional with one column; take that column as plain ints.
target_column = preds.astype(int)[:, 0].tolist()

submission = datasets.Dataset.from_dict({'id': eval_ds['id'], 'target': target_column})

# Write the id/target pairs in the CSV layout used for the submission.
submission.to_csv('submission.csv', index=False)


In [None]:
# First ten binarised test-set predictions, as written to the submission.
preds.astype(int).T[0].tolist()[:10]

In [None]:
# First ten labels of the held-out validation split (dds["test"] is the 25%
# split of the labelled data, not the competition test set).
dds["test"]["labels"][:10]

In [None]:
# Predictions on the held-out validation split, binarised at 0.5.
# Stored under their own name: assigning to `preds` here would overwrite the
# test-set predictions that the submission cell reads, so re-running that
# cell afterwards would use the wrong predictions.
val_preds = trainer.predict(dds["test"]).predictions.astype(float)
val_preds[val_preds>=0.5] = 1.
val_preds[val_preds<0.5] = 0.
val_preds[10:20]