In [1]:
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uqq transformers[torch] accelerate datasets kaggle fastkaggle evaluate

In [2]:
from fastkaggle import *

In [3]:
# for working with paths in Python, I recommend using `pathlib.Path`
from pathlib import Path

# Location where the Kaggle client looks for its API token.
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not iskaggle and not cred_path.exists():
    # Paste the contents of your kaggle.json here for a local run; keep the
    # committed notebook with an empty value.
    creds = ''
    # Only write the file when a token was actually supplied: an empty
    # kaggle.json is not valid JSON and, once created, would also stop this
    # cell from ever writing the real token (the `exists()` check above).
    if creds:
        cred_path.parent.mkdir(exist_ok=True)
        cred_path.write_text(creds)
        # Restrict the token file to the current user.
        cred_path.chmod(0o600)

In [7]:
# Kaggle competition slug; also passed to `push_notebook` at the end of the notebook.
comp = 'contradictory-my-dear-watson'

# `setup_comp` is presumably provided by the `fastkaggle` star-import (confirm);
# its return value is used below as the directory containing train.csv,
# test.csv and sample_submission.csv.
path = setup_comp(comp)

In [8]:
# Display the competition data directory.
path

Path('contradictory-my-dear-watson')

In [9]:
# List the files in the competition data directory.
!ls {path}

model_01  sample_submission.csv  test.csv  train.csv


In [32]:
import pandas as pd
from fastai.text.all import *

# Read the training table and the test table from the competition directory.
df, df_test = (pd.read_csv(path/csv_name) for csv_name in ('train.csv', 'test.csv'))

In [33]:
# Summary statistics for every column, including the non-numeric ones.
df.describe(include='all')

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
count,12120,12120,12120,12120,12120,12120.0
unique,12120,8209,12119,15,15,
top,5130fd2cb5,and these comments were considered in formulating the interim rules.,I am not sure.,en,English,
freq,1,3,2,6870,6870,
mean,,,,,,0.990759
std,,,,,,0.824523
min,,,,,,0.0
25%,,,,,,0.0
50%,,,,,,1.0
75%,,,,,,2.0


In [34]:
# Distinct values found in the `language` column.
df['language'].unique()

array(['English', 'French', 'Thai', 'Turkish', 'Urdu', 'Russian',
       'Bulgarian', 'German', 'Arabic', 'Chinese', 'Hindi', 'Swahili',
       'Vietnamese', 'Spanish', 'Greek'], dtype=object)

In [35]:
def inputize(df):
    """Build the single-string model input from the premise/hypothesis columns.

    The returned frame has a new `input` column of the form
    ``'TEXT1: <premise>; TEXT2: <hypothesis>; LANG1: <lang_abv>'`` and no longer
    contains the `language`, `lang_abv`, `premise` and `hypothesis` columns.

    The frame passed in is left untouched (a new frame is returned), so the
    caller's original data stays available and the function can be called
    again on the same raw frame.

    Args:
        df: DataFrame with `premise`, `hypothesis`, `lang_abv` and `language`
            columns.

    Returns:
        A new DataFrame with the four source columns removed and `input`
        appended as the last column.

    Raises:
        AttributeError / KeyError: if one of the required columns is missing.
    """
    text = 'TEXT1: ' + df.premise + '; TEXT2: ' + df.hypothesis + '; LANG1: ' + df.lang_abv
    # drop() returns a copy, and assign() appends `input` at the end, which
    # keeps the same column order the in-place version produced.
    return df.drop(columns=['language', 'lang_abv', 'premise', 'hypothesis']).assign(input=text)

# Rebind `df` to the converted frame. The returned frame no longer has the
# premise/hypothesis/language columns, so re-running this cell requires
# reloading train.csv first.
df = inputize(df)

# Preview the first few combined input strings.
df['input'].head()

0                                                                                    TEXT1: and these comments were considered in formulating the interim rules.; TEXT2: The rules developed in the interim were put together with these comments in mind.; LANG1: en
1                                                                                              TEXT1: These are issues that we wrestle with in practice groups of law firms, she said. ; TEXT2: Practice groups are not permitted to work on these issues.; LANG1: en
2                                                                                                        TEXT1: Des petites choses comme celles-là font une différence énorme dans ce que j'essaye de faire.; TEXT2: J'essayais d'accomplir quelque chose.; LANG1: fr
3                                                                                           TEXT1: you know they can't really defend themselves like somebody grown uh say my age you know yeah; TEXT2: They can't def

In [36]:
from datasets import Dataset,DatasetDict

# Convert the pandas frame into a `datasets.Dataset` and display its schema.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'label', 'input'],
    num_rows: 12120
})

In [37]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

# Name of the pretrained checkpoint; the tokenizer and the model are both
# loaded from this same name.
model_nm = 'symanto/xlm-roberta-base-snli-mnli-anli-xnli'
tokenizer = AutoTokenizer.from_pretrained(model_nm)

# Sequence-classification head with three output classes (the prediction
# arrays further down have three columns).
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=3)



In [38]:
# Sanity-check the tokenizer on a sample sentence.
tokenizer.tokenize("G'day folks, I'm Jeremy from fast.ai!")

['▁G',
 "'",
 'day',
 '▁folks',
 ',',
 '▁I',
 "'",
 'm',
 '▁Jeremy',
 '▁from',
 '▁fast',
 '.',
 'ai',
 '!']

In [39]:
def tok_func(x):
    """Run the notebook-level `tokenizer` over the `input` field of `x`."""
    texts = x["input"]
    return tokenizer(texts)

# Tokenize the whole dataset; with batched=True `tok_func` is given batches of
# rows rather than one row at a time.
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/12120 [00:00<?, ? examples/s]

In [40]:
# Inspect the first tokenized row: the input text next to its token ids.
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: and these comments were considered in formulating the interim rules.; TEXT2: The rules developed in the interim were put together with these comments in mind.; LANG1: en',
 [0,
  13018,
  70981,
  82825,
  136,
  6097,
  24626,
  3542,
  90698,
  23,
  26168,
  1916,
  70,
  1940,
  464,
  91736,
  5,
  74,
  13018,
  70981,
  304,
  12,
  581,
  91736,
  126809,
  23,
  70,
  1940,
  464,
  3542,
  3884,
  25842,
  678,
  6097,
  24626,
  23,
  7086,
  5,
  74,
  6,
  73386,
  82825,
  22,
  2])

In [41]:
# Hold out 25% of the labelled rows for validation, with a fixed seed so the
# split is repeatable. The held-out part is stored under the key 'test'; it is
# not the competition test set (`df_test` / `test_ds`).
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'input', 'input_ids', 'attention_mask'],
        num_rows: 9090
    })
    test: Dataset({
        features: ['id', 'label', 'input', 'input_ids', 'attention_mask'],
        num_rows: 3030
    })
})

In [42]:
# Give the competition test set the same `input` column and tokenization as
# the training data.
df_test = inputize(df_test)
test_ds = Dataset.from_pandas(df_test).map(tok_func, batched=True)

Map:   0%|          | 0/5195 [00:00<?, ? examples/s]

In [43]:
from transformers import TrainingArguments,Trainer

# Per-device training batch size (evaluation uses bs*2).
bs = 4
# Number of training epochs.
epochs = 4
# NOTE(review): 1e-7 is far smaller than commonly used fine-tuning learning
# rates — confirm this is intentional.
lr = 1e-7

In [44]:
# Training configuration written to the 'outputs' directory: cosine learning
# rate schedule with a 10% warmup ratio, fp16, evaluation once per epoch, an
# evaluation batch size of twice the training batch size, and weight decay 0.01.
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none', save_steps=0)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [45]:
import numpy as np
import evaluate

# Load the "accuracy" metric through the `evaluate` library; used by
# `compute_accuracy` below.
accuracy = evaluate.load("accuracy")

def compute_accuracy(eval_pred):
    """Score a (logits, labels) pair with the notebook-level `accuracy` metric.

    The predicted class for each row is the index of its largest logit along
    the last axis.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted, references=references)

In [46]:
# Train on the 'train' split and evaluate on the held-out split stored under
# dds['test'], reporting accuracy through `compute_accuracy`.
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokenizer, compute_metrics=compute_accuracy)

In [47]:
# Run fine-tuning; the trailing `;` suppresses display of the return value.
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3936,0.36431,0.875578
2,0.4681,0.410373,0.882508
3,0.4447,0.425494,0.884818
4,0.4762,0.426211,0.885479


In [48]:
# When not running on Kaggle, save the trained model under <path>/model_01.
if not iskaggle:
    trainer.save_model(path/"model_01")
    # NOTE(review): the commented-out reload snippet below is stale — it reads
    # from "01.pth" (the model is saved to "model_01" above) and refers to
    # `accuracy_metric`, while this notebook defines `compute_accuracy`.
    # model = AutoModelForSequenceClassification.from_pretrained(path/"01.pth")
    # tokenizer = AutoTokenizer.from_pretrained(path/"01.pth")
    # trainer = Trainer(model, args, train_dataset=dds['test'], eval_dataset=dds['test'],
    #                   tokenizer=tokenizer, compute_metrics=accuracy_metric)

In [49]:
# When not running on Kaggle, bundle the saved model directory into
# <path>/model_01.tar.
if not iskaggle:
    !tar -cf {(path/"model_01.tar")} {(path/"model_01")}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [50]:
# Load the sample submission to see the expected column names.
sample_submission = pd.read_csv(path/'sample_submission.csv')
sample_submission

Unnamed: 0,id,prediction
0,c6d58c3f69,1
1,cefcc82292,1
2,e98005252c,1
3,58518c10ba,1
4,c32b0d16df,1
...,...,...
5190,5f90dd59b0,1
5191,f357a04e86,1
5192,1f0ea92118,1
5193,0407b48afb,1


In [51]:
# Raw model outputs for the competition test set: one row per example, one
# column per class.
preds = trainer.predict(test_ds).predictions
preds

array([[-2.6582031, -3.4941406,  6.265625 ],
       [-3.6015625,  6.0976562, -3.3125   ],
       [ 6.5546875, -1.9160156, -3.2890625],
       ...,
       [ 6.7460938, -2.9355469, -2.2441406],
       [ 5.3984375, -1.5419922, -2.7011719],
       [-3.2832031, -2.7539062,  6.0703125]], dtype=float32)

In [52]:
# Pick the highest-scoring class per row. This rebinds `preds` from the logits
# array to the class indices, so re-running this cell on its own would apply
# argmax a second time.
preds = np.argmax(preds, axis=-1)
preds

array([2, 1, 0, ..., 0, 0, 2])

In [53]:
# Pair each test id with its predicted class, using the same column names as
# sample_submission.csv (`id`, `prediction`).
submission = Dataset.from_dict({
    'id': test_ds['id'],
    'prediction': preds
})

# Write the submission file to the current working directory.
submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

67549

In [35]:
# Preview the first lines of the written file.
# NOTE(review): the recorded output of this cell carries an older execution
# count and an `id,score` header, while the cell above writes a `prediction`
# column — re-run to refresh the output.
!head submission.csv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


id,score
c6d58c3f69,2
cefcc82292,1
e98005252c,0
58518c10ba,1
c32b0d16df,1
aa2510d454,1
865d1c7b16,1
a16f7ed56b,0
6d9fa191e6,1


In [None]:
# if not iskaggle:
#    from kaggle import api
#    api.competition_submit_cli('submission.csv', 'contradictory-watson-01', comp)

In [43]:
# When not running on Kaggle, push this notebook to Kaggle as a private,
# GPU-enabled kernel attached to the competition.
# NOTE(review): the user name and the notebook file name are hard-coded.
if not iskaggle:
    push_notebook('alexchalk', 'contradictory-watson-submission-01',
                  title='Contradictory Watson Submission 01',
                  file='01.ipynb',
                  competition=comp, private=True, gpu=True)

Kernel version 3 successfully pushed.  Please check progress at https://www.kaggle.com/code/alexchalk/contradictory-watson-submission-01
