In [1]:
import pandas as pd
import numpy as np

In [2]:
! pip install -U git+https://github.com/huggingface/transformers.git
! pip install -U git+https://github.com/huggingface/accelerate.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-zugyzv53
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-zugyzv53
  Resolved https://github.com/huggingface/transformers.git to commit abaca9f9432a84cfaa95531de4c72334f38a42f2
  Installing build dependencies ... [?25l- \ | / done
[?25h  Getting requirements to build wheel ... [?25l- \ done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- \ done
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
[?25h  Created wheel for transformers: filename=transformers-4.31.0.dev0-py3-none-any.whl size=7286950 sha256=2561db4f450f802cf6b9013b77cbebb04cafacbe5813750b6595517df5519b44
  Stored in d

In [3]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [4]:
train_df["target"] = train_df["target"].astype(float)

mode = train_df.mode().iloc[0]
train_df.fillna(mode, inplace=True)
train_df.isna().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [5]:
train_df.iloc[3]

id                                                          6
keyword                                            fatalities
location                                                  USA
text        13,000 people receive #wildfires evacuation or...
target                                                    1.0
Name: 3, dtype: object

In [6]:
train_df["input"] = "TEXT1: " + train_df["keyword"] + "; TEXT2: " + train_df["location"] + "; TEXT3: " + train_df["text"]

In [7]:
train_df["input"]

0       TEXT1: fatalities; TEXT2: USA; TEXT3: Our Deed...
1       TEXT1: fatalities; TEXT2: USA; TEXT3: Forest f...
2       TEXT1: fatalities; TEXT2: USA; TEXT3: All resi...
3       TEXT1: fatalities; TEXT2: USA; TEXT3: 13,000 p...
4       TEXT1: fatalities; TEXT2: USA; TEXT3: Just got...
                              ...                        
7608    TEXT1: fatalities; TEXT2: USA; TEXT3: Two gian...
7609    TEXT1: fatalities; TEXT2: USA; TEXT3: @aria_ah...
7610    TEXT1: fatalities; TEXT2: USA; TEXT3: M1.94 [0...
7611    TEXT1: fatalities; TEXT2: USA; TEXT3: Police i...
7612    TEXT1: fatalities; TEXT2: USA; TEXT3: The Late...
Name: input, Length: 7613, dtype: object

In [8]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
model_nm = 'microsoft/deberta-v3-small'
#model_nm = 'microsoft/DeBERTa-v3-base'
tokz = AutoTokenizer.from_pretrained(model_nm)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
tokz.tokenize(train_df.iloc[0]["input"])

['▁TEXT',
 '1',
 ':',
 '▁fatalities',
 ';',
 '▁TEXT',
 '2',
 ':',
 '▁USA',
 ';',
 '▁TEXT',
 '3',
 ':',
 '▁Our',
 '▁Deeds',
 '▁are',
 '▁the',
 '▁Reason',
 '▁of',
 '▁this',
 '▁#',
 'earthquake',
 '▁May',
 '▁ALL',
 'AH',
 '▁Forgive',
 '▁us',
 '▁all']

In [10]:
def tok_func(x): return tokz(x["input"])

In [11]:
from datasets import Dataset,DatasetDict

ds = Dataset.from_pandas(train_df)

In [12]:
ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input'],
    num_rows: 7613
})

In [13]:
tok_ds = ds.map(tok_func, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [14]:
tok_ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7613
})

In [15]:
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: fatalities; TEXT2: USA; TEXT3: Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 [1,
  54453,
  435,
  294,
  23915,
  346,
  54453,
  445,
  294,
  2222,
  346,
  54453,
  508,
  294,
  581,
  65453,
  281,
  262,
  18037,
  265,
  291,
  953,
  117831,
  903,
  4924,
  17018,
  43632,
  381,
  305,
  2])

In [16]:
tok_ds = tok_ds.rename_columns({'target':'labels'})

In [17]:
dds = tok_ds.train_test_split(0.25, seed=42)

In [18]:
mode = test_df.mode().iloc[0]
test_df.fillna(mode, inplace=True)
test_df.isna().sum()

test_df["input"] = "TEXT1: " + test_df["keyword"] + "; TEXT2: " + test_df["location"] + "; TEXT3: " + test_df["text"]
eval_ds = Dataset.from_pandas(test_df).map(tok_func, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [19]:
eval_ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3263
})

In [20]:
def mse(x,y): return np.mean((x-y)**2)
def mse_d(eval_pred): return {'mse': mse(*eval_pred)}

In [21]:
from transformers import TrainingArguments,Trainer


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [22]:
bs = 128
epochs = 3
lr = 8e-5
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz, compute_metrics=mse_d)

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
trainer.train();

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.133062,0.133062
2,No log,0.133567,0.133567
3,No log,0.133924,0.133924


In [24]:
preds = trainer.predict(eval_ds).predictions.astype(float)
preds[preds>=0.5] = 1.
preds[preds<0.5] = 0.

In [25]:
import datasets

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'target': preds.astype(int).T[0].tolist()
})

submission.to_csv('submission.csv', index=False)


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

22746

In [26]:
preds.astype(int).T[0].tolist()[:10]

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

In [27]:
dds["test"]["labels"][:10]

[1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0]

In [28]:
preds = trainer.predict(dds["test"]).predictions.astype(float)
preds[preds>=0.5] = 1.
preds[preds<0.5] = 0.
preds[10:20]

array([0., 1., 0., 0., 1., 1., 1., 0., 1., 0.])