In [1]:
import pandas as pd
import glob
import torch, os
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast, TrainingArguments, Trainer, DistilBertForSequenceClassification, DistilBertTokenizerFast
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU')
else:
    device = torch.device("cpu")
    print('CPU')

CPU


In [None]:
train_paths = ['/content/train/california_wildfires_2018_train.tsv',
               '/content/train/canada_wildfires_2016_train.tsv',
               '/content/train/cyclone_idai_2019_train.tsv',
               '/content/train/ecuador_earthquake_2016_train.tsv',
               '/content/train/greece_wildfires_2018_train.tsv',
               '/content/train/hurricane_dorian_2019_train.tsv',
               '/content/train/hurricane_florence_2018_train.tsv',
               '/content/train/hurricane_harvey_2017_train.tsv',
               '/content/train/hurricane_irma_2017_train.tsv',
               '/content/train/hurricane_maria_2017_train.tsv',
               '/content/train/hurricane_matthew_2016_train.tsv',
               '/content/train/italy_earthquake_aug_2016_train.tsv',
               '/content/train/kaikoura_earthquake_2016_train.tsv',
               '/content/train/kerala_floods_2018_train.tsv',
               '/content/train/maryland_floods_2018_train.tsv',
               '/content/train/midwestern_us_floods_2019_train.tsv',
               '/content/train/pakistan_earthquake_2019_train.tsv',
               '/content/train/puebla_mexico_earthquake_2017_train.tsv',
               '/content/train/srilanka_floods_2017_train.tsv']

val_paths = ['/content/dev/california_wildfires_2018_dev.tsv',
             '/content/dev/canada_wildfires_2016_dev.tsv',
             '/content/dev/cyclone_idai_2019_dev.tsv',
             '/content/dev/ecuador_earthquake_2016_dev.tsv',
             '/content/dev/greece_wildfires_2018_dev.tsv',
             '/content/dev/hurricane_dorian_2019_dev.tsv',
             '/content/dev/hurricane_florence_2018_dev.tsv',
             '/content/dev/hurricane_harvey_2017_dev.tsv',
             '/content/dev/hurricane_irma_2017_dev.tsv',
             '/content/dev/hurricane_maria_2017_dev.tsv',
             '/content/dev/hurricane_matthew_2016_dev.tsv',
             '/content/dev/italy_earthquake_aug_2016_dev.tsv',
             '/content/dev/kaikoura_earthquake_2016_dev.tsv',
             '/content/dev/kerala_floods_2018_dev.tsv',
             '/content/dev/maryland_floods_2018_dev.tsv',
             '/content/dev/midwestern_us_floods_2019_dev.tsv',
             '/content/dev/pakistan_earthquake_2019_dev.tsv',
             '/content/dev/puebla_mexico_earthquake_2017_dev.tsv',
             '/content/dev/srilanka_floods_2017_dev.tsv']

test_paths = ['/content/test/california_wildfires_2018_test.tsv',
              '/content/test/canada_wildfires_2016_test.tsv',
              '/content/test/cyclone_idai_2019_test.tsv',
              '/content/test/ecuador_earthquake_2016_test.tsv',
              '/content/test/greece_wildfires_2018_test.tsv',
              '/content/test/hurricane_dorian_2019_test.tsv',
              '/content/test/hurricane_florence_2018_test.tsv',
              '/content/test/hurricane_harvey_2017_test.tsv',
              '/content/test/hurricane_irma_2017_test.tsv',
              '/content/test/hurricane_maria_2017_test.tsv',
              '/content/test/hurricane_matthew_2016_test.tsv',
              '/content/test/italy_earthquake_aug_2016_test.tsv',
              '/content/test/kaikoura_earthquake_2016_test.tsv',
              '/content/test/kerala_floods_2018_test.tsv',
              '/content/test/maryland_floods_2018_test.tsv',
              '/content/test/midwestern_us_floods_2019_test.tsv',
              '/content/test/pakistan_earthquake_2019_test.tsv',
              '/content/test/puebla_mexico_earthquake_2017_test.tsv',
              '/content/test/srilanka_floods_2017_test.tsv']

train_file_paths = []

for path in train_paths:
    files = glob.glob(path)
    train_file_paths.extend(files)

val_file_paths = []

for path in val_paths:
    files = glob.glob(path)
    val_file_paths.extend(files)

test_file_paths = []

for path in test_paths:
    files = glob.glob(path)
    test_file_paths.extend(files)

In [None]:
train_dfs = []

for file in train_file_paths:
    df = pd.read_csv(file, sep='\t')
    df = df.iloc[:, 1:] # removing tweet ids
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)

val_dfs = [] # dataframes

for file in val_file_paths:
    df = pd.read_csv(file, sep='\t')
    df = df.iloc[:, 1:] # removing tweet ids
    val_dfs.append(df)

val_df = pd.concat(val_dfs, ignore_index=True)

test_dfs = [] # dataframes

for file in test_file_paths:
    df = pd.read_csv(file, sep='\t')
    df = df.iloc[:, 1:] # removing tweet ids
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)

ValueError: No objects to concatenate

In [None]:
class_labels = train_df['class_label'].unique().tolist()
class_labels = [s.strip() for s in class_labels]

In [None]:
id2label = {id:label for id, label in enumerate(class_labels)}
label2id = {label:id for id, label in enumerate(class_labels)}

In [None]:
train_df['class_label_num'] = pd.factorize(train_df.class_label)[0] # 53531
val_df['class_label_num'] = pd.factorize(val_df.class_label)[0] # 7793
test_df['class_label_num'] = pd.factorize(test_df.class_label)[0] # 15160

In [None]:
train_df.class_label.value_counts().plot(kind='pie', figsize=(10,10))

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('google-bert/bert-base-uncased', max_length=512)

In [None]:
model = BertForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', num_labels=len(class_labels), id2label=id2label, label2id=label2id)
model.to(device)

In [None]:
train_tweets = list(train_df.tweet_text)
train_labels = list(train_df.class_label_num)

val_tweets = list(val_df.tweet_text)
val_labels = list(val_df.class_label_num)

test_tweets = list(test_df.tweet_text)
test_labels = list(test_df.class_label_num)

In [None]:
print(len(train_tweets), len(val_tweets), len(test_tweets))
print(len(train_labels), len(val_labels), len(test_labels))

In [None]:
train_encodings = tokenizer(train_tweets, truncation=True, padding=True)
val_encodings  = tokenizer(val_tweets, truncation=True, padding=True)
test_encodings = tokenizer(test_tweets, truncation=True, padding=True)

In [None]:
class DataLoader(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_dataloader = DataLoader(train_encodings, train_labels)

val_dataloader = DataLoader(val_encodings, val_labels)

test_dataloader = DataLoader(test_encodings, test_labels)

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)

    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [None]:
training_args = TrainingArguments(
    output_dir='./preds_checkpoints',
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='./multi-class-logs',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    fp16=True,
    load_best_model_at_end=True
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
    compute_metrics= compute_metrics
)

NameError: name 'model' is not defined

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.656,3.500642,0.303734,0.127655,0.133671,0.142059
100,0.7809,3.496252,0.283973,0.128478,0.138155,0.136802
150,0.6613,3.56336,0.281535,0.127709,0.133143,0.128289
200,0.5009,3.724212,0.289747,0.124982,0.12862,0.137257
250,0.6099,3.593887,0.29334,0.12655,0.12979,0.128931
300,0.5767,3.688752,0.285898,0.12987,0.135478,0.131966
350,0.5743,3.837301,0.28038,0.126393,0.13674,0.13315
400,0.5593,3.798586,0.304761,0.130917,0.131425,0.151993
450,0.5152,3.926718,0.269216,0.123712,0.132138,0.130752
500,0.514,3.866246,0.29411,0.124984,0.12565,0.127755


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,2.1233,2.153698,0.280765,0.058716,0.042788,0.110905
100,1.6635,2.347993,0.301424,0.111092,0.093399,0.148205
150,1.2049,2.54207,0.292442,0.1297,0.131533,0.134649
200,0.8413,2.919263,0.280252,0.125646,0.129088,0.130484
250,0.9331,2.763415,0.325035,0.134779,0.129096,0.144039
300,0.8547,3.038362,0.291544,0.131824,0.135085,0.149391
350,0.9077,3.054461,0.280893,0.130696,0.13611,0.158583
400,0.8947,3.14443,0.320159,0.130548,0.144493,0.167257
450,0.7849,3.264558,0.266778,0.127827,0.139442,0.128476
500,0.8052,3.072797,0.310535,0.131206,0.125304,0.148239


Checkpoint destination directory ./preds_checkpoints/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
q = [trainer.evaluate(eval_dataset = train_df) for train_df in [train_dataloader, val_dataloader, test_dataloader]]

pd.DataFrame(q, index=['train','val','test']).iloc[:,:5]

In [None]:
def predict(tweet):
    inputs = tokenizer(tweet, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cuda")

    outputs = model(**inputs)

    probs = outputs[0].softmax(1)

    pred_label_idx = probs.argmax()
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [None]:
text = 'Some people here are severely injured we need help right now!!!'
predict(text)

In [None]:
model_path = 'disaster_classifier'
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
model_path = 'disaster_classifier'


model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [None]:
nlp('I see bodies all over the place and those seem to be corpses')