Install the `transformers` package and the other packages needed to run the BERT model.

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 449 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [None]:
!pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.6.2-py3-none-any.whl (30 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.6.2


Import required packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import torch
from transformers import TrainingArguments, Trainer
from transformers import GPT2Model, GPT2Config, BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import EarlyStoppingCallback
import nltk
import preprocessor as p
import time
import language_tool_python
import re
import pickle
# download the NLTK resources named below (a no-op when they are already up to date)
for nltk_resource in ('averaged_perceptron_tagger', 'wordnet'):
  nltk.download(nltk_resource)

# LanguageTool checker for US English, used for the grammar checks further down
tool = language_tool_python.LanguageTool('en-US')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Read in data

In [None]:
# load the training tweets into a dataframe
data = pd.read_csv(filepath_or_buffer="train.csv")

EDA

In [None]:
# flag tweets in which a letter or digit is repeated 4+ times in a row
# (the column keeps its existing name, '3_consec_letters')

repeat_pattern = re.compile("([a-z\\d])\\1\\1\\1+")
data['3_consec_letters'] = [1 if repeat_pattern.search(tweet.lower()) else 0 for tweet in data['text']]

# how many tweets contain such a repeated run?

tgts = data.loc[data['3_consec_letters'] == 1, 'target']
print(len(tgts))

# what share of those tweets is about a disaster?

print(tgts.sum() / len(tgts))

87
0.42528735632183906


In [None]:
# create a series with language tool matches (grammar suggestions) --> TAKES 30 OR SO MINUTES!!!
# The result is cached in language_tool.pickle, so the slow check only runs when no
# cache file exists. Delete the file to force a fresh computation.

try:
  # NOTE: pickle.load can execute arbitrary code -- only load a cache file you created yourself
  with open('language_tool.pickle', 'rb') as cache_file:
    language_tool = pickle.load(cache_file)
except FileNotFoundError:
  language_tool = [tool.check(p.tokenize(tweet)) for tweet in data['text']]
  with open('language_tool.pickle', 'wb') as cache_file:
    pickle.dump(language_tool, cache_file)

In [None]:
# pickle checkpoint for language_tool
# NOTE: pickle.load can execute arbitrary code -- only load a checkpoint you produced yourself

# to (re)create the checkpoint:
#with open('language_tool.pickle', 'wb') as checkpoint:
#  pickle.dump(language_tool, checkpoint)

# the context manager closes the file handle, which a bare open() inside pickle.load() never did
with open('language_tool.pickle', 'rb') as checkpoint:
  language_tool = pickle.load(checkpoint)
language_tool[:4]

[[Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['HASHTAG', 'HASHTAGS'], 'offsetInContext': 33, 'context': 'Our Deeds are the Reason of this $HASHTAG$ May ALLAH Forgive us all', 'offset': 33, 'errorLength': 9, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'Our Deeds are the Reason of this $HASHTAG$ May ALLAH Forgive us all'})],
 [Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['Range', 'Rouge', 'Congé'], 'offsetInContext': 20, 'context': 'Forest fire near La Ronge Sask. Canada', 'offset': 20, 'errorLength': 5, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'Forest fire near La Ronge Sask.'})],
 [],
 [Match({'ruleId': 'MORFOLOGIK_RULE_EN_US', 'message': 'Possible spelling mistake found.', 'replacements': ['NUMBER', 'NUMBERS'], 'offsetInContext': 0, 'context': '$NUMBER$ people receive $HASHTAG$ evacuation ord...', 'offset': 0, 'errorLen

In [None]:
# from the different rule ids in the language tool grammar check, 
# which are red flags that could possibly predict our target?
#
# For every rule id: count the tweets that trigger it at least once (total) and sum
# the targets of those tweets (true). One pass over the tweets replaces the earlier
# approach of rescanning every tweet once per rule id.

labels = data['target'].tolist()
rule_true = {}
rule_total = {}

for i in range(data.shape[0]):
  # a set, so a tweet that triggers the same rule several times is counted once
  for rule_id in {match.ruleId for match in language_tool[i]}:
    rule_total[rule_id] = rule_total.get(rule_id, 0) + 1
    rule_true[rule_id] = rule_true.get(rule_id, 0) + labels[i]

percent_true = [rule_true[rule_id] / rule_total[rule_id] for rule_id in rule_total]
totals = list(rule_total.values())

# (rule id, share of flagged tweets with target 1, number of flagged tweets)
ids = list(zip(rule_total, percent_true, totals))

# keep rules seen in more than 30 tweets whose target share is below .3 or above .7
red_flag_ids = [entry for entry in ids if (entry[1] < .3 or entry[1] > .7) and entry[2] > 30]

print(red_flag_ids)
red_flag_ids = [entry[0] for entry in red_flag_ids]

[('I_LOWERCASE', 0.1724137931034483, 116), ('PROFANITY', 0.21893491124260356, 169), ('GONNA', 0.23076923076923078, 39), ('COMMA_COMPOUND_SENTENCE', 0.24064171122994651, 187), ('UPPERCASE_SENTENCE_START', 0.266384778012685, 473), ('EN_CONTRACTION_SPELLING', 0.23148148148148148, 108)]


In [None]:
# what percentage of tweets that contain typographical errors (excluding UPPERCASE_SENTENCE_START) are about a disaster?

# targets of the tweets with at least one qualifying match (each tweet counted once)
flagged_targets = [
  data['target'][i]
  for i in range(data.shape[0])
  if any(
    match.ruleIssueType == 'typographical' and match.ruleId != 'UPPERCASE_SENTENCE_START'
    for match in language_tool[i]
  )
]

total = len(flagged_targets)
true = sum(flagged_targets)

print(true/total, total)

0.33406593406593404 455


In [None]:
# add columns to indicate presence of these grammar "red flags" in tweets
# (1 when the tweet has at least one matching language tool result, otherwise 0)
#
# Each column is built as a whole and assigned once. The earlier per-cell statements
# used `==` (a comparison whose result was discarded), so every flag stayed 0.

for rule_id in red_flag_ids:
  data[rule_id] = [
    int(any(match.ruleId == rule_id for match in language_tool[i]))
    for i in range(data.shape[0])
  ]

data['TYPOGRAPHICAL'] = [
  int(any(
    match.ruleIssueType == 'typographical' and match.ruleId != 'UPPERCASE_SENTENCE_START'
    for match in language_tool[i]
  ))
  for i in range(data.shape[0])
]

In [None]:
# what cities are possible red flags in predicting if a tweet is about a disaster?

# Share of disaster tweets (mean of target) and tweet count per location, computed in
# one grouped pass instead of scanning the whole dataframe once per location.
# groupby leaves out rows with a missing location; they could never pass the count
# filter below anyway, because a missing value does not compare equal to anything.
location_stats = data.groupby('location')['target'].agg(['mean', 'count'])

percent_true = location_stats['mean'].tolist()
totals = location_stats['count'].tolist()

# (location, share of tweets with target 1, number of tweets)
locations = list(zip(location_stats.index, percent_true, totals))

# keep locations with more than 15 tweets whose target share is below .4 or above .6
red_flag_locations = [entry for entry in locations if (entry[1] < .4 or entry[1] > .6) and entry[2] > 15]

print(red_flag_locations)
red_flag_locations = [entry[0] for entry in red_flag_locations]

[('Worldwide', 0.631578947368421, 19), ('Washington, DC', 0.7142857142857143, 21), ('London', 0.35555555555555557, 45), ('Mumbai', 0.8636363636363636, 22), ('Los Angeles, CA', 0.3076923076923077, 26), ('Kenya', 0.25, 20), ('USA', 0.6442307692307693, 104), ('Nigeria', 0.7857142857142857, 28), ('India', 0.8333333333333334, 24), ('New York', 0.22535211267605634, 71)]


In [None]:
# what keywords are possible red flags in predicting if a tweet is about a disaster?

# Share of disaster tweets (mean of target) and tweet count per keyword, computed in
# one grouped pass instead of scanning the whole dataframe once per keyword.
# groupby leaves out rows with a missing keyword; they could never pass the count
# filter below anyway, because a missing value does not compare equal to anything.
keyword_stats = data.groupby('keyword')['target'].agg(['mean', 'count'])

percent_true = keyword_stats['mean'].tolist()
totals = keyword_stats['count'].tolist()

# (keyword, share of tweets with target 1, number of tweets)
keywords = list(zip(keyword_stats.index, percent_true, totals))

# keep keywords with more than 20 tweets whose target share is below .34 or above .66
red_flag_keywords = [entry for entry in keywords if (entry[1] < .34 or entry[1] > .66) and entry[2] > 20]

print(red_flag_keywords)
red_flag_keywords = [entry[0] for entry in red_flag_keywords]

[('explode', 0.07894736842105263, 38), ('survived', 0.30303030303030304, 33), ('twister', 0.125, 40), ('typhoon', 0.9736842105263158, 38), ('harm', 0.0975609756097561, 41), ('outbreak', 0.975, 40), ('evacuation', 0.7777777777777778, 36), ('debris', 1.0, 37), ('collide', 0.08823529411764706, 34), ('bleeding', 0.14285714285714285, 35), ('attack', 0.6944444444444444, 36), ('stretcher', 0.09090909090909091, 33), ('screams', 0.14285714285714285, 35), ('mudslide', 0.32432432432432434, 37), ('injured', 0.6857142857142857, 35), ('massacre', 0.6944444444444444, 36), ('collision', 0.7435897435897436, 39), ('battle', 0.19230769230769232, 26), ('drown', 0.09375, 32), ('upheaval', 0.15789473684210525, 38), ('detonation', 0.28125, 32), ('bush%20fires', 0.72, 25), ('thunderstorm', 0.7878787878787878, 33), ('obliterated', 0.12903225806451613, 31), ('demolition', 0.2571428571428571, 35), ('wreck', 0.1891891891891892, 37), ('arson', 0.6875, 32), ('drowning', 0.2647058823529412, 34), ('evacuated', 0.8888

In [None]:
# create columns for red flag locations and keywords
# (1 when the tweet's location / keyword equals the red flag value, otherwise 0)
#
# Each column is computed as a whole-column comparison and assigned once. The earlier
# per-cell statements used `==` instead of `=`, so every indicator stayed 0.

for location in red_flag_locations:
  data[location] = (data['location'] == location).astype(int)

for keyword in red_flag_keywords:
  data[keyword] = (data['keyword'] == keyword).astype(int)

Preprocess the data

In [None]:
# run every tweet through the tweet preprocessor's tokenize() and store the result in place

data['text'] = data['text'].map(p.tokenize)

In [None]:
# define pretrained tokenizer and model

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# train/validation split (80/20)
# random_state pins the shuffle, so the validation set -- and the metrics computed on
# it during training -- are the same on every run

X = list(data["text"])
y = list(data["target"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# tokenize each split, with padding enabled and truncation at 512 tokens
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

# create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from field name to a sequence of per-example values.
            It must contain "input_ids", which defines the dataset length.
        labels: optional sequence of per-example labels. When it is ``None``
            (or empty), items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus "labels" when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check: a bare truth test (`if self.labels:`) raises for
        # numpy arrays or tensors holding more than one element.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

# wrap the tokenized splits together with their labels
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Train the BERT model

In [None]:
# define Trainer parameters
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: a pair ``(pred, labels)``; ``pred`` holds per-class scores along axis 1
            and ``labels`` holds the true classes.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    # predicted class = index of the highest score in each row
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true=labels, y_pred=predicted) for name, scorer in scorers.items()}

# training configuration: evaluate every 500 steps, rank checkpoints by validation F1
# and load the best one back once training ends
training_options = dict(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    learning_rate=8e-05,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
)
args = TrainingArguments(**training_options)

# early stopping with a patience of 3
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# fine-tune the pre-trained model
trainer.train()

# save the fine-tuned model so that later cells can reload it from disk
model_path = "twitter_bert_disaster"
model.save_pretrained(model_path)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 6090
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2286


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.498,0.457869,0.812869,0.775,0.77865,0.776821
1000,0.3981,0.565744,0.8109,0.778309,0.766091,0.772152
1500,0.3531,0.485313,0.822718,0.791733,0.78179,0.78673
2000,0.2276,0.737413,0.824688,0.789969,0.791209,0.790588


***** Running Evaluation *****
  Num examples = 1523
  Batch size = 8
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 8
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 8
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 8
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget

Create Predictions

In [None]:
# generate predictions for tweets in training set
# NOTE(review): X holds every tweet from train.csv, including the ones the model was
# fine-tuned on, so these predictions are presumably more accurate than predictions on
# unseen tweets -- confirm this is acceptable for the models fitted on 'bert_pred' later

# tokenize all tweets and wrap the encodings (no labels are needed for prediction)
X_train = X
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
train_dataset = Dataset(encodings=X_train_tokenized)

# reload the fine-tuned model from disk
model_path = "twitter_bert_disaster"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# a Trainer with default arguments, used here only to run prediction
train_trainer = Trainer(model)

# predict() returns a 3-tuple; only its first element (the raw scores) is used
raw_train_pred = train_trainer.predict(train_dataset)[0]

# predicted class = index of the highest raw score per tweet
data['bert_pred'] = np.argmax(raw_train_pred, axis=1)

loading configuration file twitter_bert_disaster/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.13.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file twitter_bert_disaster/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceCla

Fit additional models on the BERT predictions and the other engineered features

In [None]:
# fit three models from previous predictions + other numeric columns.

# every column except the identifier, the raw text fields and the label is a feature
non_feature_columns = ['id', 'keyword', 'location', 'text', 'target']
features = data.drop(columns=non_feature_columns)
X_train, X_test, y_train, y_test = train_test_split(features, data['target'], test_size=0.33, random_state=42)

# two random forests that differ in maximum depth and seed
rf1 = RandomForestClassifier(max_depth=9, random_state=42).fit(X_train, y_train)
rf2 = RandomForestClassifier(max_depth=8, random_state=0).fit(X_train, y_train)

# logistic regression with default settings; fit() is the last expression of the cell
reg = LogisticRegression()
reg.fit(X_train, y_train)

LogisticRegression()

In [None]:
# load the test tweets into a dataframe

test_data = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# create the '3_consec_letters' column for the test tweets
# (1 when a letter or digit is repeated 4+ times in a row, otherwise 0)

repeat_pattern = re.compile("([a-z\\d])\\1\\1\\1+")
test_data['3_consec_letters'] = [1 if repeat_pattern.search(tweet.lower()) else 0 for tweet in test_data['text']]

In [None]:
# create language tool list --> TAKES AROUND 30 MINUTES
# The result is cached in language_tool2.pickle, so the slow check only runs when no
# cache file exists. Delete the file to force a fresh computation.

try:
  # NOTE: pickle.load can execute arbitrary code -- only load a cache file you created yourself
  with open('language_tool2.pickle', 'rb') as cache_file:
    language_tool = pickle.load(cache_file)
except FileNotFoundError:
  language_tool = [tool.check(p.tokenize(tweet)) for tweet in test_data['text']]
  with open('language_tool2.pickle', 'wb') as cache_file:
    pickle.dump(language_tool, cache_file)

In [None]:
# pickle checkpoint for new language_tool list
# NOTE: pickle.load can execute arbitrary code -- only load a checkpoint you produced yourself

# to (re)create the checkpoint:
#with open('language_tool2.pickle', 'wb') as checkpoint:
#  pickle.dump(language_tool, checkpoint)

# the context manager closes the file handle, which a bare open() inside pickle.load() never did
with open('language_tool2.pickle', 'rb') as checkpoint:
  language_tool = pickle.load(checkpoint)
language_tool[:4]

In [None]:
# add language tool "red flag" indicators to dataframe
# (1 when the tweet has at least one matching language tool result, otherwise 0)
#
# Each column is built as a whole and assigned once. The earlier per-cell statements
# used `==` (a comparison whose result was discarded), so every flag stayed 0.

for rule_id in red_flag_ids:
  test_data[rule_id] = [
    int(any(match.ruleId == rule_id for match in language_tool[i]))
    for i in range(test_data.shape[0])
  ]

test_data['TYPOGRAPHICAL'] = [
  int(any(
    match.ruleIssueType == 'typographical' and match.ruleId != 'UPPERCASE_SENTENCE_START'
    for match in language_tool[i]
  ))
  for i in range(test_data.shape[0])
]

In [None]:
# add location & keyword "red flag" indicators to dataframe
# (1 when the tweet's location / keyword equals the red flag value, otherwise 0)
#
# Each column is computed as a whole-column comparison and assigned once. The earlier
# per-cell statements used `==` instead of `=`, so every indicator stayed 0.

for location in red_flag_locations:
  test_data[location] = (test_data['location'] == location).astype(int)

for keyword in red_flag_keywords:
  test_data[keyword] = (test_data['keyword'] == keyword).astype(int)

In [None]:
# run every test tweet through the tweet preprocessor's tokenize() and store the result in place

test_data['text'] = test_data['text'].map(p.tokenize)

In [None]:
# prepare data for fine-tuned bert predictions

# tokenize the test tweets and wrap the encodings (no labels are needed for prediction)
X_test = list(test_data["text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
test_dataset = Dataset(encodings=X_test_tokenized)

# reload the fine-tuned model from disk
model_path = "twitter_bert_disaster"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# a Trainer with default arguments, used here only to run prediction
test_trainer = Trainer(model)

# predict() returns a 3-tuple; only its first element (the raw scores) is used
raw_pred = test_trainer.predict(test_dataset)[0]

# predicted class = index of the highest raw score per tweet
test_data['bert_pred'] = np.argmax(raw_pred, axis=1)

In [None]:
# make final predictions from bagging three models fit to the bert predictions and other columns

# Drop the non-feature columns. 'target' is only created at the end of this cell, so on
# a fresh run it may not exist yet; errors='ignore' avoids the KeyError in that case and
# still removes the column when the cell is re-run.
Xtest = test_data.drop(columns=['id', 'keyword', 'location', 'text', 'target'], errors='ignore')

# majority vote: average the three models' predictions and round to the nearest class
votes = (rf1.predict(Xtest) + rf2.predict(Xtest) + reg.predict(Xtest)) / 3
test_data['target'] = [int(round(x)) for x in list(votes)]

In [None]:
# write the Kaggle submission file: id and predicted target, without the index column

submission = test_data[['id', 'target']]
submission.to_csv('twitter_disaster_predictions.csv', index=False)