In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
df_train['text'] = df_train['text'].apply(lambda x: str(x).lower())
df_test['text'] = df_test['text'].apply(lambda x: str(x).lower())
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,"heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,apocalypse lighting. #spokane #wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df_train['text'], df_train['target'], test_size=0.1) # Split the data into val and test
X_test = df_test['text']
X_train.shape, y_train.shape

((6851,), (6851,))

Sklearn allows to create pipelines of preprocessors and models, which can drastically simplify the workflow, as we won't have to apply preprocessors manually to every input data prior to running rpedictions on it. Now let's create such pipepline. We will use Sklearn SGDClassifier as our model 

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline


clf = Pipeline([
    ('vect', CountVectorizer()), #CountVectorizer creates a scipy.sparse_matrix where it counts the occurence of each word in every word in each text  
    ('tfidf', TfidfTransformer(use_idf=True)), #TfidfTransformer scales the n-grams of words obtained from CountVectorizer according to the length of text
    ('sgd_clf', SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, max_iter=1000, tol=None)) #Main model, used to classify tweets into disasterous or not based on the sparse matrix obtained during preprocessing
    ])

clf.fit(X_train, y_train)

validations = clf.predict(X_val) #Run the claaifier on the validation to assess the performance, as targets for X_test are not given

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(f'Accuracy: {accuracy_score(y_val, validations):.2f}')
print(f'Precision: {precision_score(y_val, validations):.2f}')
print(f'Recall: {recall_score(y_val, validations):.2f}')
print(f'F1-Score: {f1_score(y_val, validations):.2f}')

Accuracy: 0.82
Precision: 0.83
Recall: 0.71
F1-Score: 0.76


Now we will try to use pretrained BERT Classifier using a pretrained model from Hugging Face

In [49]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #Initialize a tokenizer to preprocess tweets so we can feed them into the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) #Download a pretrained model. We will fine-tune it on our tweets

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
X_tokenized = tokenizer(X_train.tolist(), padding=True, truncation=False, max_length=128, return_tensors='pt')
input_ids = X_tokenized['input_ids']
attention_mask = X_tokenized['attention_mask']
y_train_tensor = torch.tensor(y_train.tolist())



In [51]:
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(input_ids, attention_mask, y_train_tensor)
batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [52]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=1e-4, weight_decay=0.01)

#CUDA Support
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = model.to(device)

model.train()
for epoch in range(2):
    for batch in train_loader:
        batch = [item.to(device) for item in batch]
        
        b_input_ids, b_attention_mask, b_y_train_tensor = batch

        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_y_train_tensor)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
    print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

Epoch: 1, Loss: 1.5182080268859863
Epoch: 2, Loss: 0.9327085018157959


In [53]:
#Create a dataset for evaluation
X_val_tokenized = tokenizer(X_val.tolist(), padding=True, truncation=False, max_length=128, return_tensors='pt')
input_ids_val = X_val_tokenized['input_ids']
attention_mask_val = X_val_tokenized['attention_mask']
y_val_tensor = torch.tensor(y_val.tolist())

val_dataset = TensorDataset(input_ids_val, attention_mask_val, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True) 

In [54]:
def evaluate(model, loader: DataLoader, device):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in loader:
        batch = [item.to(device) for item in batch]
        
        b_input_ids, b_attention_mask, b_y_val_tensor = batch

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_y_val_tensor)
        
        logits = outputs.logits
        loss = outputs.loss
        total_eval_loss += loss.item() 

        logits = logits.detach().cpu().numpy() # Get raw model predictions
        y_val_np = b_y_val_tensor.to('cpu').numpy() # Numpy array of correct values

        predictions = np.argmax(logits, axis=1) 
        total_eval_accuracy += np.sum(predictions == y_val_np) # Calculate how many predictions coincide with labels
    avg_loss = total_eval_loss / len(loader) 
    avg_accuracy = total_eval_accuracy / len(loader.dataset) 
    return avg_loss, avg_accuracy

In [55]:
val_loss, val_accuracy = evaluate(model=model, loader=val_loader, device=device)
print(f'Average Accuracy: {val_accuracy}')
print(f'Average Loss: {val_loss}')

Average Accuracy: 0.5879265091863517
Average Loss: 0.6840447584788004
