In [71]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

# Locations of the labelled training split and the unlabelled test split.
train_path, test_path = "data/train.csv", "data/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Show the first ten training rows as the cell output.
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [49]:
# Preview the first ten rows of the test split.
test_df.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [5]:
# Text of the second tweet (position 1) among the rows labelled target == 0.
train_df.loc[train_df["target"] == 0, "text"].iloc[1]

'I love fruits'

In [6]:
# Text of the second tweet (position 1) among the rows labelled target == 1.
train_df.loc[train_df["target"] == 1, "text"].iloc[1]

'Forest fire near La Ronge Sask. Canada'

In [72]:
import re
import string

#Cleanup the strings so we only get important words

#Remove any urls
def remove_url(text):
    """Return `text` with links removed.

    A link is either ``http://`` / ``https://`` or ``www.`` followed by a run
    of non-whitespace characters; the whitespace around it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

#Remove any extra html
def remove_html(text):
    """Return `text` with every ``<...>`` span deleted.

    The match is non-greedy and ``.`` does not cross newlines, so a tag that
    is split over two lines is left untouched.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)

#Remove any emojis
# Compiled once at import time. Each range is an explicit emoji/pictograph
# block. The previous catch-all range U+24C2..U+1F251 is deliberately NOT
# used: it spans most of the Basic Multilingual Plane and therefore also
# deleted CJK ideographs, Hangul, kana and fullwidth punctuation.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F1E0-\U0001F1FF"  # regional-indicator letters (flags)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Only characters in the explicit ranges of ``_EMOJI_PATTERN`` are dropped;
    ordinary non-Latin text (e.g. Chinese, Japanese, Korean) is preserved.

    Args:
        text: Input string.

    Returns:
        The string with every matched run of emoji characters deleted.
    """
    return _EMOJI_PATTERN.sub(r'', text)

#Remove punctuation (this also strips the '#' from hashtags)
def remove_punct(text):
    """Return `text` without any character listed in ``string.punctuation``.

    Only ASCII punctuation is affected; every other character is kept as-is.
    """
    return ''.join(ch for ch in text if ch not in string.punctuation)

def clean_tweet(text):
    """Run the full cleaning pipeline on one tweet and return the result.

    The steps run in a fixed order: URLs, then HTML tags, then emoji, and
    punctuation last.
    """
    for step in (remove_url, remove_html, remove_emoji, remove_punct):
        text = step(text)
    return text

#Only apply this to train_df
# `clean_tweet` is passed directly; wrapping it in a lambda adds nothing.
train_df['text'] = train_df['text'].apply(clean_tweet)

In [65]:
# Preview the first ten rows of train_df to inspect the `text` column.
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1
1,4,,,Forest fire near La Ronge Sask Canada,1
2,5,,,All residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,Just got sent this photo from Ruby Alaska as s...,1
5,8,,,RockyFire Update California Hwy 20 closed in ...,1
6,10,,,flood disaster Heavy rain causes flash floodin...,1
7,13,,,Im on top of the hill and I can see a fire in ...,1
8,14,,,Theres an emergency evacuation happening now i...,1
9,15,,,Im afraid that the tornado is coming to our area,1


In [90]:
# Split the data into train and validation sets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Hold out 30% of the tweets for validation; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train_df['text'],
    train_df['target'],
    test_size=0.3,
    random_state=42,
)

# Convert the text into count vectors. The vocabulary is learned from the
# training split only and then reused unchanged for the held-out split.
vectorizer = CountVectorizer()
x_train_vectors = vectorizer.fit_transform(x_train)
x_test_vectors = vectorizer.transform(x_test)

In [81]:
# Display the repr of the vectorized training matrix.
x_train_vectors

<5329x14210 sparse matrix of type '<class 'numpy.int64'>'
	with 68468 stored elements in Compressed Sparse Row format>

In [82]:
# Display the training labels produced by the split.
y_train

1186    0
4071    1
5461    1
5787    1
7445    0
       ..
5226    0
5390    0
860     0
7603    1
7270    1
Name: target, Length: 5329, dtype: int64

In [88]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline only: a 3-nearest-neighbours classifier on the count vectors.
# `fit` returns the estimator itself, so building and fitting are chained and
# the assignment leaves the cell without any output.
clf = KNeighborsClassifier(n_neighbors=3).fit(x_train_vectors, y_train)

In [96]:
from sklearn.metrics import accuracy_score
# Score the baseline on the held-out split.
y_pred = clf.predict(x_test_vectors)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the fraction of correct predictions as a percentage.
print(f"Accuracy is {accuracy*100:.2f}%")

Accuracy is 69.35%


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [103]:
from torch import nn

class DisasterClassifier(nn.Module):
    """Binary classifier: a caller-supplied encoder followed by a small MLP head.

    The head maps a 768-wide vector to 32 units (ReLU) and then to a single
    sigmoid output, so the forward pass returns values in [0, 1] with shape
    (batch, 1).
    """

    def __init__(self, model):
        """Wrap `model` as the encoder and create the classification head.

        Args:
            model: Encoder module. It is called with ``input_ids`` and
                ``attention_mask`` keyword arguments, and element 0 of its
                result must have a last dimension of 768.
        """
        super().__init__()

        self.bert = model

        # Head layers; created in this order so seeded initialisation is stable.
        self.fc1 = nn.Linear(768, 32)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per input row."""
        encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Keep only position 0 along dim 1 of the first encoder output
        # (presumably the leading <s>/[CLS] token — confirm for the encoder used).
        first_position = encoder_outputs[0][:, 0]

        hidden = self.relu(self.fc1(first_position))
        return self.sigmoid(self.fc2(hidden))

In [104]:
import torch
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_dataloader, val_dataloader, learning_rate, epochs,
          patience=1, checkpoint_path="best_model.pt"):
    """Train `model` with Adam and binary cross-entropy, with early stopping.

    After every epoch the model is evaluated on `val_dataloader`. Whenever the
    summed validation loss reaches a new minimum the whole model object is
    saved to `checkpoint_path`; training stops once `patience` consecutive
    epochs fail to improve on that minimum.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``. Its
            output is fed to ``nn.BCELoss`` against labels reshaped to
            (batch, 1), so it must already be a probability of that shape.
        train_dataloader: Iterable of ``(inputs, labels)`` batches where
            ``inputs`` has ``'input_ids'`` and ``'attention_mask'`` entries.
            Must expose ``.dataset`` (its length is used for accuracy).
        val_dataloader: Same batch format, used for validation.
        learning_rate: Learning rate passed to Adam.
        epochs: Maximum number of epochs to run.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping (expected to be >= 1). The default of 1 stops at the
            first epoch whose validation loss is not a new best.
        checkpoint_path: File the best model is written to with ``torch.save``.

    Returns:
        None. Progress is printed and the best model is written to disk.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model first so the optimizer is built over on-device parameters.
    model = model.to(device)
    criterion = nn.BCELoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    def _run_batch(batch_input, batch_label):
        """Forward one batch; return (loss tensor, number of correct predictions)."""
        # Both tensors get the same squeeze: a dataset that tokenises each
        # sample with a leading dimension of 1 yields (batch, 1, seq) batches,
        # and the mask must end up with the same (batch, seq) shape as the ids.
        input_ids = batch_input['input_ids'].squeeze(1).to(device)
        attention_mask = batch_input['attention_mask'].squeeze(1).to(device)
        target = batch_label.to(device).unsqueeze(1)

        output = model(input_ids, attention_mask)
        loss = criterion(output, target.float())
        correct = ((output >= 0.5).int() == target).sum().item()
        return loss, correct

    best_val_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        model.train()
        for train_input, train_label in tqdm(train_dataloader):
            # Clear gradients up front so stale ones can never leak into a step.
            optimizer.zero_grad()
            loss, correct = _run_batch(train_input, train_label)
            loss.backward()
            optimizer.step()

            total_loss_train += loss.item()
            total_acc_train += correct

        total_acc_val = 0
        total_loss_val = 0

        model.eval()
        with torch.no_grad():
            for val_input, val_label in tqdm(val_dataloader):
                loss, correct = _run_batch(val_input, val_label)
                total_loss_val += loss.item()
                total_acc_val += correct

        # Losses are averaged per batch, accuracies per sample.
        print(f'Epochs: {epoch + 1} '
              f'| Train Loss: {total_loss_train / len(train_dataloader): .3f} '
              f'| Train Accuracy: {total_acc_train / (len(train_dataloader.dataset)): .3f} '
              f'| Val Loss: {total_loss_val / len(val_dataloader): .3f} '
              f'| Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')

        if total_loss_val < best_val_loss:
            best_val_loss = total_loss_val
            torch.save(model, checkpoint_path)
            print("Saved model")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("Early stopping")
                break

In [128]:
from torch.utils.data import Dataset

class DisasterDataset(Dataset):
    """Tweets tokenised up front, paired with optional labels.

    Every text is passed through ``clean_tweet`` and then tokenised once in
    the constructor (padded/truncated to 150 tokens, returned as PyTorch
    tensors), so ``__getitem__`` is a plain list lookup.
    """

    def __init__(self, text_df, label_df, tokenizer):
        """Clean and tokenise the texts and store the labels.

        Args:
            text_df: Object exposing ``.values.tolist()`` that yields the raw
                tweet strings (e.g. a pandas Series).
            label_df: Object exposing ``.values.tolist()`` that yields one
                label per text, or ``None`` for unlabelled data.
            tokenizer: Callable applied to each cleaned text with
                ``padding``, ``max_length``, ``truncation`` and
                ``return_tensors`` keyword arguments.

        Raises:
            ValueError: If labels are given but their count differs from the
                number of texts.
        """
        texts = [clean_tweet(text) for text in text_df.values.tolist()]

        self.texts = [tokenizer(text, padding='max_length',
                                max_length=150,
                                truncation=True,
                                return_tensors="pt")
                      for text in texts]

        # Labels have to be stored here. This assignment used to sit after the
        # `return` in __getitem__, where it never ran, so every sample was
        # served with the placeholder label -1.
        if label_df is not None:
            labels = label_df.values.tolist()
            if len(labels) != len(self.texts):
                raise ValueError(
                    f"got {len(labels)} labels for {len(self.texts)} texts"
                )
            self.labels = labels

    def __len__(self):
        """Return the number of tokenised texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(tokenised_text, label)``; the label is -1 when unlabelled."""
        label = self.labels[idx] if hasattr(self, 'labels') else -1
        return self.texts[idx], label

In [124]:
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

# Seed torch's and NumPy's global random number generators with a fixed value.
torch.manual_seed(0)
np.random.seed(0)
    
# Checkpoint name used for both the tokenizer and the pretrained encoder, so
# the two always come from the same model.
bert_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
base_model = AutoModel.from_pretrained(bert_name)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenise both splits once, then wrap them in batch loaders.
train_dataset = DisasterDataset(x_train, y_train, tokenizer)
val_dataset = DisasterDataset(x_test, y_test, tokenizer)

# Only the training batches are shuffled; validation keeps its order.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=8, num_workers=0)

# Classification head on top of the pretrained encoder loaded earlier.
model = DisasterClassifier(base_model)

learning_rate = 1e-5
epochs = 5
train(model, train_dataloader, val_dataloader, learning_rate, epochs)

 10%|████▍                                     | 70/667 [09:57<55:41,  5.60s/it]

In [101]:
# Load the submission template; its `target` column is overwritten with
# model predictions before the file is saved.
sample_submission = pd.read_csv("data/sample_submission.csv")

In [102]:
# Build the test features in this cell so it runs on a fresh kernel: the test
# tweets get the same `clean_tweet` preprocessing as the training text and are
# encoded with the vocabulary already fitted on the training split.
test_vectors = vectorizer.transform(test_df["text"].apply(clean_tweet))

# Predictions are assigned by position, so the template rows must line up
# with the test rows one-to-one.
assert sample_submission["id"].tolist() == test_df["id"].tolist(), \
    "sample_submission ids do not match test_df ids"

sample_submission["target"] = clf.predict(test_vectors)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [12]:
# Preview the first rows of the submission frame.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [13]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample_submission.to_csv("submission.csv", index=False)