In [1]:
import pandas as pd 
import numpy as np 


### load the data 

##### without changing the dtypes

In [5]:
# Read both CSV splits, letting pandas pick the column dtypes on its own.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [7]:
# Report the shape and the memory footprint (in MB) of both frames.
# memory_usage() is called without arguments, i.e. with its default settings.
print(
    # for training dataset
    "Training Set Shape = {} ".format(df_train.shape),
    "Training Set Memory Usage = {:.2f} MB".format(df_train.memory_usage().sum() / 1024**2),
    # for test dataset
    "Test Set Shape  = {} ".format(df_test.shape),
    "Test Set Memory Usage = {:.2f} MB".format(df_test.memory_usage().sum() / 1024**2),
    sep="\n",
)



Training Set Shape = (7613, 5) 
Training Set Memory Usage = 0.29 MB
Test Set Shape  = (3263, 4) 
Test Set Memory Usage = 0.10 MB


##### with changed dtypes

In [10]:
# Re-read both splits, this time requesting narrow integer dtypes explicitly
# for the integer columns.
# NOTE(review): int16 tops out at 32767; the ids visible in this notebook's
# output stay below that, but confirm it holds for the complete files.
df_train = pd.read_csv(
    "train.csv",
    dtype={"id": "int16", "target": "int8"},
)
df_test = pd.read_csv(
    "test.csv",
    dtype={"id": "int16"},
)

In [12]:
# Same report as before the dtype change, so the two outputs can be compared:
# shape plus memory footprint (MB, default memory_usage() settings) per frame.
print("\n".join([
    # for training dataset
    "Training Set Shape = {} ".format(df_train.shape),
    "Training Set Memory Usage = {:.2f} MB".format(df_train.memory_usage().sum() / 1024**2),
    # for test dataset
    "Test Set Shape  = {} ".format(df_test.shape),
    "Test Set Memory Usage = {:.2f} MB".format(df_test.memory_usage().sum() / 1024**2),
]))



Training Set Shape = (7613, 5) 
Training Set Memory Usage = 0.20 MB
Test Set Shape  = (3263, 4) 
Test Set Memory Usage = 0.08 MB


##### data cleaning 
* We drop every row that contains a NaN value and keep only the first 1000 rows of the training dataset.

In [15]:
# Discard every row that has at least one missing value, in both splits.
df_train, df_test = (frame.dropna() for frame in (df_train, df_test))

In [17]:
# Keep only the first 1000 rows (all columns) of the cleaned training frame.
df_train = df_train.head(1000)

In [19]:
# Shape and memory footprint (MB) of the reduced training frame.
print(
    "Training Set Shape = {} ".format(df_train.shape),
    "Training Set Memory Usage = {:.2f} MB".format(df_train.memory_usage().sum() / 1024**2),
    sep="\n",
)


Training Set Shape = (1000, 5) 
Training Set Memory Usage = 0.03 MB


In [5]:
# How many rows fall into each class of the `target` column.
df_train["target"].value_counts()

target
0    607
1    393
Name: count, dtype: int64

In [6]:
# Display the test frame as it stands after the NaN rows were dropped; the
# rendered table below is abbreviated (an ellipsis row separates head and tail).
df_test

Unnamed: 0,id,keyword,location,text
15,46,ablaze,London,Birmingham Wholesale Market is ablaze BBC News...
16,47,ablaze,Niall's place | SAF 12 SQUAD |,@sunkxssedharry will you wear shorts for race ...
17,51,ablaze,NIGERIA,#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriag...
18,58,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...
19,60,ablaze,"Los Angeles, Califnordia",PSA: IÛªm splitting my personalities.\r\n\r\n...
...,...,...,...,...
3246,10804,wrecked,Love Reiss,@yakubOObs think he deactivated because his no...
3247,10806,wrecked,Seattle Washington,RT CNBC '3 words from Disney CEO Bob Iger wrec...
3248,10807,wrecked,Acey mountain islanddåÇTorontoåÈ,Smackdown tyme this should put me in a good mo...
3249,10816,wrecked,los angeles,@thrillhho jsyk I haven't stopped thinking abt...


In [7]:
# Model input (raw tweet text) and the class label to predict.
text, labels = df_train["text"], df_train["target"]

In [8]:
import torch 
# AdamW comes from PyTorch: the copy that used to live in `transformers` is
# deprecated and can no longer be imported from recent transformers releases.
from torch.optim import AdamW
from torch.utils.data import DataLoader , Dataset 
from transformers import BertTokenizer , BertForSequenceClassification 
from sklearn.metrics import accuracy_score

In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [10]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: series of raw strings; must support ``reset_index(drop=True)``.
        labels: series of integer class labels, aligned row by row with ``texts``.
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_len: length passed to the tokenizer as ``max_length``; examples are
            padded to it and truncated at it.
    """

    def __init__(self , texts , labels , tokenizer , max_len):
        # Renumber both series from 0 so that the integer lookups in
        # __getitem__ line up with 0 .. len-1 regardless of the incoming index.
        self.texts, self.labels = (
            series.reset_index(drop = True) for series in (texts, labels)
        )
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index) -> dict:
        """Tokenize example ``index`` and return it as a dict of tensors.

        Returns:
            ``{"input_ids", "attention_mask", "labels"}`` where the first two are
            the tokenizer outputs flattened to 1-D and ``labels`` is a scalar
            ``torch.long`` tensor.
        """
        encoded = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens = True,
            max_length = self.max_len,
            return_token_type_ids = False,
            padding = "max_length",
            truncation = True,
            return_tensors = "pt",
        )

        # The tokenizer returns a leading batch axis of size 1; flatten it away
        # so the DataLoader can stack items into (batch, seq) itself.
        item = {
            field: encoded[field].flatten()
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[index], dtype = torch.long)
        return item

In [11]:
# Length handed to TextDataset: every example is padded/truncated to it.
max_len = 256
# Tokenizer loaded from the same "bert-base-uncased" checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



In [12]:
# Split the dataset 
from sklearn.model_selection import train_test_split 
train_texts , val_texts , train_labels , val_labels = train_test_split(text , labels , random_state = 42 , stratify = df_train["target"])

# Creating dataloader instance 
train_dataset = TextDataset(train_texts , train_labels , tokenizer , max_len)
val_dataset = TextDataset(val_texts , val_labels , tokenizer , max_len)

# Creating loader object 
train_loader = DataLoader(train_dataset , batch_size = 16 , shuffle = True)
val_loader = DataLoader(val_dataset , batch_size = 16 , shuffle = False)

In [13]:
# Initializing Bert Model 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased" , num_labels = 2)
optimizer = AdamW(model.parameters() , lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Pick the GPU when one is visible, otherwise the CPU, and move the model's
# parameters onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def train_epoch(model,data_loader,optimizer,device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: callable as ``model(input_ids, attention_mask=..., labels=...)``
            returning an object with a ``.loss`` tensor.
        data_loader: sized iterable of dict batches with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device every batch tensor is moved to before the forward pass.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()  # training mode (e.g. dropout active)
    batch_losses = []

    for batch in data_loader:
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()
        batch_loss = model(token_ids, attention_mask=mask, labels=targets).loss
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(data_loader)

def evaluate(model, data_loader, device):
    """Return the accuracy of ``model`` over every batch in ``data_loader``.

    Args:
        model: callable as ``model(input_ids, attention_mask=...)`` returning an
            object with a ``.logits`` tensor whose dim 1 indexes the classes.
        data_loader: iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``accuracy_score`` of the collected true labels against the arg-max
        predictions.
    """
    model.eval()  # inference mode (e.g. dropout off)
    predicted, expected = [], []

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            batch_logits = model(token_ids, attention_mask=mask).logits
            # Highest-scoring class per row, brought back to the CPU as numpy.
            predicted += list(batch_logits.argmax(dim=1).cpu().numpy())
            expected += list(gold.cpu().numpy())

    return accuracy_score(expected, predicted)

        

In [15]:
# Fine-tune for a fixed number of passes over train_loader. After each pass the
# model is scored on val_loader and both numbers are logged, so the trend of
# training loss versus validation accuracy can be read off the output.
num_epochs = 10 
for epoch in range(num_epochs):
    # One optimisation pass; returns the mean batch loss of that pass.
    train_loss = train_epoch(model , train_loader , optimizer , device)
    # Accuracy on the held-out validation batches.
    val_accuracy = evaluate(model , val_loader , device)
    # `epoch` is 0-based, hence the + 1 in the log line.
    print(f"Epoch {epoch + 1} , Loss: {train_loss}, Validation Accuracy:{val_accuracy}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1 , Loss: 0.5901380542744982, Validation Accuracy:0.808
Epoch 2 , Loss: 0.39681873771738496, Validation Accuracy:0.776
Epoch 3 , Loss: 0.29691163307808816, Validation Accuracy:0.804
Epoch 4 , Loss: 0.17879277523210707, Validation Accuracy:0.844
Epoch 5 , Loss: 0.10560117455873083, Validation Accuracy:0.828
Epoch 6 , Loss: 0.06969476727015794, Validation Accuracy:0.784
Epoch 7 , Loss: 0.05836831020349835, Validation Accuracy:0.84
Epoch 8 , Loss: 0.06642380566831599, Validation Accuracy:0.772
Epoch 9 , Loss: 0.03355034736004916, Validation Accuracy:0.816
Epoch 10 , Loss: 0.0247619683913728, Validation Accuracy:0.836


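* `Validation Accuracy:0.836` after the final epoch is not bad. Note, however, that it fluctuates between 0.772 and 0.844 across epochs while the training loss keeps falling, which suggests the model starts to overfit the small training subset after a few epochs.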

In [40]:
# Display the module tree of the BertForSequenceClassification model trained above.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e