In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [4]:
train_df["cleantext"]=train_df['text'].str.replace(r'http\S+|www.\S+', '', regex=True)
train_df["cleantext"] = train_df['text'].str.replace(r'@\w+|#\w+', '', regex=True)
train_df["cleantext"] = train_df['text'].str.replace(r'[^\w\s]', '', regex=True)

In [5]:
X = train_df['cleantext'].tolist()
y=train_df["target"].tolist()

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

In [7]:
X_test=test_df["text"].tolist()

In [8]:
BATCH_SIZE = 32
EPOCHS = 5

In [9]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [10]:



preset = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(preset)
model = BertForSequenceClassification.from_pretrained(preset, num_labels=2)


def tokenize_data(texts, tokenizer, max_length=160):
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')


train_encodings = tokenize_data(X_train, tokenizer)
val_encodings = tokenize_data(X_val, tokenizer)


train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(y_val))


train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)


optimizer = AdamW(model.parameters(), lr=1e-5)


def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(train_loader)

def evaluate(model, val_loader, device):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)
    accuracy = total_correct / total_samples
    return accuracy


def early_stopping(eval_accuracies, patience=2):
    if len(eval_accuracies) < patience + 1:
        return False
    return eval_accuracies[-1] <= min(eval_accuracies[-patience:])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = EPOCHS
best_val_accuracy = 0
eval_accuracies = []

for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, device)
    val_accuracy = evaluate(model, val_loader, device)
    eval_accuracies.append(val_accuracy)
    
    print(f"Epoch {epoch + 1}/{epochs} | Training Loss: {train_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")
    
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_bert_model.pth')  # Save best model
        
    if early_stopping(eval_accuracies):
        print("Early stopping triggered.")
        break




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 | Training Loss: 0.4945 | Validation Accuracy: 0.8283
Epoch 2/5 | Training Loss: 0.3691 | Validation Accuracy: 0.8367
Epoch 3/5 | Training Loss: 0.3016 | Validation Accuracy: 0.8351
Early stopping triggered.


In [11]:
if os.path.exists('best_bert_model.pth'):
    print("Model saved successfully.")
else:
    print("Model not found.")

Model saved successfully.


In [12]:
model.load_state_dict(torch.load('best_bert_model.pth'))
model.eval()  
print("Model loaded successfully.")

  model.load_state_dict(torch.load('best_bert_model.pth'))


Model loaded successfully.


In [13]:
import os
for dirname, _, filenames in os.walk('/kaggle'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/lib/kaggle/gcp.py
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/working/best_bert_model.pth


In [14]:
from torch.utils.data import Dataset, DataLoader

class TweetDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return len(self.encodings['input_ids'])
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    

In [15]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


X_test_encodings = tokenizer(list(X_test), padding=True, truncation=True, max_length=160, return_tensors="pt")


In [16]:
test_dataset = TweetDataset(X_test_encodings)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [17]:

model.eval()

all_predictions = []


with torch.no_grad():
    for batch in test_dataloader:
  
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
      
        outputs = model(input_ids, attention_mask=attention_mask)
        
 
        predictions = torch.argmax(outputs.logits, dim=1)
        
       
        all_predictions.extend(predictions.cpu().numpy())


sample_submission = pd.DataFrame({'id': test_df['id'], 'target': all_predictions})
sample_submission.to_csv("submission.csv", index=False)
print("Your submission was successfully saved!")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Your submission was successfully saved!


In [18]:
X_test2='bnkoksoxssx'

In [19]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

def predict_tweet(tweet, model_path='best_bert_model.pth', max_length=160):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path))
    model.eval()

 
    inputs = tokenizer(tweet, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        logits = model(**inputs).logits

  
    return "Disaster" if torch.argmax(logits) == 1 else "Not a disaster"




In [27]:
predict_tweet("this photo is disastrous")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))


'Not a disaster'