In [None]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

# Check CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = [json.loads(line) for line in file]
    headlines = [item['headline'] for item in data]
    labels = [item['is_sarcastic'] for item in data]
    return headlines, labels

# Preprocess data for LSTM
def preprocess_data(headlines, labels, max_num_words=20000, max_sequence_length=30):
    tokenizer = Tokenizer(num_words=max_num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(headlines)
    sequences = tokenizer.texts_to_sequences(headlines)
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')
    return np.array(padded_sequences), np.array(labels), tokenizer

# LSTM Dataset class
class SarcasmDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Build LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=64, output_dim=1):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        output = self.fc(hidden[-1])
        return self.sigmoid(output)

# Train LSTM
def train_lstm_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(data).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, labels in val_loader:
                data, labels = data.to(device), labels.to(device)
                outputs = model(data).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}")

# Fine-tune BERT
def fine_tune_bert(train_texts, train_labels, val_texts, val_labels, max_length=30, batch_size=16, epochs=3):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

    train_dataset = SarcasmDataset(train_encodings['input_ids'], train_labels)
    val_dataset = SarcasmDataset(val_encodings['input_ids'], val_labels)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = optim.AdamW(model.parameters(), lr=5e-5)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for data, labels in train_loader:
            data, labels = data.to(device), labels.to(device, dtype=torch.long)
            outputs = model(input_ids=data, labels=labels)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, labels in val_loader:
                data, labels = data.to(device), labels.to(device, dtype=torch.long)
                outputs = model(input_ids=data, labels=labels)
                val_loss += outputs.loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}")

    return model

# Main function
if __name__ == "__main__":
    # Load and preprocess data
    file_path = "Sarcasm_Headlines_Dataset.json"
    headlines, labels = load_data(file_path)
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(headlines, labels, test_size=0.2, random_state=42)
    val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

    # Preprocess for LSTM
    train_data, train_labels, tokenizer = preprocess_data(train_texts, train_labels)
    val_data, val_labels, _ = preprocess_data(val_texts, val_labels)

    train_dataset = SarcasmDataset(train_data, train_labels)
    val_dataset = SarcasmDataset(val_data, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    # Train LSTM
    vocab_size = len(tokenizer.word_index) + 1
    lstm_model = LSTMModel(vocab_size).to(device)
    train_lstm_model(lstm_model, train_loader, val_loader)

    # Fine-tune BERT
    bert_model = fine_tune_bert(train_texts, train_labels, val_texts, val_labels)

    # Save models
    torch.save(lstm_model.state_dict(), "lstm_model.pth")
    bert_model.save_pretrained("bert_model")


Using device: cuda
Epoch 1/10, Train Loss: 0.6277038758027487, Val Loss: 0.6802504850758446
Epoch 2/10, Train Loss: 0.43485903723279856, Val Loss: 0.9730860312779744
Epoch 3/10, Train Loss: 0.2988508968150982, Val Loss: 0.9646177874671088
Epoch 4/10, Train Loss: 0.20401031925088034, Val Loss: 1.2026039173205694
Epoch 5/10, Train Loss: 0.13524196227211194, Val Loss: 1.299395184384452
Epoch 6/10, Train Loss: 0.09014942599642585, Val Loss: 1.6374515374501546
Epoch 7/10, Train Loss: 0.06087037356174941, Val Loss: 1.8656105001767476
Epoch 8/10, Train Loss: 0.051734759807573666, Val Loss: 1.862462208006117
Epoch 9/10, Train Loss: 0.035505242419738695, Val Loss: 2.101059687137604
Epoch 10/10, Train Loss: 0.026297169066829003, Val Loss: 2.145445545514425


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.data = torch.tensor(data, dtype=torch.long)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1/3, Train Loss: 0.2664728472692662, Val Loss: 0.18822716510537282
Epoch 2/3, Train Loss: 0.11072215254480054, Val Loss: 0.23038433155294655
Epoch 3/3, Train Loss: 0.052521710382633084, Val Loss: 0.2896714705044492


## Evaluation


In [3]:
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer
from torch.utils.data import DataLoader


def evaluate_lstm_model(model, test_data, test_labels):
    model.eval()  # 设置为评估模式
    predictions = []
    with torch.no_grad():  # 禁用梯度计算
        for i in range(len(test_data)):
            input_data = torch.tensor(test_data[i]).unsqueeze(0).to(device)  # 增加 batch 维度
            output = model(input_data).squeeze().cpu().numpy()
            predictions.append(1 if output >= 0.5 else 0)  # 概率 > 0.5 则为正类

    accuracy = accuracy_score(test_labels, predictions)
    print(f"LSTM Model Accuracy: {accuracy:.4f}")
    return accuracy


def evaluate_bert_model(model, test_texts, test_labels, batch_size=16):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=30, return_tensors="pt")
    test_dataset = SarcasmDataset(test_encodings['input_ids'], test_labels)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model.eval()  # 设置为评估模式
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(input_ids=inputs).logits
            preds = torch.argmax(outputs, axis=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"BERT Model Accuracy: {accuracy:.4f}")
    return accuracy

# 使用测试集评估 LSTM
test_data, test_labels, _ = preprocess_data(test_texts, test_labels)
evaluate_lstm_model(lstm_model, test_data, test_labels)

# 使用测试集评估 BERT
evaluate_bert_model(bert_model, test_texts, test_labels)





LSTM Model Accuracy: 0.5989


  self.data = torch.tensor(data, dtype=torch.long)


BERT Model Accuracy: 0.9329


0.9329140461215933

### Improve BERT Model

In [4]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = [json.loads(line) for line in file]
    headlines = [item['headline'] for item in data]
    labels = [item['is_sarcastic'] for item in data]
    return headlines, labels

# Dataset class for BERT
class SarcasmDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Train BERT model
def train_bert_model(train_texts, train_labels, val_texts, val_labels, max_length=30, batch_size=16, epochs=5):
    # Tokenizer and encoding
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

    # Dataset and DataLoader
    train_dataset = SarcasmDataset(train_encodings, train_labels)
    val_dataset = SarcasmDataset(val_encodings, val_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Model initialization
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Training loop
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
                labels = batch['labels'].to(device)
                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, Val Loss: {val_loss / len(val_loader):.4f}")

    return model, tokenizer

# Evaluate BERT model
def evaluate_bert_model(model, tokenizer, test_texts, test_labels, max_length=30, batch_size=16):
    # Tokenizer and encoding
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    test_dataset = SarcasmDataset(test_encodings, test_labels)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Evaluation
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs).logits
            preds = torch.argmax(outputs, axis=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    cm = confusion_matrix(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    print(f"BERT Model Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", report)
    return accuracy, cm, report

# Main function
if __name__ == "__main__":
    # Load and preprocess data
    file_path = "Sarcasm_Headlines_Dataset.json"
    headlines, labels = load_data(file_path)
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(headlines, labels, test_size=0.2, random_state=42)
    val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

    # Train BERT model
    print("Training BERT Model...")
    bert_model, tokenizer = train_bert_model(train_texts, train_labels, val_texts, val_labels)

    # Save model
    bert_model.save_pretrained("optimized_bert_model")
    tokenizer.save_pretrained("optimized_bert_model")

    # Evaluate BERT model
    print("Evaluating BERT Model...")
    evaluate_bert_model(bert_model, tokenizer, test_texts, test_labels)


Using device: cuda
Training BERT Model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/5, Train Loss: 0.2579, Val Loss: 0.2013
Epoch 2/5, Train Loss: 0.1143, Val Loss: 0.2318
Epoch 3/5, Train Loss: 0.0535, Val Loss: 0.2468
Epoch 4/5, Train Loss: 0.0391, Val Loss: 0.2473
Epoch 5/5, Train Loss: 0.0317, Val Loss: 0.3045
Evaluating BERT Model...
BERT Model Accuracy: 0.9210
Confusion Matrix:
 [[1360  123]
 [ 103 1276]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.92      1483
           1       0.91      0.93      0.92      1379

    accuracy                           0.92      2862
   macro avg       0.92      0.92      0.92      2862
weighted avg       0.92      0.92      0.92      2862



In [7]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# 检查设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 加载模型和分词器
def load_model_and_tokenizer(model_path="optimized_bert_model"):
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path).to(device)
    model.eval()  # 设置为评估模式
    return model, tokenizer

# 分析输入文本
def analyze_text(text, model, tokenizer, max_length=30):
    # 文本分词和编码
    inputs = tokenizer(text, truncation=True, padding=True, max_length=max_length, return_tensors="pt").to(device)

    # 模型推理
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, axis=1).item()  # 获取类别

    # 返回分析结果
    return "Sarcastic" if prediction == 1 else "Not Sarcastic"

# 主功能
if __name__ == "__main__":
    # 加载模型和分词器
    model_path = "optimized_bert_model"  # 确保该路径存在并包含训练好的模型
    model, tokenizer = load_model_and_tokenizer(model_path)

    # 输入循环
    print("Enter a sentence to analyze (type 'exit' to quit):")
    while True:
        user_input = input("Input: ")
        if user_input.lower() == "exit":
            print("Exiting...")
            break

        result = analyze_text(user_input, model, tokenizer)
        print(f"Analysis: {result}")


Using device: cuda
Enter a sentence to analyze (type 'exit' to quit):
Analysis: Not Sarcastic
Analysis: Not Sarcastic
Analysis: Not Sarcastic
Analysis: Not Sarcastic
Analysis: Not Sarcastic
Analysis: Sarcastic
Analysis: Sarcastic
Analysis: Sarcastic
Analysis: Not Sarcastic
Analysis: Not Sarcastic
Analysis: Sarcastic
Exiting...
