In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cworthy/CT24_checkworthy_english_dev.csv
/kaggle/input/cworthy/CT24_checkworthy_english_train.csv
/kaggle/input/cworthy/CT24_checkworthy_english_dev-test.csv
/kaggle/input/cworthy/CT24_checkworthy_english/CT24_checkworthy_english/CT24_checkworthy_english_dev-test.tsv
/kaggle/input/cworthy/CT24_checkworthy_english/CT24_checkworthy_english/CT24_checkworthy_english_train.tsv
/kaggle/input/cworthy/CT24_checkworthy_english/CT24_checkworthy_english/CT24_checkworthy_english_dev.tsv
/kaggle/input/cworthytest/CT24_checkworthy_english_test.csv
/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_english_dev.csv
/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_english_train.csv
/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_english_dev-test.csv
/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_english/CT24_checkworthy_english_dev-test.tsv
/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_engli

In [33]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
import pandas as pd

# The tokenizer and the classifier are both created from the same checkpoint.
checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
# num_labels=2 requests a two-class classification head.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Training split.
train_df = pd.read_csv(
    '/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_english_train.csv'
)

# Validation data: the dev and dev-test files stacked under a fresh index.
val_df1 = pd.read_csv(
    '/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_english_dev.csv'
)
val_df2 = pd.read_csv(
    '/kaggle/input/cworthytest/CT24_checkworthy_english/CT24_checkworthy_english_dev-test.csv'
)
val_df = pd.concat([val_df1, val_df2], ignore_index=True)

# Test split.
test_df = pd.read_csv(
    '/kaggle/input/cworthytest/CT24_checkworthy_test_gold/english/CT24_checkworthy_english_test_gold.csv'
)

# Display the first rows of the test frame as the cell output.
test_df.head()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,Sentence_id,Text,class_label
0,35988,They said they were just going to get inspecto...,Yes
1,35991,"And from that point on, I've voted to -- I mov...",Yes
2,36029,I sit on the Senate Armed Services Committee.,No
3,36043,We need to depend on all of our tools -- diplo...,No
4,36063,And that is -- and I don't know if my colleagu...,No


In [None]:
# Preprocess the data
def preprocess_data(df, max_length=128, text_tokenizer=None):
    """Tokenize the ``Text`` column and encode ``class_label`` as integers.

    Args:
        df: DataFrame with a ``Text`` column (one sentence per row) and a
            ``class_label`` column whose values are ``'Yes'`` or ``'No'``.
        max_length: Maximum number of token ids kept per sentence, passed to
            the tokenizer together with ``truncation=True``.
        text_tokenizer: Object exposing ``encode(...)`` and, optionally, a
            ``pad_token_id`` attribute. When omitted, the notebook-level
            ``tokenizer`` is used.

    Returns:
        A tuple ``(input_ids, labels)``. ``input_ids`` is a ``torch.long``
        tensor of shape ``(len(df), longest_encoded_sentence)``, right-padded
        with the tokenizer's pad id (0 when the tokenizer does not define
        one). ``labels`` is a ``torch.long`` tensor holding 1 for ``'Yes'``
        and 0 for ``'No'``.

    Raises:
        KeyError: If ``class_label`` contains a value other than
            ``'Yes'`` / ``'No'``.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    label_map = {'Yes': 1, 'No': 0}

    # Validate labels up front so the error names the offending values.
    raw_labels = df['class_label'].tolist()
    unexpected = sorted({repr(value) for value in raw_labels if value not in label_map})
    if unexpected:
        raise KeyError(f"Unexpected class_label value(s): {', '.join(unexpected)}")
    labels = [label_map[value] for value in raw_labels]

    encoded = [
        tok.encode(sent, add_special_tokens=True, max_length=max_length, truncation=True)
        for sent in df['Text'].tolist()
    ]

    # Pad with the tokenizer's own pad id; fall back to 0 if it has none.
    pad_id = getattr(tok, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    # default=0 keeps an empty frame from raising inside max().
    max_len = max((len(ids) for ids in encoded), default=0)
    padded = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded]

    # The explicit reshape yields a well-formed (0, 0) tensor for empty input.
    input_ids = torch.tensor(padded, dtype=torch.long).reshape(len(padded), max_len)
    return input_ids, torch.tensor(labels, dtype=torch.long)

batch_size = 32

# Training split: encode, wrap in a dataset, and serve shuffled mini-batches.
train_inputs, train_labels = preprocess_data(train_df)
train_data = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

# Validation split: same pipeline, without shuffling.
val_inputs, val_labels = preprocess_data(val_df)
val_data = TensorDataset(val_inputs, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

# Test split: same pipeline, without shuffling.
test_inputs, test_labels = preprocess_data(test_df)
test_data = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Optimizer plus a linear schedule with zero warm-up steps; the schedule is
# sized for one scheduler step per training batch over all epochs.
epochs = 10
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Train the model
model.train()

# Batches follow whatever device the model currently lives on.
device = next(model.parameters()).device
# Sequences were right-padded during preprocessing; without an attention mask
# the model would attend to those padding positions as if they were real
# tokens. The mask is rebuilt from the pad id (0 if the tokenizer has none).
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        batch_input_ids, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_labels = batch_labels.to(device)
        # 1 for real tokens, 0 for padding.
        attention_mask = (batch_input_ids != pad_token_id).long()

        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The schedule was sized per batch, so it advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}')

# Evaluate on validation set
model.eval()

# Inputs follow the model's current device, and padding positions are masked
# out so they do not take part in attention (pad id falls back to 0).
device = next(model.parameters()).device
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

val_predictions = []
val_true_labels = []
for batch in val_dataloader:
    batch_input_ids, batch_labels = batch
    model_inputs = batch_input_ids.to(device)
    attention_mask = (model_inputs != pad_token_id).long()
    with torch.no_grad():
        outputs = model(model_inputs, attention_mask=attention_mask)
    logits = outputs.logits
    # The predicted class is the index of the largest logit in each row.
    predictions = torch.argmax(logits, dim=1).tolist()
    val_predictions.extend(predictions)
    val_true_labels.extend(batch_labels.tolist())

val_accuracy = accuracy_score(val_true_labels, val_predictions)
print(f'Validation Accuracy: {val_accuracy:.4f}')

# Test the final model on test set
# Set inference mode here too, so this step does not rely on an earlier call.
model.eval()

# Inputs follow the model's current device, and padding positions are masked
# out so they do not take part in attention (pad id falls back to 0).
device = next(model.parameters()).device
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

test_predictions = []
test_true_labels = []
for batch in test_dataloader:
    batch_input_ids, batch_labels = batch
    model_inputs = batch_input_ids.to(device)
    attention_mask = (model_inputs != pad_token_id).long()
    with torch.no_grad():
        outputs = model(model_inputs, attention_mask=attention_mask)
    logits = outputs.logits
    # The predicted class is the index of the largest logit in each row.
    predictions = torch.argmax(logits, dim=1).tolist()
    test_predictions.extend(predictions)
    test_true_labels.extend(batch_labels.tolist())

test_accuracy = accuracy_score(test_true_labels, test_predictions)
print(f'Test Accuracy: {test_accuracy:.4f}')



In [3]:
from sklearn.metrics import f1_score

# Evaluate on validation set
model.eval()

# Inputs follow the model's current device, and padding positions are masked
# out so they do not take part in attention (pad id falls back to 0).
device = next(model.parameters()).device
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

val_predictions = []
val_true_labels = []
for batch in val_dataloader:
    batch_input_ids, batch_labels = batch
    model_inputs = batch_input_ids.to(device)
    attention_mask = (model_inputs != pad_token_id).long()
    with torch.no_grad():
        outputs = model(model_inputs, attention_mask=attention_mask)
    logits = outputs.logits
    # The predicted class is the index of the largest logit in each row.
    predictions = torch.argmax(logits, dim=1).tolist()
    val_predictions.extend(predictions)
    val_true_labels.extend(batch_labels.tolist())

# Compute F1 score for positive class
val_f1_score = f1_score(val_true_labels, val_predictions, pos_label=1)
print(f'Validation F1 Score (Positive Class): {val_f1_score:.4f}')

# Test the final model on test set
# Set inference mode here too, so this step does not rely on an earlier call.
model.eval()

# Inputs follow the model's current device, and padding positions are masked
# out so they do not take part in attention (pad id falls back to 0).
device = next(model.parameters()).device
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

test_predictions = []
test_true_labels = []
for batch in test_dataloader:
    batch_input_ids, batch_labels = batch
    model_inputs = batch_input_ids.to(device)
    attention_mask = (model_inputs != pad_token_id).long()
    with torch.no_grad():
        outputs = model(model_inputs, attention_mask=attention_mask)
    logits = outputs.logits
    # The predicted class is the index of the largest logit in each row.
    predictions = torch.argmax(logits, dim=1).tolist()
    test_predictions.extend(predictions)
    test_true_labels.extend(batch_labels.tolist())

# Compute F1 score for positive class
test_f1_score = f1_score(test_true_labels, test_predictions, pos_label=1)
print(f'Test F1 Score (Positive Class): {test_f1_score:.4f}')

Validation F1 Score (Positive Class): 0.0000
Test F1 Score (Positive Class): 0.0000


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [4]:
# Test the final model on test set
test_predictions = []
test_true_labels = []
test_sentences = []
for batch in test_dataloader:
    batch_input_ids, batch_labels = batch
    with torch.no_grad():
        outputs = model(batch_input_ids)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).tolist()
    test_predictions.extend(predictions)
    test_true_labels.extend(batch_labels.tolist())
    test_sentences.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in batch_input_ids])

# Output class predictions of test set
results_df = pd.DataFrame({'Text': test_sentences, 'True_label': test_true_labels, 'Predicted_label': test_predictions})
results_df['Predicted_label'] = results_df['Predicted_label'].apply(lambda x: 'Yes' if x == 1 else 'No')

In [18]:
# Preview the first rows of the prediction table.
results_df.head()

Unnamed: 0,Text,True_label,Predicted_label
0,They said they were just going to get inspecto...,0,No
1,"And from that point on, I've voted to -- I mov...",0,No
2,I sit on the Senate Armed Services Committee.,0,No
3,We need to depend on all of our tools -- diplo...,0,No
4,And that is -- and I don't know if my colleagu...,0,No


In [21]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,Sentence_id,Text,class_label
0,35988,They said they were just going to get inspecto...,No
1,35991,"And from that point on, I've voted to -- I mov...",No
2,36029,I sit on the Senate Armed Services Committee.,No
3,36043,We need to depend on all of our tools -- diplo...,No
4,36063,And that is -- and I don't know if my colleagu...,No


In [23]:
# Overwrite the 'Text' column with the sentence identifiers from test_df.
# The id column is resolved by its whitespace-stripped name, so the lookup
# works whether or not the CSV header has stray spaces around 'Sentence_id'.
# The original key (' Sentence_id') is preferred when present.
id_candidates = [col for col in test_df.columns if str(col).strip() == 'Sentence_id']
if not id_candidates:
    raise KeyError("test_df has no 'Sentence_id' column")
id_column = ' Sentence_id' if ' Sentence_id' in id_candidates else id_candidates[0]

# Series assignment aligns on the index of the two frames.
results_df['Text'] = test_df[id_column]

results_df.head()

Unnamed: 0,Text,True_label,Predicted_label
0,35988,0,No
1,35991,0,No
2,36029,0,No
3,36043,0,No
4,36063,0,No


In [24]:
# Remove the ground-truth column from the prediction table.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
results_df = results_df.drop(columns=['True_label'], errors='ignore')
results_df.head()

Unnamed: 0,Text,Predicted_label
0,35988,No
1,35991,No
2,36029,No
3,36043,No
4,36063,No


In [25]:
# Rename both columns in one non-mutating call: predictions become
# 'class_label' and the identifier column becomes 'id'.
results_df = results_df.rename(columns={'Predicted_label': 'class_label', 'Text': 'id'})
results_df.head()

Unnamed: 0,Text,class_label
0,35988,No
1,35991,No
2,36029,No
3,36043,No
4,36063,No


In [26]:
# Tag every row with the same run identifier.
results_df.loc[:, 'run_id'] = 'Model_1'
results_df.head()

Unnamed: 0,Text,class_label,run_id
0,35988,No,Model_1
1,35991,No,Model_1
2,36029,No,Model_1
3,36043,No,Model_1
4,36063,No,Model_1


In [28]:
# Write the prediction table twice, without the index column: once
# comma-separated and once tab-separated.
for out_path, delimiter in (
    ('/kaggle/working/task1_english.csv', ','),
    ('/kaggle/working/task1_english.tsv', '\t'),
):
    results_df.to_csv(out_path, sep=delimiter, index=False)