<a href="https://colab.research.google.com/github/BruvCoder/AI-knows-Christianity/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.optim import Adam

# --- Setup and Data Loading ---
# Seed torch's global RNG before any model or DataLoader is created so that the
# classifier-head initialisation, dropout masks and training-batch shuffling are
# repeatable across "Restart Kernel -> Run All". pandas and scikit-learn below are
# already seeded explicitly through `random_state`.
SEED = 42
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# NOTE: Colab-specific absolute path; adjust when running outside Colab.
df = pd.read_csv('/content/christian_sentences_10000.csv')

# Fail early with a readable message if the CSV does not have the two columns used below.
missing_columns = {'Sentences', 'Label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"CSV is missing required column(s): {sorted(missing_columns)}")

# Shuffle the rows once, deterministically.
data = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# 70% train; the remaining 30% is split in half below -> 15% validation, 15% test.
X_train, X_test, y_train, y_test = train_test_split(np.array(data['Sentences']), np.array(data['Label']), test_size=0.3, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=SEED)

# Pretrained tokenizer and encoder; both are fetched from the Hugging Face Hub on first use.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

# --- Efficient Dataset Class ---
class ChristianDataset(Dataset):
    """Map-style dataset that pairs raw sentences with float labels.

    Tokenization is deferred until an item is requested, so nothing is
    pre-computed or cached at construction time.

    Args:
        X: Indexable collection of sentences.
        y: Indexable collection of labels; its length defines the dataset size.
        tokenizer: Callable invoked as ``tokenizer(sentence, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, X, y, tokenizer, max_length):
        self.X, self.y = X, y
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The label container is the source of truth for the dataset size.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the sample at position ``idx``.

        ``features`` is a dict holding the flattened ``input_ids`` and
        ``attention_mask`` tensors; ``target`` is the label as a float32 tensor.
        """
        sentence, label = self.X[idx], self.y[idx]

        # Encode this single sample on demand.
        tokenized = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten away the leading dimension the tokenizer adds for a single sentence.
        features = {
            key: tokenized[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        target = torch.tensor(label, dtype=torch.float32)
        return features, target

# --- Model Architecture ---
class MyModel(nn.Module):
    """Encoder plus a small feed-forward head for binary classification.

    The head maps the encoder's first-token vector to a single probability via
    ``Linear -> ReLU -> Linear -> Sigmoid``, so the output is suitable for
    ``nn.BCELoss`` (not ``BCEWithLogitsLoss``).

    Args:
        bert: Encoder module. It is called as
            ``bert(input_ids=..., attention_mask=..., return_dict=False)`` and the
            first element of the returned tuple is expected to be shaped
            ``(batch, seq_len, hidden_size)``.
        hidden_size: Width of the encoder's output vectors. Defaults to 768,
            matching the original hard-coded value.
        dropout: Dropout probability applied to the first-token vector before
            the head. Defaults to 0.25.
    """

    def __init__(self, bert, hidden_size=768, dropout=0.25):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask):
        """Return one probability per input row, shaped ``(batch,)``.

        Args:
            input_ids: Token-id tensor shaped ``(batch, seq_len)``.
            attention_mask: Mask tensor with the same shape as ``input_ids``.
        """
        sequence_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)[0]
        # Use the vector at position 0 (the first token) as the sentence representation.
        first_token = sequence_output[:, 0]
        probabilities = self.classifier(self.dropout(first_token))
        # Squeeze only the trailing feature dimension. A bare squeeze() would also
        # drop the batch dimension when batch == 1, producing a 0-dim tensor that no
        # longer matches (1,)-shaped labels in the loss.
        return probabilities.squeeze(-1)

# --- Instantiating and Training ---
max_length = 100   # tokens per sample after padding/truncation
batch_size = 32
training_data = ChristianDataset(X_train, y_train, tokenizer, max_length)
validation_data = ChristianDataset(X_val, y_val, tokenizer, max_length)

training_dataLoader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
validation_dataLoader = DataLoader(validation_data, batch_size=batch_size, shuffle=False) # No need to shuffle validation data

# Instantiate the model with BERT
model = MyModel(bert_model).to(device)

# A lower learning rate is crucial for fine-tuning BERT
optimizer = Adam(model.parameters(), lr=2e-5)
# The model already ends in a Sigmoid, so plain BCELoss (not BCEWithLogitsLoss) is the matching loss.
loss_fn = nn.BCELoss()

epochs = 3 # Start with fewer epochs for fine-tuning

for epoch in range(epochs):
    print(f"--- Epoch {epoch+1} of {epochs} ---")
    model.train()
    for batch_idx, (inputs, labels) in enumerate(training_dataLoader):
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # reshape(-1) keeps the output shaped like `labels` even when the final
        # batch holds a single sample and the model returns a 0-dim tensor.
        outputs = model(input_ids, attention_mask).reshape(-1)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 100 == 0:
            print(f"Batch {batch_idx+1}/{len(training_dataLoader)}, Training Loss: {loss.item():.4f}")

    # Validation loop
    model.eval()
    validation_loss = 0.0
    validation_samples = 0
    with torch.no_grad():
        for inputs, labels in validation_dataLoader:
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask).reshape(-1)
            # BCELoss returns the batch mean; weight it by the batch size so a
            # short final batch does not count as much as a full one.
            samples_in_batch = labels.size(0)
            validation_loss += loss_fn(outputs, labels).item() * samples_in_batch
            validation_samples += samples_in_batch

    # Per-sample average; max(..., 1) guards against an empty validation set.
    avg_val_loss = validation_loss / max(validation_samples, 1)
    print(f"Epoch {epoch+1} finished. Average Validation Loss: {avg_val_loss:.4f}")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

--- Epoch 1 of 3 ---
Batch 100/219, Training Loss: 0.0100
Batch 200/219, Training Loss: 0.0040
Epoch 1 finished. Average Validation Loss: 0.0023
--- Epoch 2 of 3 ---
Batch 100/219, Training Loss: 0.0019
Batch 200/219, Training Loss: 0.0013
Epoch 2 finished. Average Validation Loss: 0.0007
--- Epoch 3 of 3 ---
Batch 100/219, Training Loss: 0.0008
Batch 200/219, Training Loss: 0.0006
Epoch 3 finished. Average Validation Loss: 0.0004


In [14]:
# Switch the fine-tuned model to evaluation mode before inference; this turns off
# the nn.Dropout layer inside MyModel so repeated predictions are deterministic.
model.eval()

# --- Inference Function ---
def predict(text, threshold=0.5, max_length=100):
    """
    Predicts the label and confidence for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. The model is
    put into evaluation mode for the duration of the call, so dropout is never
    active during inference, and its previous train/eval mode is restored afterwards.

    Args:
        text: One sentence to classify. The model output is read with ``.item()``,
            so the input must yield exactly one prediction.
        threshold: Probability above which the text is labelled "True".
            Defaults to 0.5.
        max_length: Token length used for padding/truncation. Defaults to 100,
            the value used when the model was trained.

    Returns:
        tuple: ``(label, confidence_score)`` where ``label`` is the string
        ``"True"`` or ``"False"`` and ``confidence_score`` is the probability of
        the predicted class as a percentage.
    """
    # Tokenize the input text
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Do not rely on some earlier cell having called model.eval(); force it here
    # and put the model back the way it was when done.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            # Get the model's output probability as a plain Python float
            probability = model(input_ids, attention_mask).item()
    finally:
        model.train(was_training)

    # Classify the prediction based on the threshold; confidence is always the
    # probability assigned to the predicted class.
    if probability > threshold:
        return "True", probability * 100
    return "False", (1 - probability) * 100

# --- Example Usage ---
def _show_prediction(text, predicted_label, confidence_pct):
    """Print one classified sentence with its label and confidence."""
    print(f"Text: '{text}'")
    print(f"Predicted Label: {predicted_label}")
    print(f"Confidence: {confidence_pct:.2f}%")

# First sample sentence.
test_text = "Jesus Christ is lord and saviour"
label, confidence = predict(test_text)
_show_prediction(test_text, label, confidence)

# Second sample sentence.
test_text2 = "God is not triune"
label2, confidence2 = predict(test_text2)
_show_prediction(test_text2, label2, confidence2)

Text: 'Jesus Christ is lord and saviour'
Predicted Label: True
Confidence: 99.96%
Text: 'God is not triune'
Predicted Label: False
Confidence: 99.97%
