In [None]:
#pip install transformers datasets pandas scikit-learn

In [12]:
# CLEAR CACHE 
import gc
import torch
gc.collect()

torch.cuda.empty_cache()

In [None]:
import time
import pandas as pd
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, set_seed
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Set seeds for reproducibility
def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    set_seed(seed)

set_all_seeds(42)

start_time = time.time()
# Load the dataset
file_path = 'cleaned_history_text.csv'  # Update with the correct file path
df = pd.read_csv(file_path)

# Load the tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('facebook/xlm-roberta-xl')

# Tokenize the text with truncation to max length
max_length = 512
texts = df['Text'].tolist()
labels = df['Violence'].tolist()

# Load the pre-trained model
model = XLMRobertaForSequenceClassification.from_pretrained('facebook/xlm-roberta-xl', num_labels=2)

# Move to device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to classify sentences in batches
def classify_sentences_in_batches(sentences, model, tokenizer, batch_size=4, max_length=512):
    model.eval()
    pred_labels = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        encodings = tokenizer(batch, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
        encodings = {key: val.to(device) for key, val in encodings.items()}
        with torch.no_grad():
            outputs = model(**encodings)
            logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=1)
        batch_pred_labels = torch.argmax(probs, dim=1).cpu().numpy()
        pred_labels.extend(batch_pred_labels)
        # Clear CUDA cache to free memory
        torch.cuda.empty_cache()
    return np.array(pred_labels)

# Evaluate the pre-trained model on the entire dataset
pred_labels = classify_sentences_in_batches(texts, model, tokenizer, max_length=max_length)

# Calculate overall metrics
precision = precision_score(labels, pred_labels)
recall = recall_score(labels, pred_labels)
f1 = f1_score(labels, pred_labels)
accuracy = accuracy_score(labels, pred_labels)

print(f'Overall Precision: {precision:.4f}')
print(f'Overall Recall: {recall:.4f}')
print(f'Overall F1 Score: {f1:.4f}')
print(f'Overall Accuracy: {accuracy:.4f}')

# Generate classification report for both classes
report = classification_report(labels, pred_labels, target_names=['Non-Violent', 'Violent'])
print(report)

end_time = time.time()
elapsed_time = end_time - start_time
hours, rem = divmod(elapsed_time, 3600)
minutes, seconds = divmod(rem, 60)
print(f"Elapsed time: {int(hours)} hours, {int(minutes)} minutes, {seconds:.2f} seconds")

# Print misclassified test sentences with their predicted and actual labels
print("\nMisclassified Test Sentences:")
print("-" * 50)
for sentence, actual_label, predicted_label in zip(texts, labels, pred_labels):
    if actual_label != predicted_label:
        print(f"Sentence: \n{sentence}\nActual Label: {'Violent' if actual_label == 1 else 'Non-Violent'} | Predicted Label: {'Violent' if predicted_label == 1 else 'Non-Violent'}")
        print("-" * 50)

# Plot confusion matrix
def plot_confusion_matrix(labels, pred_labels):
    cm = confusion_matrix(labels, pred_labels)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Violent', 'Violent'], yticklabels=['Non-Violent', 'Violent'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

# Plot ROC curve
def plot_roc_curve(labels, predictions):
    fpr, tpr, _ = roc_curve(labels, predictions[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(12, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

# Plot the confusion matrix
plot_confusion_matrix(labels, pred_labels)

# Plot the ROC curve
logits = model(**tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)).logits
probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy()
plot_roc_curve(labels, probs)

# Example usage for new sentence classification
new_sentences = [
    "This is a peaceful example.",
    "He killed and beheaded all enemies."
]
new_pred_labels = classify_sentences_in_batches(new_sentences, model, tokenizer, max_length=max_length)

# Print predictions with better formatting
print("Predictions for New Sentences:")
print("-" * 50)
for sentence, label in zip(new_sentences, new_pred_labels):
    print(f"Sentence: \n{sentence}\nPredicted Label: {'Violent' if label == 1 else 'Non-Violent'}")
    print("-" * 50)


You are using a model of type xlm-roberta-xl to instantiate a model of type xlm-roberta. This is not supported for all configurations of models and can yield errors.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at facebook/xlm-roberta-xl and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.embeddings.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.10.attention.output.LayerNorm.b

## Save Model and Tokenizer

In [23]:
# Save the trained model and tokenizer
model.save_pretrained('./saved_model_Roberta_small_more_optimized')
tokenizer.save_pretrained('./saved_model_Roberta_small_more_optimized')


('./saved_model_Roberta_small_more_optimized/tokenizer_config.json',
 './saved_model_Roberta_small_more_optimized/special_tokens_map.json',
 './saved_model_Roberta_small_more_optimized/vocab.json',
 './saved_model_Roberta_small_more_optimized/merges.txt',
 './saved_model_Roberta_small_more_optimized/added_tokens.json')

## Load Model

In [17]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import torch
# Load the saved model and tokenizer
model_path = './saved_model_bert'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

## Predict New input

In [22]:
import torch
# Define a function to classify new sentences
def classify_new_sentences(sentences, model, tokenizer):
    # Tokenize the new sentences
    encodings = tokenizer(sentences, truncation=True, padding=True, return_tensors='pt')
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    encodings = {key: val.to(device) for key, val in encodings.items()}
    
    # Make predictions
    model.eval()
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
    
    # Convert logits to probabilities and then to labels
    probs = torch.nn.functional.softmax(logits, dim=1)
    pred_labels = torch.argmax(probs, dim=1).cpu().numpy()
    
    return pred_labels

# Example usage
new_sentences = [
   
    # George Orwell
    "The most effective way to destroy people is to deny and obliterate their own understanding of their history.",
    # Dan Brown
    "History is always written by the winners. When two cultures clash, the loser is obliterated, and the winner writes the history books. Books, which glorify their own cause and disparage the conquered foe. As Napoleon once said, what is history, but a fable agreed upon? ",
    # Sun Tzu
    "If you know the enemy and know yourself, you need not fear the result of a hundred battles. If you know yourself but not the enemy, for every victory gained you will also suffer a defeat. If you know neither the enemy nor yourself, you will succumb in every battle.",
    # Winston Churchill
    "We shall defend our island, whatever the cost may be, we shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the streets, we shall fight in the hills; we shall never surrender.",
    # Robert Kennedy
    "What has violence ever accomplished? What has it ever created? No martyr's cause has ever been stilled by an assassin's bullet. No wrongs have ever been righted by riots and civil disorders. A sniper is only a coward, not a hero; and an uncontrolled or uncontrollable mob is only the voice of madness, not the voice of the people.",
    
    "Kill them all! slaughter them and behead them? leave none alive until they all surrender and give us all their money, power and women! I want to drink their blood and feast on their flesh! Leave none alive you hear me!",
    "While the Romans were in such difficulties, the barbarians suddenly surrounded them on all sides at once, coming through the densest thickets, as they were acquainted with the paths. At first they hurled their volleys from a distance; then, as no one defended himself and many were wounded, they approached closer to them. For the Romans were not proceeding in any regular order, but were mixed in helter-skelter with the waggons and the unarmed, and so, being unable to form readily anywhere in a body, and being fewer at every point than their assailants, they suffered greatly and could offer no resistance at all.",
    "In the battle, the knight broke the lines and slaughtered his enemies and then beheaded the king.",
    "I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.",
    "You can fool all of the people some of the time, and some of the people all of the time, but you can't fool all of the people all of the time.",
    "Battles are won by slaughter and maneuver. The greater the general, the more he contributes in maneuver, the less he demands in slaughter.",
    "Remembering the loss of those Irishmen from all parts of the island who were sent to their deaths in the imperialist slaughter of the First World War is crucial to understanding our history. It is also important to recognise the special significance in which the Battle of the Somme and the First World War is held.",
    "In the battle, the soldiers started counting the sheeps that they got as a reward for their awersome performance. They managed to cheer the crowds and "
    
]
pred_labels = classify_new_sentences(new_sentences, model, tokenizer)


# Print predictions with better formatting
print("Predictions for New Sentences:")
print("-" * 50)
for sentence, label in zip(new_sentences, pred_labels):
    print(f"Sentence: \n{sentence}\nPredicted Label: {'Violent' if label == 1 else 'Non-Violent'}")
    print("-" * 50)

Predictions for New Sentences:
--------------------------------------------------
Sentence: 
The most effective way to destroy people is to deny and obliterate their own understanding of their history.
Predicted Label: Violent
--------------------------------------------------
Sentence: 
History is always written by the winners. When two cultures clash, the loser is obliterated, and the winner writes the history books. Books, which glorify their own cause and disparage the conquered foe. As Napoleon once said, what is history, but a fable agreed upon? 
Predicted Label: Non-Violent
--------------------------------------------------
Sentence: 
If you know the enemy and know yourself, you need not fear the result of a hundred battles. If you know yourself but not the enemy, for every victory gained you will also suffer a defeat. If you know neither the enemy nor yourself, you will succumb in every battle.
Predicted Label: Non-Violent
--------------------------------------------------
Sent

## Parameter Number

In [9]:
print(f'{sum(p.numel() for p in model.parameters())}')

109483778


## Trying without Finetuning

In [None]:
import time
import pandas as pd
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from transformers import ElectraTokenizer, ElectraForSequenceClassification, set_seed
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Set seeds for reproducibility
def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    set_seed(seed)

set_all_seeds(42)

start_time = time.time()
# Load the dataset
file_path = 'cleaned_history_text.csv'  # Update with the correct file path
df = pd.read_csv(file_path)

# Load the tokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')

# Tokenize the text with truncation to max length
max_length = 512
texts = df['Text'].tolist()
labels = df['Violence'].tolist()

# Load the pre-trained model
model = ElectraForSequenceClassification.from_pretrained('google/electra-large-discriminator', num_labels=2)

# Move to device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to classify sentences in batches
def classify_sentences_in_batches(sentences, model, tokenizer, batch_size=100, max_length=512):  # Smaller batch size
    model.eval()
    pred_labels = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        encodings = tokenizer(batch, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
        encodings = {key: val.to(device) for key, val in encodings.items()}
        with torch.no_grad():
            outputs = model(**encodings)
            logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=1)
        batch_pred_labels = torch.argmax(probs, dim=1).cpu().numpy()
        pred_labels.extend(batch_pred_labels)
        # Clear CUDA cache to free memory
        torch.cuda.empty_cache()
    return np.array(pred_labels)

# Evaluate the pre-trained model on the entire dataset
pred_labels = classify_sentences_in_batches(texts, model, tokenizer, max_length=max_length)

# Calculate overall metrics
precision = precision_score(labels, pred_labels)
recall = recall_score(labels, pred_labels)
f1 = f1_score(labels, pred_labels)
accuracy = accuracy_score(labels, pred_labels)

print(f'Overall Precision: {precision:.4f}')
print(f'Overall Recall: {recall:.4f}')
print(f'Overall F1 Score: {f1:.4f}')
print(f'Overall Accuracy: {accuracy:.4f}')

# Generate classification report for both classes
report = classification_report(labels, pred_labels, target_names=['Non-Violent', 'Violent'])
print(report)

end_time = time.time()
elapsed_time = end_time - start_time
hours, rem = divmod(elapsed_time, 3600)
minutes, seconds = divmod(rem, 60)
print(f"Elapsed time: {int(hours)} hours, {int(minutes)} minutes, {seconds:.2f} seconds")

# Print misclassified test sentences with their predicted and actual labels
print("\nMisclassified Test Sentences:")
print("-" * 50)
for sentence, actual_label, predicted_label in zip(texts, labels, pred_labels):
    if actual_label != predicted_label:
        print(f"Sentence: \n{sentence}\nActual Label: {'Violent' if actual_label == 1 else 'Non-Violent'} | Predicted Label: {'Violent' if predicted_label == 1 else 'Non-Violent'}")
        print("-" * 50)

# Plot confusion matrix
def plot_confusion_matrix(labels, pred_labels):
    cm = confusion_matrix(labels, pred_labels)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Violent', 'Violent'], yticklabels=['Non-Violent', 'Violent'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

# Plot ROC curve
def plot_roc_curve(labels, predictions):
    fpr, tpr, _ = roc_curve(labels, predictions[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(12, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

# Plot the confusion matrix
plot_confusion_matrix(labels, pred_labels)


# Example usage for new sentence classification
new_sentences = [
    "This is a peaceful example.",
    "He killed and beheaded all enemies."
]
new_pred_labels = classify_sentences_in_batches(new_sentences, model, tokenizer, max_length=max_length)

# Print predictions with better formatting
print("Predictions for New Sentences:")
print("-" * 50)
for sentence, label in zip(new_sentences, new_pred_labels):
    print(f"Sentence: \n{sentence}\nPredicted Label: {'Violent' if label == 1 else 'Non-Violent'}")
    print("-" * 50)


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## TEST LATER FOR BETTER HYPERPARAMETERS

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
import numpy as np
import torch

# Load the dataset
file_path = 'history_text.csv'  # Update with the correct file path
df = pd.read_csv(file_path)

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Violence'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

# Convert to Dataset object
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'label': list(train_labels)})
test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'label': list(test_labels)})

# Define the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    learning_rate=2e-5,              # Adjust learning rate
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10,
    eval_steps=10,
    load_best_model_at_end=True,     # Load the best model at the end
    metric_for_best_model="f1",      # Define the metric to use for early stopping
    greater_is_better=True,
)

# Define a function to compute metrics
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    precision = precision_score(labels, pred)
    recall = recall_score(labels, pred)
    f1 = f1_score(labels, pred)
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Define Trainer with early stopping callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping
)

# Train the model
trainer.train()

# Evaluate the model on the test set
predictions = trainer.predict(test_dataset)

# Convert predictions to label IDs
pred_labels = np.argmax(predictions.predictions, axis=1)

# Calculate and print metrics
precision = precision_score(test_labels, pred_labels)
recall = recall_score(test_labels, pred_labels)
f1 = f1_score(test_labels, pred_labels)
accuracy = accuracy_score(test_labels, pred_labels)

print(f'Overall Precision: {precision:.4f}')
print(f'Overall Recall: {recall:.4f}')
print(f'Overall F1 Score: {f1:.4f}')
print(f'Overall Accuracy: {accuracy:.4f}')

# Generate classification report for both classes
report = classification_report(test_labels, pred_labels, target_names=['Non-Violent', 'Violent'])
print(report)
