In [2]:
# !pip install transformers datasets torch scikit-learn

# 1 Load Dataset

In [14]:
import pandas as pd

df = pd.read_csv("/content/multilingual_fake_real_news.csv")

In [15]:
df

Unnamed: 0,text,label
0,भारत ने इंग्लैंड को तीसरे टेस्ट मैच में हराया।,fake
1,وزیر اعظم نے نئے ترقیاتی منصوبے کا اعلان کیا۔,real
2,A man claimed to have seen a UFO in his backya...,real
3,Breaking News: Stock market crashes due to unf...,real
4,La finale de la Coupe du Monde sera diffusée e...,fake
...,...,...
1495,پاکستان نے بھارت کو کرکٹ میچ میں شکست دی۔,fake
1496,پاکستان نے بھارت کو کرکٹ میچ میں شکست دی۔,fake
1497,A man claimed to have seen a UFO in his backya...,fake
1498,Une météorite aurait été observée dans le ciel...,fake


# 2 Train Test Split

In [17]:
from datasets import Dataset


# Encode labels
label_map = {"real": 1, "fake": 0}
df["label"] = df["label"].map(label_map)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into train/test
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']
test_dataset = dataset['test']


# 3 Tokenization

In [18]:
from transformers import AutoTokenizer

# Load tokenizer
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize dataset
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

# 4 Fine Tuning

In [19]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6982,0.698976,0.463333,0.633257,0.463333,1.0
2,0.708,0.692748,0.523333,0.19209,0.447368,0.122302
3,0.6971,0.690909,0.536667,0.0,0.0,0.0
4,0.6943,0.693006,0.496667,0.174863,0.363636,0.115108
5,0.6972,0.692896,0.496667,0.174863,0.363636,0.115108


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=375, training_loss=0.6954754390716553, metrics={'train_runtime': 375.2249, 'train_samples_per_second': 15.99, 'train_steps_per_second': 0.999, 'total_flos': 394666583040000.0, 'train_loss': 0.6954754390716553, 'epoch': 5.0})

# 5 Evaluation

In [20]:
# Evaluate
results = trainer.evaluate()
print("Accuracy:", results["eval_accuracy"])

Accuracy: 0.49666666666666665


In [21]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Predict on the validation dataset
predictions = trainer.predict(val_dataset)

# Get predicted labels and true labels
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Print classification report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=["real", "fake"]))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))


Classification Report:
              precision    recall  f1-score   support

        real       0.45      0.78      0.58       142
        fake       0.45      0.16      0.23       158

    accuracy                           0.45       300
   macro avg       0.45      0.47      0.40       300
weighted avg       0.45      0.45      0.40       300

Confusion Matrix:
[[111  31]
 [133  25]]


# 6 Saving Fine Tuend Model

In [22]:
# Save model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/sentencepiece.bpe.model',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

# 7 Prediction System

In [24]:

import torch

# Load the fine-tuned model and tokenizer
model_path = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Function for prediction
def predict(text):
    # Preprocess the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

    # Forward pass to get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Map predictions to labels
    labels = {0: "real", 1: "fake"}
    return labels[predicted_class]

# Example predictions
example_texts = [
    "Breaking news: Scientists discover a cure for cancer.",
    "جعلی خبر: مشہور اداکارہ نے فلم انڈسٹری چھوڑ دی۔",
    "Le président a annoncé de nouvelles mesures écologiques.",
]

for text in example_texts:
    result = predict(text)
    print(f"Text: {text}\nPrediction: {result}\n")


Text: Breaking news: Scientists discover a cure for cancer.
Prediction: real

Text: جعلی خبر: مشہور اداکارہ نے فلم انڈسٹری چھوڑ دی۔
Prediction: real

Text: Le président a annoncé de nouvelles mesures écologiques.
Prediction: real

