In [None]:
!pip install transformers datasets torch pandas sklearn

In [None]:
# Install the Hugging Face `datasets` package (it is also listed in the first install cell).
!pip install datasets

In [None]:
import inspect
import unicodedata

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Read the two source CSVs (expected on Google Drive; adjust the paths as needed).
# When running on Colab, mount Drive first:
# from google.colab import drive
# drive.mount('/content/drive')

# Each frame gets a constant label column: 0 = fake news, 1 = true news
fake_df = pd.read_csv('/content/drive/MyDrive/ma_fake.csv').assign(label=0)
true_df = pd.read_csv('/content/drive/MyDrive/ma_true.csv').assign(label=1)

# Stack both classes, then shuffle the rows with a fixed seed and renumber them
df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Basic preprocessing for Malayalam text
def preprocess_text(text):
    """Remove digits, punctuation and symbols from ``text`` and tidy its spacing.

    Letters are kept together with the combining marks that belong to them.
    Malayalam vowel signs, the virama (chandrakkala) and the anusvara are
    Unicode "Mark" characters, not letters, so a plain ``str.isalpha()``
    filter deletes them and corrupts nearly every word. The zero-width
    joiner / non-joiner are kept as well because they are part of how
    chillu letters and conjuncts are written.

    Args:
        text: The raw text. Any value is accepted; ``None`` and float NaN
            (pandas' missing-value marker) are treated as empty text.

    Returns:
        str: The cleaned text with runs of whitespace collapsed to a single
        space and no leading/trailing whitespace.
    """
    # Missing values become an empty string instead of the words "None"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    joiners = '\u200c\u200d'  # ZWNJ, ZWJ
    kept = ''.join(
        char for char in str(text)
        if char.isspace()
        or char in joiners
        # 'L*' = letters of any script, 'M*' = combining marks (vowel signs, virama, ...)
        or unicodedata.category(char)[0] in ('L', 'M')
    )
    # Dropping digits/punctuation can leave doubled or dangling spaces behind.
    return ' '.join(kept.split())

# Clean every row's text with preprocess_text before it is tokenized
df['text'] = df['text'].apply(preprocess_text)

# Split into train and test sets. Stratifying on the label keeps the fake/true
# ratio the same in both splits, so the test metrics are not skewed by an
# unlucky draw.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

# Convert to Hugging Face Dataset format. preserve_index=False stops the
# shuffled pandas index from being carried along as an extra
# `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Load the tokenizer that belongs to the multilingual cased BERT checkpoint
tokenizer_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of a batch of rows.

    Every sequence is padded or truncated to exactly 128 tokens.

    Args:
        examples: A batch of rows; only ``examples['text']`` is read.

    Returns:
        The output of calling the module-level ``tokenizer`` on the texts.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Run tokenize_function over both splits, one batch of rows at a time
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Return these three columns as PyTorch tensors
torch_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format('torch', columns=torch_columns)
test_dataset.set_format('torch', columns=torch_columns)

In [None]:
# Start from the pre-trained multilingual BERT checkpoint with a
# two-class sequence-classification head
model_checkpoint = 'bert-base-multilingual-cased'
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# Define training arguments
# transformers renamed `evaluation_strategy` to `eval_strategy` and newer
# releases reject the old spelling with a TypeError. The install cell does not
# pin a version, so use whichever keyword the installed release accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=200,                # lower this if the dataset is small
    weight_decay=0.1,                # regularization strength
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    # load_best_model_at_end needs evaluation and saving on the same schedule,
    # so evaluation also runs once per epoch.
    load_best_model_at_end=True,
    **{eval_strategy_key: "epoch"},
)

# Define compute_metrics function for evaluation
def compute_metrics(pred):
    """Score a batch of predictions for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` computed
        with the scikit-learn scorers of the same names.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

# Wire the model, training arguments, both data splits and the metric
# callback into a Trainer
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Fine-tune the model
trainer.train()

In [None]:
# Evaluate the model: with no arguments, trainer.evaluate() presumably scores
# the eval_dataset the Trainer was constructed with (the test split here).
# The returned results are printed as-is.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one Drive folder
save_dir = '/content/drive/MyDrive/lastdance'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/lastdance/tokenizer_config.json',
 '/content/drive/MyDrive/lastdance/special_tokens_map.json',
 '/content/drive/MyDrive/lastdance/vocab.txt',
 '/content/drive/MyDrive/lastdance/added_tokens.json')

In [None]:
# Restore the model and tokenizer from the folder they were saved to
saved_dir = '/content/drive/MyDrive/lastdance'
model = BertForSequenceClassification.from_pretrained(saved_dir)
tokenizer = BertTokenizer.from_pretrained(saved_dir)

# Function to predict on new text
def predict_fake_news(text):
    """Classify a single news text as ``"True"`` or ``"Fake"``.

    The text goes through the same ``preprocess_text`` cleaning and the same
    128-token limit that were used for the training data, then through the
    module-level ``tokenizer`` and ``model``.

    Args:
        text: The raw news text to classify.

    Returns:
        str: ``"True"`` when the predicted class id is 1, otherwise ``"Fake"``
        (the training labels are 0 = fake, 1 = true).
    """
    cleaned = preprocess_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # The tokenizer returns CPU tensors; move them to wherever the model lives
    # so the call also works when the model is on a GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference only: disable dropout and gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction = torch.argmax(logits, dim=1).item()

    return "True" if prediction == 1 else "Fake"
# Example usage: classify a couple of sample headlines (replace with your own Malayalam text)
sample_texts = [
    "2000 രൂപ നോട്ടിന്റെ വിതരണം നിര്‍ത്തി റിസര്‍വ് ബാങ്ക്",
    "'യുഡിഎഫ് കള്ളം പ്രചരിപ്പിക്കാറില്ല, ഉള്ളത് മാത്രമേ പറയു' പിണറായി വിജയൻ ഇങ്ങനെ പറഞ്ഞോ?...",
]
for new_text in sample_texts:
    result = predict_fake_news(new_text)
    print(f"The text is predicted to be: {result}")

In [None]:
from sklearn.metrics import classification_report

# `labels` and `preds` only exist as locals inside compute_metrics, so build
# them here from the trainer's predictions on the held-out test split.
test_output = trainer.predict(test_dataset)
labels = test_output.label_ids
preds = test_output.predictions.argmax(-1)

# Print detailed classification report.
# target_names must follow label-id order: 0 = Fake, 1 = True.
print("Classification Report:")
print(classification_report(labels, preds, target_names=['Fake', 'True']))

In [None]:
# NOTE(review): this cell re-defines predict_fake_news, which an earlier cell
# already defines; the later definition silently replaces the earlier one.
# Keep a single definition once the notebook is cleaned up.
def predict_fake_news(text):
    """Classify a single news text as ``"True"`` or ``"Fake"``.

    The text goes through the same ``preprocess_text`` cleaning and the same
    128-token limit that were used for the training data, then through the
    module-level ``tokenizer`` and ``model``.

    Args:
        text: The raw news text to classify.

    Returns:
        str: ``"True"`` when the predicted class id is 1, otherwise ``"Fake"``
        (the training labels are 0 = fake, 1 = true).
    """
    cleaned = preprocess_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # The tokenizer returns CPU tensors; move them to wherever the model lives
    # so the call also works when the model is on a GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference only: disable dropout and gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction = torch.argmax(logits, dim=1).item()

    return "True" if prediction == 1 else "Fake"

In [None]:
# Example usage: classify one headline with predict_fake_news and print the verdict
new_text = "മോദി ലോകത്തിൻ്റെ പ്രതീക്ഷ  പ്രധാനമന്ത്രിയെ പുകഴ്ത്തിയ ന്യൂ യോർക്ക് ടൈംസ് വാർത്ത വ്യാജമോ?"  # Replace with your Malayalam text
result = predict_fake_news(new_text)
print(f"The text is predicted to be: {result}")

The text is predicted to be: Fake


In [None]:
# Example usage: classify one headline with predict_fake_news and print the verdict
new_text = "പത്മഭൂഷണ്‍ പുരസ്‌കാരം ഏറ്റുവാങ്ങി മോഹന്‍ലാല്‍."  # Replace with your Malayalam text
result = predict_fake_news(new_text)
print(f"The text is predicted to be: {result}")

The text is predicted to be: True
