<a href="https://colab.research.google.com/github/FathimaHusna/FathimaHusna/blob/main/Fakenews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pip install transformers datasets evaluate pandas scikit-learn tensorflow

In [None]:
# Mount Google Drive at /content/drive; the CSV paths read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(data_dir="/content/drive/MyDrive/fake_data/data1/News _dataset", seed=42):
    """Load the fake/real news CSVs, label them, and return one shuffled frame.

    Args:
        data_dir: Directory that contains ``Fake.csv`` and ``True.csv``.
            Defaults to the Google Drive folder used by this notebook.
        seed: Seed for the row shuffle, so that re-running the notebook yields
            the same row order. Pass ``None`` for a non-deterministic shuffle.

    Returns:
        pandas.DataFrame holding every column of the two CSVs plus an integer
        ``label`` column (1 = real news, 0 = fake news), with rows shuffled
        and a fresh 0..n-1 index.
    """
    import os  # local import: the notebook's own `import os` cell runs later

    fake_news = pd.read_csv(os.path.join(data_dir, "Fake.csv"))
    true_news = pd.read_csv(os.path.join(data_dir, "True.csv"))

    # Label each source before merging so the origin of every row is kept.
    true_news["label"] = 1  # 1 for real news
    fake_news["label"] = 0  # 0 for fake news

    df = pd.concat([true_news, fake_news], ignore_index=True)

    # Seeded shuffle: without a seed the row order changed on every run.
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    return df

def split_data(df, test_size=0.2, random_state=42, stratify=False):
    """Split the ``text`` and ``label`` columns of ``df`` into train and test sets.

    Args:
        df: DataFrame with a ``text`` column (model input) and a ``label``
            column (target).
        test_size: Fraction (or absolute count) of rows held out for testing.
        random_state: Seed passed to ``train_test_split`` for a repeatable split.
        stratify: When True, keep the label proportions equal in both splits.
            The default (False) reproduces the plain random split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    stratify_on = df["label"] if stratify else None
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"],
        df["label"],
        test_size=test_size,
        random_state=random_state,
        stratify=stratify_on,
    )
    return X_train, X_test, y_train, y_test

# Build the labelled, shuffled frame, then hold out part of it as the test split.
# These module-level names are reused by the tokenization and training cells.
df = load_data()
X_train, X_test, y_train, y_test = split_data(df)

In [None]:
# Preview the first rows of the combined frame (displayed as the cell's last expression).
df.head()

In [None]:
from transformers import DistilBertTokenizer

def tokenize_data(X_train, X_test, model_name='distilbert-base-uncased', max_length=128):
    """Tokenize the train and test texts with a pretrained DistilBERT tokenizer.

    Args:
        X_train: Training texts; must expose ``.tolist()`` (e.g. a pandas Series).
        X_test: Test texts; same requirement as ``X_train``.
        model_name: Checkpoint name or path handed to
            ``DistilBertTokenizer.from_pretrained``.
        max_length: Maximum sequence length; longer inputs are truncated.

    Returns:
        Tuple ``(train_encodings, test_encodings, tokenizer)``. The tokenizer
        is returned so callers can save it alongside the model.
    """
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    # NOTE(review): every item is presumably a str; a missing (NaN) text would
    # most likely be rejected by the tokenizer -- confirm the column has no gaps.
    def _encode(texts):
        return tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length)

    train_encodings = _encode(X_train)
    test_encodings = _encode(X_test)

    return train_encodings, test_encodings, tokenizer

# Tokenize both splits; the tokenizer is kept so it can be saved with the model later.
train_encodings, test_encodings, tokenizer = tokenize_data(X_train, X_test)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
import tensorflow as tf

def create_tf_datasets(train_encodings, test_encodings, y_train, y_test,
                       batch_size=16, shuffle_buffer=1000):
    """Wrap tokenized inputs and labels as batched ``tf.data.Dataset`` objects.

    Args:
        train_encodings: Mapping of feature name -> values for the training set
            (anything exposing ``.items()``).
        test_encodings: Same mapping for the test set.
        y_train: Training labels, aligned with ``train_encodings``.
        y_test: Test labels, aligned with ``test_encodings``.
        batch_size: Number of examples per batch for both datasets.
        shuffle_buffer: Buffer size for shuffling the training set. A buffer
            smaller than the dataset only shuffles within a sliding window;
            pass ``len(y_train)`` for a full shuffle.

    Returns:
        Tuple ``(train_dataset, test_dataset)`` whose elements are
        ``(features_dict, label)`` batches.
    """
    def _as_dataset(encodings, labels):
        # One dataset element = (dict of feature tensors, label).
        features = {key: tf.constant(val) for key, val in encodings.items()}
        return tf.data.Dataset.from_tensor_slices((features, tf.constant(labels)))

    train_dataset = _as_dataset(train_encodings, y_train).shuffle(shuffle_buffer).batch(batch_size)

    # The test set is deliberately NOT shuffled: predictions are later compared
    # position by position against y_test.
    test_dataset = _as_dataset(test_encodings, y_test).batch(batch_size)

    return train_dataset, test_dataset

# Turn the encodings and labels into batched tf.data pipelines for model.fit / model.predict.
train_dataset, test_dataset = create_tf_datasets(train_encodings, test_encodings, y_train, y_test)

In [None]:
from transformers import TFDistilBertForSequenceClassification
from sklearn.metrics import classification_report

def train_and_evaluate(train_dataset, test_dataset, y_test,
                       model_name='distilbert-base-uncased', epochs=3, learning_rate=5e-5):
    """Fine-tune DistilBERT for 2-class classification and report test metrics.

    Args:
        train_dataset: Batched dataset of ``(features, label)`` used for training.
        test_dataset: Batched, unshuffled dataset used for validation during
            training, for the final evaluation, and for prediction.
        y_test: True test labels in the same order as ``test_dataset`` yields
            them; used for the classification report.
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        epochs: Number of training epochs.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The fine-tuned ``TFDistilBertForSequenceClassification`` model.

    Side effects:
        Prints the test accuracy and a scikit-learn classification report.
    """
    # from_pt=True: initialise the TensorFlow model from the PyTorch weights.
    model = TFDistilBertForSequenceClassification.from_pretrained(
        model_name, num_labels=2, from_pt=True
    )

    # The model returns raw logits (see `predictions.logits` below), hence
    # from_logits=True on the loss.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )

    # NOTE(review): the test set doubles as validation data, so the per-epoch
    # validation metrics are not an independent estimate of generalisation.
    model.fit(train_dataset, epochs=epochs, validation_data=test_dataset)

    # results is [loss, accuracy], following the compile() metric order.
    results = model.evaluate(test_dataset)
    print("Test Accuracy:", results[1])

    # Highest-logit class per example; order matches y_test because the test
    # dataset is iterated without shuffling.
    predictions = model.predict(test_dataset)
    predicted_labels = tf.argmax(predictions.logits, axis=1)

    print(classification_report(y_test, predicted_labels.numpy()))

    return model

# Fine-tune the model and print test metrics; the returned model is saved in a later cell.
model = train_and_evaluate(train_dataset, test_dataset, y_test)

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'cla

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Accuracy: 0.999109148979187
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4681
           1       1.00      1.00      1.00      4299

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [None]:
import os

def save_model(model, tokenizer, save_dir="fine_tuned_distilbert_model"):
    """Save a model and its tokenizer into the same directory.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        save_dir: Target directory; created (including parents) if missing.

    Side effects:
        Creates ``save_dir`` when needed, writes the model and tokenizer files
        through their ``save_pretrained`` methods, and prints a confirmation.
    """
    # exist_ok=True replaces the separate exists() check: one call, and no
    # window in which the directory could appear between check and creation.
    os.makedirs(save_dir, exist_ok=True)
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"Model and tokenizer saved to {save_dir}")

# Write the fine-tuned model and tokenizer to the default directory (relative to the working directory).
save_model(model, tokenizer)


Model and tokenizer saved to fine_tuned_distilbert_model


In [None]:
import shutil

# Zip the saved model directory; change both names if you saved elsewhere.
# The call's return value (the archive path) is shown as the cell output.
shutil.make_archive(
    base_name='fine_tuned_distilbert_model',
    format='zip',
    root_dir='fine_tuned_distilbert_model',
)

'/content/fine_tuned_distilbert_model.zip'