# Setting Up the Environment

In [None]:
!pip install transformers torch tensorflow opencv-python



In [None]:
import torch
# Report whether PyTorch can see a CUDA device; the boolean is shown as the cell output.
torch.cuda.is_available()

True

# Verifying the Environment

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# The tokenizer and the encoder are both loaded from the same pre-trained checkpoint.
checkpoint_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)
model = RobertaModel.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Smoke test: push one sentence through the tokenizer and the encoder.
sample_text = "Fake news detection is an important task."

# Encode the sentence as PyTorch tensors, then run the forward pass.
inputs = tokenizer(sample_text, return_tensors="pt")
outputs = model(**inputs)

# Show the token-id shape and the shape of the last hidden state.
input_id_shape = inputs['input_ids'].shape
hidden_state_shape = outputs.last_hidden_state.shape
print("Tokenized input shape:", input_id_shape)
print("Model output shape:", hidden_state_shape)

Tokenized input shape: torch.Size([1, 10])
Model output shape: torch.Size([1, 10, 768])


# Text Preprocessing

In [None]:
import pandas as pd
from google.colab import drive

# The dataset is read from Google Drive, so mount it into the Colab filesystem first.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the Fakeddit training split (a tab-separated file) from the mounted drive.
data_path = '/content/drive/MyDrive/FakeditDataset/multimodal_train.tsv'
df = pd.read_csv(data_path, delimiter='\t')

# Peek at the first rows to get a feel for the columns.
preview = df.head()
print(preview)

           author                                        clean_title  \
0     Alexithymia  my walgreens offbrand mucinex was engraved wit...   
1        VIDCAs17                this concerned sink with a tiny hat   
2  prometheus1123      hackers leak emails from uae ambassador to us   
3             NaN                           puppy taking in the view   
4       3rikR3ith               i found a face in my sheet music too   

    created_utc         domain  hasImage      id  \
0  1.551641e+09    i.imgur.com      True  awxhir   
1  1.534727e+09      i.redd.it      True  98pbid   
2  1.496511e+09  aljazeera.com      True  6f2cy5   
3  1.471341e+09    i.imgur.com      True  4xypkv   
4  1.525318e+09      i.redd.it      True  8gnet9   

                                           image_url linked_submission_id  \
0  https://external-preview.redd.it/WylDbZrnbvZdB...                  NaN   
1  https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...                  NaN   
2  https://external-pre

# Tokenization

In [None]:
# List the column names to see which text, image and label fields are available.
print(df.columns)

Index(['author', 'clean_title', 'created_utc', 'domain', 'hasImage', 'id',
       'image_url', 'linked_submission_id', 'num_comments', 'score',
       'subreddit', 'title', 'upvote_ratio', '2_way_label', '3_way_label',
       '6_way_label'],
      dtype='object')


In [None]:
# Tokenize every title of the full dataset in a single call.
all_titles = df['clean_title'].tolist()
tokenized_text = tokenizer(
    all_titles,
    padding=True,            # pad so every row has the same length
    truncation=True,         # cut anything longer than max_length
    max_length=256,          # RoBERTa supports a max length of 512
    return_tensors="pt",     # PyTorch tensors
)

# Sanity-check the shape of the resulting id tensor.
full_input_id_shape = tokenized_text['input_ids'].shape
print("Tokenized input IDs shape:", full_input_id_shape)

Tokenized input IDs shape: torch.Size([564000, 256])


# Text Preprocessing (1% of Dataset)




In [None]:
# Keep only rows that have both a title and an image URL.
df_filtered = df.dropna(subset=['clean_title', 'image_url'])

# Fraction of the filtered rows to keep. 0.01 is 1% of the data
# (the previous comment claimed 10%, which did not match the code).
SAMPLE_FRACTION = 0.01
sample_size = int(len(df_filtered) * SAMPLE_FRACTION)

# Fixed random_state keeps the sample reproducible between runs.
df_sample = df_filtered.sample(n=sample_size, random_state=42)

In [None]:
# Snapshot of the sample restricted to rows whose title and image URL are both present.
df_sample_filtered = df_sample.dropna(subset=['clean_title', 'image_url'])

# Text tokenization and image preprocessing follow in the next cells.

In [None]:
# Tokenize the 'clean_title' column of the sampled rows.
sample_titles = df_sample['clean_title'].tolist()   # the tokenizer is given a plain list
tokenizer_options = dict(
    padding=True,          # pad the inputs to equal length
    truncation=True,       # truncate anything longer than max_length
    max_length=256,        # maximum token length
    return_tensors="pt",   # return PyTorch tensors
)
tokenized_text_sample = tokenizer(sample_titles, **tokenizer_options)


In [None]:
# Show the shape of the tokenized sample's input-id tensor.
print(tokenized_text_sample['input_ids'].shape)

torch.Size([16873, 256])


# Image Preprocessing

In [None]:
!pip install pillow torchvision



In [None]:
from PIL import Image
import requests
from io import BytesIO
import torch
import torchvision.transforms as transforms

# Per-channel statistics used for normalization (ImageNet values).
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Preprocessing pipeline applied to every downloaded image:
# resize to 224x224, convert to a tensor, then normalize each channel.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

def preprocess_image_from_url(url, timeout=10):
    """Download one image and turn it into a normalized tensor.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the whole `.apply` pass.

    Returns:
        The tensor produced by `image_transform`, or None when the download,
        decoding or transformation fails (best-effort: bad rows are filtered
        out by the caller).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail fast on 4xx/5xx instead of trying to decode an error page as an image.
        response.raise_for_status()
        # Force three channels: grayscale, palette and RGBA images would otherwise
        # break the 3-channel Normalize step and be silently discarded.
        img = Image.open(BytesIO(response.content)).convert('RGB')
        return image_transform(img)
    except Exception:
        # `Exception` (not a bare except) keeps the best-effort behaviour while
        # still letting KeyboardInterrupt / SystemExit stop a long-running download loop.
        return None

# Download and preprocess the image of every sampled row.
processed_column = df_sample['image_url'].apply(preprocess_image_from_url)
df_sample = df_sample.assign(processed_images=processed_column)

# Keep only the rows whose image could be processed.
has_image = df_sample['processed_images'].notnull()
df_sample = df_sample[has_image]

In [None]:
# Inspect the first processed image. The transform resizes to 224x224 and
# normalizes three channels, so a successful result has three dimensions.
print(df_sample['processed_images'].iloc[0].shape)  # Should be torch.Size([3, 224, 224])

torch.Size([3, 224, 224])


# Feature Extraction

## Text Feature Extraction using RoBERTa

In [None]:
from transformers import RobertaModel
from torch.utils.data import DataLoader, TensorDataset

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the encoder in this cell. Previously `roberta_model` was only created in a
# later cell, so this cell raised NameError on a fresh kernel (Restart & Run All).
roberta_model = RobertaModel.from_pretrained('roberta-base')
roberta_model.to(device)
roberta_model.eval()  # Evaluation mode: disables dropout so embeddings are deterministic

# Keep the full tokenized tensors on the CPU and move one batch at a time to the
# device, so device memory holds a single batch rather than the whole sample.
batch_size = 8  # Adjust this depending on the available memory
dataset = TensorDataset(
    tokenized_text_sample['input_ids'],
    tokenized_text_sample['attention_mask']
)
dataloader = DataLoader(dataset, batch_size=batch_size)

text_cls_embeddings = []

with torch.no_grad():  # No gradients are needed for feature extraction
    for input_ids, attention_mask in dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = roberta_model(input_ids=input_ids, attention_mask=attention_mask)

        # The first position of the last hidden state is the CLS token embedding
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

        text_cls_embeddings.append(cls_embeddings)

# Concatenate all per-batch embeddings into one tensor
text_cls_embeddings = torch.cat(text_cls_embeddings, dim=0)


In [None]:
### Unused Code
from transformers import RobertaModel
from torch.utils.data import DataLoader, TensorDataset

# Variant of the text feature extraction that leaves the tensors and a freshly
# loaded model where they are (no device transfer).
batch_size = 4  # Adjust this depending on the available memory
dataset = TensorDataset(
    tokenized_text_sample['input_ids'],
    tokenized_text_sample['attention_mask'],
)
dataloader = DataLoader(dataset, batch_size=batch_size)

# Fresh copy of the pre-trained encoder, in evaluation mode
roberta_model = RobertaModel.from_pretrained('roberta-base')
roberta_model.eval()

text_cls_embeddings = []
with torch.no_grad():  # Gradients are not needed here
    for input_ids, attention_mask in dataloader:
        outputs = roberta_model(input_ids=input_ids, attention_mask=attention_mask)
        # First token of the last hidden state -> [batch_size, hidden_size]
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        text_cls_embeddings.append(cls_embeddings)

# Stack the per-batch results into a single tensor
text_cls_embeddings = torch.cat(text_cls_embeddings, dim=0)


## Image Feature Extraction using ResNet

In [None]:
import torch
import torchvision.models as models
from torchvision import transforms

# Prefer the GPU when one is visible, otherwise use the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pre-trained ResNet-50, placed on the selected device
resnet_model = models.resnet50(pretrained=True).to(device)

# Switch to evaluation mode; as the cell's last expression this also displays the model
resnet_model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
def extract_image_features(image_tensor):
    """Run a single preprocessed image through `resnet_model`.

    The tensor is moved to `device`, given a leading batch dimension with
    `unsqueeze(0)`, and passed through the network with gradients disabled.
    Returns the model output for that one-image batch.
    """
    single_image_batch = image_tensor.to(device).unsqueeze(0)
    with torch.no_grad():
        return resnet_model(single_image_batch)

In [None]:
# Run every processed image through the ResNet, one image at a time
image_features = df_sample['processed_images'].apply(extract_image_features)

# Each result carries its own batch dimension, so joining along dim 0
# yields one row per image
per_image_outputs = tuple(image_features)
image_features_flat = torch.cat(per_image_outputs, dim=0)

In [None]:
# Put both feature tensors on the same device before they are concatenated.
text_cls_embeddings = text_cls_embeddings.to(device)
image_features_flat = image_features_flat.to(device)

In [None]:
# Join text and image features side by side along the feature dimension
# -> [batch_size, text_dim + image_dim]
feature_parts = [text_cls_embeddings, image_features_flat]
combined_features = torch.cat(feature_parts, dim=1)

# Feature Fusion

In [None]:
# Fuse the two modalities by concatenating their feature vectors row by row
# Shape: (batch_size, combined_feature_dim)
modalities = [text_cls_embeddings, image_features_flat]
fused_features = torch.cat(modalities, dim=1)

# Building Classification Layer

In [None]:
import torch.nn as nn
import torch.optim as optim

# Define the classifier model
class MultimodalClassifier(nn.Module):
    """Linear classifier over the fused text + image feature vector.

    `forward` returns raw, unnormalized class scores (logits). This is what
    `nn.CrossEntropyLoss` expects: the loss applies log-softmax internally, so
    feeding it softmax outputs normalizes twice and flattens the gradients.
    The arg-max of the logits equals the arg-max of the probabilities, so
    callers that pick the highest-scoring class are unaffected.

    Args:
        fused_feature_dim: Width of the fused feature vector.
        num_classes: Number of output classes (e.g., 2 for Fake/Real).
    """

    def __init__(self, fused_feature_dim, num_classes):
        super().__init__()
        # Fully connected layer mapping fused features to one score per class
        self.fc = nn.Linear(fused_feature_dim, num_classes)
        # Kept for callers that need probabilities; not applied in forward()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, fused_features):
        """Return class logits of shape (batch, num_classes)."""
        return self.fc(fused_features)

    def predict_proba(self, fused_features):
        """Return class probabilities (softmax over the logits) for each row."""
        return self.softmax(self.forward(fused_features))

# Size the classifier from the fused feature width and place it on the active device
num_classes = 2  # Fake/Real (or adjust based on 3_way/6_way label)
fused_feature_dim = fused_features.size(1)  # width of the fused feature vector
classifier = MultimodalClassifier(fused_feature_dim, num_classes).to(device)

In [None]:
# Loss for the classification task
criterion = nn.CrossEntropyLoss()

# Adam over the classifier's parameters with a learning rate of 0.001
learning_rate = 0.001
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)

In [None]:
from sklearn.model_selection import train_test_split

# Labels must come from the same rows the fused features were built from.
# `df_sample` is the 1% sample the image features were extracted from, after rows
# whose image failed to load were dropped. `df_sample_filtered` was taken before
# that filtering, so it can contain extra rows and misalign the labels.
# '2_way_label' is used as the label (Fake or Real)
sample_labels = df_sample['2_way_label']
if len(sample_labels) != fused_features.shape[0]:
    raise ValueError(
        f"Feature/label mismatch: {fused_features.shape[0]} feature rows vs "
        f"{len(sample_labels)} labels"
    )

train_data, test_data, train_labels, test_labels = train_test_split(
    fused_features,
    sample_labels,
    test_size=0.2,  # 20% of data will be used as test set
    random_state=42
)

# Move them to device (GPU/CPU). CrossEntropyLoss needs integer (long) class indices.
train_data = train_data.to(device)
train_labels = torch.tensor(train_labels.values, dtype=torch.long).to(device)
test_data = test_data.to(device)
test_labels = torch.tensor(test_labels.values, dtype=torch.long).to(device)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Pair each feature row with its label
train_dataset = TensorDataset(train_data, train_labels)
test_dataset = TensorDataset(test_data, test_labels)

# Mini-batches of 32; only the training batches are shuffled
loader_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)


In [None]:
# Training settings
epochs = 10  # Number of passes over the training set (can be adjusted)
batch_size = 32  # Mini-batch size (not read in this cell; the loader already has its own)

for epoch in range(epochs):
    classifier.train()  # Training mode
    total_loss = 0  # Sum of the batch losses seen in this epoch

    for batch_data, batch_labels in train_loader:
        # Start every step from zeroed gradients
        optimizer.zero_grad()

        # Forward pass and loss against the true labels
        predictions = classifier(batch_data)
        loss = criterion(predictions, batch_labels)

        # Backward pass followed by the weight update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean batch loss for the epoch
    mean_batch_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {mean_batch_loss:.4f}')


Epoch [1/10], Loss: 0.5753
Epoch [2/10], Loss: 0.5656
Epoch [3/10], Loss: 0.5665
Epoch [4/10], Loss: 0.5716
Epoch [5/10], Loss: 0.5640
Epoch [6/10], Loss: 0.5593
Epoch [7/10], Loss: 0.5573
Epoch [8/10], Loss: 0.5584
Epoch [9/10], Loss: 0.5532
Epoch [10/10], Loss: 0.5648


In [None]:
# Number of mini-batches the training loader yields per epoch.
print(len(train_loader))  # Should be > 0 if batches are being processed

88


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Switch the classifier to evaluation mode
classifier.eval()

# Predicted and true labels collected over the whole test set
all_preds = []
all_labels = []

# Gradients are not needed for evaluation
with torch.no_grad():
    for batch_data, batch_labels in test_loader:
        predictions = classifier(batch_data)
        # Index of the highest-scoring class for every row
        predicted_labels = predictions.argmax(dim=1)

        all_preds += predicted_labels.cpu().tolist()
        all_labels += batch_labels.cpu().tolist()

# Evaluation metrics on the test set
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.6528
Precision: 0.6930
Recall: 0.7560
F1 Score: 0.7231


# Deployment

In [None]:
pip install flask

# Creating a Flask Application (API Endpoint - /predict)

In [None]:
from flask import Flask, request, jsonify
import torch
from transformers import RobertaTokenizer, RobertaModel
from torchvision import models
from PIL import Image
import requests
from torchvision import transforms

# Initialize Flask app
app = Flask(__name__)

# Load the feature extractors and put BOTH in evaluation mode: without
# text_model.eval() RoBERTa's dropout stays active and the same request can
# yield different features (and predictions) on every call.
text_model = RobertaModel.from_pretrained('roberta-base')  # Text encoder
text_model.eval()
image_model = models.resnet50(pretrained=True)             # Image encoder
image_model.eval()

# Serve the classifier trained earlier in this notebook. The previous code
# instantiated an undefined `FakeNewsClassifier` (NameError) with random weights.
# A CPU copy is built here because the encoders above run on the CPU, and the
# trained weights are copied into it.
# NOTE(review): a standalone app.py would need to load these weights from a saved
# checkpoint instead of reading them from notebook state.
trained_state = classifier.state_dict()
classifier = MultimodalClassifier(classifier.fc.in_features, classifier.fc.out_features)
classifier.load_state_dict(trained_state)
classifier.eval()

# Tokenizer and image preprocessing
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define the image preprocessing transformations
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def preprocess_image(image_url, timeout=10):
    """Download an image and return it as a normalized one-image batch.

    Args:
        image_url: Address of the image to fetch.
        timeout: Seconds to wait for the remote server before giving up.

    Returns:
        The tensor produced by `image_transforms` with a leading batch dimension.

    Raises:
        requests.RequestException: The download failed or returned an HTTP error.
        OSError: The payload could not be decoded as an image.
    """
    # NOTE(review): image_url comes straight from the request body, so this is a
    # server-side fetch of a caller-chosen URL (SSRF risk). Consider an allow-list
    # of hosts/schemes before exposing this endpoint publicly.
    with requests.get(image_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # Let urllib3 undo gzip/deflate transfer encoding on the raw stream.
        response.raw.decode_content = True
        # convert() fully loads the pixel data, so the connection can be closed afterwards.
        img = Image.open(response.raw).convert('RGB')
    img = image_transforms(img)
    return img.unsqueeze(0)

# Route to handle predictions
@app.route('/predict', methods=['POST'])
def predict():
    """Classify a (text, image_url) pair sent as a JSON body.

    Expects `{"text": ..., "image_url": ...}`. Responds with
    `{"prediction": "Fake" | "Real"}` on success, or a 400 JSON error when the
    body is missing/invalid or the image cannot be fetched or decoded.
    """
    # silent=True yields None instead of raising on a non-JSON body
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    # Extract text and image URL from request
    text = data.get('text')
    image_url = data.get('image_url')
    if not text or not image_url:
        return jsonify({'error': "Both 'text' and 'image_url' are required."}), 400

    # Image download/decoding problems are the client's input, not a server fault
    try:
        image_batch = preprocess_image(image_url)
    except (requests.RequestException, OSError):
        return jsonify({'error': 'Could not download or decode the image.'}), 400

    # Inference only: skip autograd bookkeeping
    with torch.no_grad():
        # Text features: first-token embedding of the last hidden state
        tokenized_input = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt")
        text_features = text_model(**tokenized_input)['last_hidden_state'][:, 0, :]

        # Image features, flattened to one row per image
        image_features = image_model(image_batch)
        image_features_flat = image_features.view(image_features.size(0), -1)

        # Fuse text and image features
        fused_features = torch.cat((text_features, image_features_flat), dim=1)

        # Classify the fused features
        prediction = classifier(fused_features)

    predicted_class = torch.argmax(prediction, dim=1).item()

    # Return the result
    return jsonify({'prediction': 'Fake' if predicted_class == 0 else 'Real'})

if __name__ == '__main__':
    # debug=True enables the Werkzeug interactive debugger, which lets anyone who
    # can reach the server execute arbitrary code; it also starts the reloader.
    # Keep it off for anything that is deployed.
    app.run(debug=False)


# Steps to Test the Flask App

Before deploying to the cloud, test it.

Run the Flask app:


- python app.py

Send a test request using curl or Postman:

- curl -X POST http://127.0.0.1:5000/predict -H "Content-Type: application/json" -d '{"text": "some news text", "image_url": "https://example.com/image.jpg"}'
(JSON Response)

# Preparing for Cloud Deployment

Create requirements.txt: Flask and other dependencies should be included in requirements.txt.

- flask
- torch
- transformers
- torchvision
- pillow
- requests

Create Procfile: For platforms like Heroku, we need a Procfile that tells how to run the app.

  - web: python app.py

**Now the only thing remaining is deploying to a cloud platform (Heroku/AWS Lambda/AWS EC2):**

 - heroku login

- heroku create

- git add .
- git commit -m "Deploy model"
- git push heroku master

- heroku open
