In [2]:
# For managing COCO dataset
# from pycocotools.coco import COCO

# For creating and managing folder/ files
import glob
import os
import shutil

# For managing images
from PIL import Image
import skimage.io as io

# Basic libraries
import numpy as np
import pandas as pd
import random
import cv2

# For plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
# import wandb

# For importing models and working with them
## Torch
import torch
import torch.utils.data # for Dataset
import torch.nn as nn
from torch.optim import Adam
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
# import segmentation_models_pytorch as smp

## Torchvision
import torchvision
from torchvision.transforms import transforms

# For creating train - test splits
from sklearn.model_selection import train_test_split

import pathlib
import pylab
import requests
from io import BytesIO
from pprint import pprint
from tqdm import tqdm
import time
from imutils import paths

# Performance Metrics
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix




device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir Dataset
!mkdir Dataset/findit

!unzip /content/drive/MyDrive/findit2.zip -d /content/Dataset/findit

Archive:  /content/drive/MyDrive/findit2.zip
   creating: /content/Dataset/findit/findit2/
  inflating: /content/Dataset/findit/__MACOSX/._findit2  
   creating: /content/Dataset/findit/findit2/test/
  inflating: /content/Dataset/findit/__MACOSX/findit2/._test  
  inflating: /content/Dataset/findit/findit2/train.txt  
  inflating: /content/Dataset/findit/__MACOSX/findit2/._train.txt  
   creating: /content/Dataset/findit/findit2/train/
  inflating: /content/Dataset/findit/__MACOSX/findit2/._train  
  inflating: /content/Dataset/findit/findit2/test.txt  
  inflating: /content/Dataset/findit/__MACOSX/findit2/._test.txt  
  inflating: /content/Dataset/findit/findit2/val.txt  
  inflating: /content/Dataset/findit/__MACOSX/findit2/._val.txt  
   creating: /content/Dataset/findit/findit2/val/
  inflating: /content/Dataset/findit/__MACOSX/findit2/._val  
  inflating: /content/Dataset/findit/findit2/test/X51007339127.png  
  inflating: /content/Dataset/findit/__MACOSX/findit2/test/._X510073391

# Create Dataframes

In [59]:
import json

def parse_annotation_to_dict(annotation):
    parts = annotation.strip().split(',')

    image_filename = parts[0].strip()
    digital_annotation = int(parts[1].strip())
    handwritten_annotation = int(parts[2].strip())
    forged = int(parts[3].strip())

    forgery_annotations_str = ','.join(parts[4:]).strip()  # Convert string representation of dictionary to dictionary

    if forgery_annotations_str != '0':
        try:
            forgery_annotations_str = forgery_annotations_str.encode('unicode_escape').decode('utf-8')
            forgery_annotations_json = json.loads(forgery_annotations_str)  # Convert string to dictionary
            filename = forgery_annotations_json.get('filename', '')
            size = forgery_annotations_json.get('size', '')

            regions = forgery_annotations_json.get('regions', [])
            Name = []
            X = []
            Y = []
            W = []
            H = []
            Modified_Area = []
            Entity = []
            Original_Area = []

            for region in regions:
                shape_attributes = region.get('shape_attributes', {})
                Name.append(shape_attributes.get('name', ''))
                X.append(shape_attributes.get('x', ''))
                Y.append(shape_attributes.get('y', ''))
                W.append(shape_attributes.get('width', ''))
                H.append(shape_attributes.get('height', ''))

                region_attributes = region.get('region_attributes', {})
                Modified_Area.append(region_attributes.get('Modified area', ''))
                Entity.append(region_attributes.get('Entity type', ''))
                Original_Area.append(region_attributes.get('Original area', ''))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            print(f"Problematic JSON string: {forgery_annotations_str}")
            # Assign default values if JSON decoding fails
            forgery_annotations_json = {}
            filename, size = '', ''
            Name, X, Y, W, H = [], [], [], [], []
            Modified_Area, Entity, Original_Area = [], [], []
    else:
        filename, size = '', ''
        Name, X, Y, W, H = [], [], [], [], []
        Modified_Area, Entity, Original_Area = [], [], []

    return {
        "image": image_filename,
        "digital_annotation": digital_annotation,
        "handwritten_annotation": handwritten_annotation,
        "forged": forged,
        "forgery_annotations": forgery_annotations_str,
        "filename": filename,
        "size": size,
        "name": Name,
        "x": X,
        "y": Y,
        "w": W,
        "h": H,
        "modified_area": Modified_Area,
        "entity": Entity,
        "original_area": Original_Area
    }


In [60]:
def txt_to_dataframe(txt_file):
    with open(txt_file, 'r') as f:
        lines = f.readlines()[1:]  # Skip header
        annotations = [parse_annotation_to_dict(line) for line in lines]

    return pd.DataFrame(annotations)

In [61]:
txt_file = "/content/Dataset/findit/train_json.txt"
df = txt_to_dataframe(txt_file)

df.head()


Unnamed: 0,image,digital_annotation,handwritten_annotation,forged,forgery_annotations,filename,size,name,x,y,w,h,modified_area,entity,original_area
0,X00016469622.png,1,1,1,"{""filename"": ""X00016469622.png"", ""size"": 23072...",X00016469622.png,230726.0,"[rect, rect, rect, rect, rect, rect, rect, rect]","[248, 405, 345, 347, 344, 343, 266, 268]","[459, 342, 461, 510, 547, 657, 812, 845]","[13, 13, 9, 8, 10, 12, 12, 9]","[18, 17, 16, 16, 16, 18, 20, 17]","[{'CPI': 'True'}, {'None': 'True'}, {'CPI': 'T...","[Product, Metadata, Product, Product, Total/pa...","[no, yes, no, no, no, no, no, no]"
1,X00016469623.png,1,1,0,0,,,[],[],[],[],[],[],[],[]
2,X00016469670.png,1,1,0,0,,,[],[],[],[],[],[],[],[]
3,X00016469671.png,1,1,0,0,,,[],[],[],[],[],[],[],[]
4,X00016469672.png,1,1,0,0,,,[],[],[],[],[],[],[],[]


In [62]:
df.to_csv('train.csv', index = False)

In [None]:
# import json

# # Get the forgery_annotations for the 5th row (index 4)
# forgery_annotations_str = df.iloc[4]['forgery_annotations']

# # Replace single quotes with double quotes
# forgery_annotations_str = forgery_annotations_str.replace("'", "\"").strip()

# print(forgery_annotations_str)

# # Attempt to load the JSON from the string
# try:
#     forgery_annotations_json = json.loads(forgery_annotations_str)
#     filename = forgery_annotations_json['regions']
#     print(filename)
#     print(filename[0]['shape_attributes']['name'])
#     print(len(filename))
# except json.JSONDecodeError as e:
#     print("Error decoding JSON:", e)


{"filename": "X51005230616.png", "size": 835401, "regions": [{"shape_attributes": {"name": "rect", "x": 27, "y": 875, "width": 29, "height": 43}, "region_attributes": {"Modified area": {"IMI": "True"}, "Entity type": "Product", "Original area": "no"}}, {"shape_attributes": {"name": "rect", "x": 458, "y": 883, "width": 35, "height": 37}, "region_attributes": {"Modified area": {"IMI": "True"}, "Entity type": "Product", "Original area": "no"}}], "file_attributes": {"Software used": "paint"}}
[{'shape_attributes': {'name': 'rect', 'x': 27, 'y': 875, 'width': 29, 'height': 43}, 'region_attributes': {'Modified area': {'IMI': 'True'}, 'Entity type': 'Product', 'Original area': 'no'}}, {'shape_attributes': {'name': 'rect', 'x': 458, 'y': 883, 'width': 35, 'height': 37}, 'region_attributes': {'Modified area': {'IMI': 'True'}, 'Entity type': 'Product', 'Original area': 'no'}}]
rect
2


In [None]:
# import re
# import json

# # Original JSON string
# json_str = "{\"filename\": \"X51005230616.png\", \"size\": 835401, \"regions\": [{\"shape_attributes\": {\"name\": \"rect\", \"x\": 27, \"y\": 875, \"width\": 29, \"height\": 43}, \"region_attributes\": {\"Modified area\": {\"IMI\": True}, \"Entity type\": \"Product\", \"Original area\": \"no\"}}, {\"shape_attributes\": {\"name\": \"rect\", \"x\": 458, \"y\": 883, \"width\": 35, \"height\": 37}, \"region_attributes\": {\"Modified area\": {\"IMI\": True}, \"Entity type\": \"Product\", \"Original area\": \"no\"}}], \"file_attributes\": {\"Software used\": \"paint\"}}"

# # Use regular expression to remove non-JSON characters
# clean_json_str = re.sub(r'[^{}[\],\s\d.:a-zA-Z_"]', '', json_str)

# print("Clean JSON string:", clean_json_str)

# try:
#     forgery_annotations_json = json.loads(clean_json_str)
#     print("JSON decoding successful.")
#     print(forgery_annotations_json)
# except json.JSONDecodeError as e:
#     print("Error decoding JSON:", e)


Clean JSON string: {"filename": "X51005230616.png", "size": 835401, "regions": [{"shape_attributes": {"name": "rect", "x": 27, "y": 875, "width": 29, "height": 43}, "region_attributes": {"Modified area": {"IMI": True}, "Entity type": "Product", "Original area": "no"}}, {"shape_attributes": {"name": "rect", "x": 458, "y": 883, "width": 35, "height": 37}, "region_attributes": {"Modified area": {"IMI": True}, "Entity type": "Product", "Original area": "no"}}], "file_attributes": {"Software used": "paint"}}
Error decoding JSON: Expecting value: line 1 column 194 (char 193)


# Plot masks - bounding boxes

In [71]:
df = pd.read_csv('/content/val.csv')
df.fillna(0, inplace = True)

columns_to_drop = ['forgery_annotations']
df.drop(columns = columns_to_drop, inplace = True)

df.head(5)

Unnamed: 0,image,digital_annotation,handwritten_annotation,forged,filename,size,name,x,y,w,h,modified_area,entity,original_area
0,X00016469612.png,1,1,1,X00016469612.png,291659.0,"['rect', 'rect', 'rect', 'rect']","[174, 164, 318, 179]","[374, 373, 372, 396]","[9, 9, 13, 10]","[14, 15, 16, 18]","[{'CPI': 'True'}, {'None': 'True'}, {'CPI': 'T...","['Metadata', 'Metadata', 'Metadata', 'Other']","['no', 'yes', 'no', 'yes']"
1,X51005200931.png,0,1,0,0,0.0,[],[],[],[],[],[],[],[]
2,X51005230605.png,0,0,0,0,0.0,[],[],[],[],[],[],[],[]
3,X51005433538.png,0,1,0,0,0.0,[],[],[],[],[],[],[],[]
4,X51005441401.png,0,1,1,X51005441401.png,358358.0,"['rect', 'rect', 'rect', 'rect', 'rect', 'rect...","[418, 416, 417, 414, 416, 418, 430]","[595, 623, 674, 717, 743, 815, 764]","[12, 15, 12, 14, 14, 13, 14]","[24, 27, 37, 20, 17, 42, 24]","[{'CUT': 'True'}, {'CUT': 'True'}, {'CUT': 'Tr...","['Product', 'Product', 'Total/payment', 'Total...","['no', 'no', 'no', 'no', 'no', 'no', 'yes']"


In [66]:
import matplotlib.patches as patches
import cv2
import ast

def draw_boxes(image_path, df_row, save_path):
    try:
        # Read the image
        image = cv2.imread(image_path)
        if image is None:
            print(f"Unable to read image: {image_path}")
            return

        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Plot the image without axes
        fig, ax = plt.subplots(1)
        ax.imshow(image_rgb)
        ax.axis('off')  # Turn off axes

        # Parse annotations and draw bounding boxes
        for x, y, w, h in zip(ast.literal_eval(df_row['x']), ast.literal_eval(df_row['y']), ast.literal_eval(df_row['w']), ast.literal_eval(df_row['h'])):
            # Draw rectangle
            rect = patches.Rectangle((int(x), int(y)), int(w), int(h), linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(rect)

        # Save the processed image with bounding boxes
        mask_path = os.path.join(save_path, os.path.basename(image_path))
        plt.savefig(mask_path, bbox_inches='tight', pad_inches=0)
        plt.close()
    except Exception as e:
        print(f"Error processing image: {image_path}")
        print(f"Error message: {e}")


# Create folder to save masks if it does not exist
save_folder = '/content/Dataset/findit/findit2/test_masks'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# df = pd.DataFrame(data)
forged_df = df[df['forged'] == 1]

# Iterate over DataFrame rows and draw bounding boxes
for index, row in forged_df.iterrows():
    image_name = row['filename']
    image_path = '/content/Dataset/findit/findit2/test/' + image_name
    draw_boxes(image_path, row, save_folder)

In [72]:
def create_binary_mask(image_shape, bbox_list):
    # Create a blank black image
    mask = np.zeros(image_shape, dtype=np.uint8)

    # Draw white rectangles for bounding boxes
    for x, y, w, h in bbox_list:
        cv2.rectangle(mask, (x, y), (x+w, y+h), (255, 255, 255), -1)  # -1 to fill the rectangle

    return mask

def draw_boxes(image_path, df_row, save_path):
    try:
        # Read the image
        image = cv2.imread(image_path)
        if image is None:
            print(f"Unable to read image: {image_path}")
            return

        # Get image dimensions
        height, width, _ = image.shape

        # Parse annotations and get bounding boxes
        bbox_list = list(zip(ast.literal_eval(df_row['x']), ast.literal_eval(df_row['y']), ast.literal_eval(df_row['w']), ast.literal_eval(df_row['h'])))

        # Create binary mask
        mask = create_binary_mask((height, width, 3), bbox_list)

        # Save the processed image with bounding boxes
        mask_path = os.path.join(save_path, os.path.basename(image_path))
        cv2.imwrite(mask_path, mask)
    except Exception as e:
        print(f"Error processing image: {image_path}")
        print(f"Error message: {e}")

# Create folder to save masks if it does not exist
save_folder = '/content/Dataset/findit/findit2/val_bm'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# Filter rows with forged = 1
forged_df = df[df['forged'] == 1]

# Iterate over DataFrame rows and draw bounding boxes
for index, row in forged_df.iterrows():
    image_name = row['filename']
    image_path = '/content/Dataset/findit/findit2/val/' + image_name
    draw_boxes(image_path, row, save_folder)

Unable to read image: /content/Dataset/findit/findit2/val/X51006619709.png


In [58]:
!rm -r /content/Dataset/findit/findit2/train_masks

In [73]:
import zipfile
import os

# Define the folder you want to zip and the name of the zip file
folder_to_zip = '/content/Dataset'
zip_file_name = '/content/Dataset.zip'

# Create a zip file
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    # Iterate over all the files in the folder
    for root, _, files in os.walk(folder_to_zip):
        for file in files:
            # Get the full path of the file
            file_path = os.path.join(root, file)
            # Add the file to the zip
            zipf.write(file_path, os.path.relpath(file_path, folder_to_zip))

# Print message when done
print(f"Folder '{folder_to_zip}' zipped successfully as '{zip_file_name}'.")

Folder '/content/Dataset' zipped successfully as '/content/Dataset.zip'.


In [74]:
from google.colab import files

# Specify the path to the zip file you want to download
zip_file_path = '/content/Dataset.zip'

# Download the file
files.download(zip_file_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>