In [3]:
# For managing COCO dataset
# from pycocotools.coco import COCO

# For creating and managing folder/ files
import glob
import os
import shutil

# For managing images
from PIL import Image
import skimage.io as io

# Basic libraries
import numpy as np
import pandas as pd
import random
import cv2

# For plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
# import wandb

# For importing models and working with them
## Torch
import torch
import torch.utils.data # for Dataset
import torch.nn as nn
from torch.optim import Adam
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
# import segmentation_models_pytorch as smp

## Torchvision
import torchvision
from torchvision.transforms import transforms

# For creating train - test splits
from sklearn.model_selection import train_test_split

import pathlib
import pylab
import requests
from io import BytesIO
from pprint import pprint
from tqdm import tqdm
import time
from imutils import paths

# Performance Metrics
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix




device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
# !mkdir Dataset
# !mkdir Dataset/findit

# !unzip /content/drive/MyDrive/findit2.zip -d /content/Dataset/findit

In [119]:
import json
import pandas as pd

def parse_annotation_to_dict(annotation):
    parts = annotation.strip().split(',')

    image_filename = parts[0].strip()
    digital_annotation = int(parts[1].strip())
    handwritten_annotation = int(parts[2].strip())
    forged = int(parts[3].strip())

    forgery_annotations = ','.join(parts[4:]).strip()  # Convert string representation of dictionary to dictionary

    if forgery_annotations != '0':
      forgery_annotations_json = json.loads(forgery_annotations_str)
      filename = forgery_annotations_json['filename']
      size = forgery_annotations_json['size']

      regions = forgery_annotations_json['regions']
      Name = []
      X = []
      Y = []
      W = []
      H = []
      Modified_Area = []
      Entity = []
      Original_Area = []

      for i in range(0, len(regions)):
        shape_attributes = regions[i]['shape_attributes']
        Name.append(shape_attributes['name'])
        X.append(shape_attributes['x'])
        Y.append(shape_attributes['y'])
        W.append(shape_attributes['width'])
        H.append(shape_attributes['height'])


        region_attributes = regions[i]['region_attributes']
        Modified_Area.append(region_attributes['Modified area'])
        Entity.append(region_attributes['Entity type'])
        Original_Area.append(region_attributes['Original area'])

      # file_attributes = forgery_annotations_json['file_attributes']
      # software = file_attributes['Software used'] # only paint
      # comment = file_attributes['Comment'] # very few

    else:
      filename, size = '', ''
      regions, Name, X, Y, W, H = '', '', '', '', '', ''
      Modified_Area, Entity, Original_Area = '', '', ''


    try:
        forgery_annotations_json = json.loads(forgery_annotations.replace("'", "\""))
    except json.JSONDecodeError:
        forgery_annotations_json = forgery_annotations

    return {
            "image": image_filename,
            "digital_annotation": digital_annotation,
            "handwritten_annotation": handwritten_annotation,
            "forged": forged,
            "forgery_annotations": forgery_annotations_json,
            "filename": filename,
            "size": size,
            "regions": regions,
            "name": Name,
            "x": X,
            "y": Y,
            "w": W,
            "h": H,
            "modified_area": Modified_Area,
            "entity": Entity,
            "original_area": Original_Area

            # "software": software
            # "comment": comment
          }


In [120]:
def txt_to_dataframe(txt_file):
    with open(txt_file, 'r') as f:
        lines = f.readlines()[1:]  # Skip header
        annotations = [parse_annotation_to_dict(line) for line in lines]

    return pd.DataFrame(annotations)

In [128]:
txt_file = "/content/Dataset/findit/findit2/train_json.txt"
df = txt_to_dataframe(txt_file)

df.head()


Unnamed: 0,image,digital_annotation,handwritten_annotation,forged,forgery_annotations,filename,size,regions,name,x,y,w,h,modified_area,entity,original_area
0,X00016469622.png,1,1,1,"{'filename': 'X00016469622.png', 'size': 23072...",X51005230616.png,835401.0,"[{'shape_attributes': {'name': 'rect', 'x': 27...","[rect, rect]","[27, 458]","[875, 883]","[29, 35]","[43, 37]","[{'IMI': 'True'}, {'IMI': 'True'}]","[Product, Product]","[no, no]"
1,X00016469623.png,1,1,0,0,,,,,,,,,,,
2,X00016469670.png,1,1,0,0,,,,,,,,,,,
3,X00016469671.png,1,1,0,0,,,,,,,,,,,
4,X00016469672.png,1,1,0,0,,,,,,,,,,,


In [129]:
df.to_csv('train.csv', index = False)

In [88]:
import json

# Get the forgery_annotations for the 5th row (index 4)
forgery_annotations_str = df.iloc[4]['forgery_annotations']

# Replace single quotes with double quotes
forgery_annotations_str = forgery_annotations_str.replace("'", "\"").strip()

print(forgery_annotations_str)

# Attempt to load the JSON from the string
try:
    forgery_annotations_json = json.loads(forgery_annotations_str)
    filename = forgery_annotations_json['regions']
    print(filename)
    print(filename[0]['shape_attributes']['name'])
    print(len(filename))
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)


{"filename": "X51005230616.png", "size": 835401, "regions": [{"shape_attributes": {"name": "rect", "x": 27, "y": 875, "width": 29, "height": 43}, "region_attributes": {"Modified area": {"IMI": "True"}, "Entity type": "Product", "Original area": "no"}}, {"shape_attributes": {"name": "rect", "x": 458, "y": 883, "width": 35, "height": 37}, "region_attributes": {"Modified area": {"IMI": "True"}, "Entity type": "Product", "Original area": "no"}}], "file_attributes": {"Software used": "paint"}}
[{'shape_attributes': {'name': 'rect', 'x': 27, 'y': 875, 'width': 29, 'height': 43}, 'region_attributes': {'Modified area': {'IMI': 'True'}, 'Entity type': 'Product', 'Original area': 'no'}}, {'shape_attributes': {'name': 'rect', 'x': 458, 'y': 883, 'width': 35, 'height': 37}, 'region_attributes': {'Modified area': {'IMI': 'True'}, 'Entity type': 'Product', 'Original area': 'no'}}]
rect
2


In [75]:
import re
import json

# Original JSON string
json_str = "{\"filename\": \"X51005230616.png\", \"size\": 835401, \"regions\": [{\"shape_attributes\": {\"name\": \"rect\", \"x\": 27, \"y\": 875, \"width\": 29, \"height\": 43}, \"region_attributes\": {\"Modified area\": {\"IMI\": True}, \"Entity type\": \"Product\", \"Original area\": \"no\"}}, {\"shape_attributes\": {\"name\": \"rect\", \"x\": 458, \"y\": 883, \"width\": 35, \"height\": 37}, \"region_attributes\": {\"Modified area\": {\"IMI\": True}, \"Entity type\": \"Product\", \"Original area\": \"no\"}}], \"file_attributes\": {\"Software used\": \"paint\"}}"

# Use regular expression to remove non-JSON characters
clean_json_str = re.sub(r'[^{}[\],\s\d.:a-zA-Z_"]', '', json_str)

print("Clean JSON string:", clean_json_str)

try:
    forgery_annotations_json = json.loads(clean_json_str)
    print("JSON decoding successful.")
    print(forgery_annotations_json)
except json.JSONDecodeError as e:
    print("Error decoding JSON:", e)


Clean JSON string: {"filename": "X51005230616.png", "size": 835401, "regions": [{"shape_attributes": {"name": "rect", "x": 27, "y": 875, "width": 29, "height": 43}, "region_attributes": {"Modified area": {"IMI": True}, "Entity type": "Product", "Original area": "no"}}, {"shape_attributes": {"name": "rect", "x": 458, "y": 883, "width": 35, "height": 37}, "region_attributes": {"Modified area": {"IMI": True}, "Entity type": "Product", "Original area": "no"}}], "file_attributes": {"Software used": "paint"}}
Error decoding JSON: Expecting value: line 1 column 194 (char 193)
