In [11]:
import os
import cv2
import numpy as np
import pandas as pd
import pytesseract as pt
import plotly.express as px
import matplotlib.pyplot as plt
import xml.etree.ElementTree as xet
from pathlib import Path

from glob import glob
from skimage import io
from shutil import copy

In [16]:
import os
from pathlib import Path
import xml.etree.ElementTree as xet

# Define the absolute path to the datasets directory
datasets_dir = Path('/Users/daksh/Desktop/ANPR/datasets')


def extract_labels(xml_paths):
    """Read the first bounding box from each annotation XML file.

    Defined at module level (not inside the directory check below) so that
    it is available to later cells regardless of which branch runs.

    Parameters
    ----------
    xml_paths : iterable of str or pathlib.Path
        Annotation files to parse. Each is expected to contain
        ``<object><bndbox>`` with ``xmin``/``xmax``/``ymin``/``ymax`` children.

    Returns
    -------
    dict
        Column-oriented dict with keys ``filepath``, ``xmin``, ``xmax``,
        ``ymin`` and ``ymax``; one entry per input file, in input order.

    Raises
    ------
    ValueError
        If a file has no ``<object>`` element, or its first ``<object>`` has
        no ``<bndbox>`` element.
    """
    labels_dict = dict(filepath=[], xmin=[], xmax=[], ymin=[], ymax=[])

    for filename in xml_paths:
        root = xet.parse(filename).getroot()

        # find() returns only the first match: additional <object> elements
        # in the same file are not read.
        member_object = root.find('object')
        if member_object is None:
            raise ValueError(f"No <object> element in annotation file: {filename}")
        labels_info = member_object.find('bndbox')
        if labels_info is None:
            raise ValueError(f"No <bndbox> element in annotation file: {filename}")

        labels_dict['filepath'].append(filename)
        for corner in ('xmin', 'xmax', 'ymin', 'ymax'):
            labels_dict[corner].append(int(labels_info.find(corner).text))

    return labels_dict


# Check if the datasets directory exists
if not datasets_dir.exists():
    print("Datasets directory does not exist.")
else:
    # Print the contents of the datasets directory
    print("Files in datasets directory:", list(datasets_dir.glob('*')))

    # Process each subdirectory
    google_images_path = datasets_dir / 'google_images'
    statewise_olx_path = datasets_dir / 'State-wise_OLX'
    video_images_path = datasets_dir / 'video_images'

    # Glob results come back in filesystem order; sorting them keeps the row
    # order (and therefore any seeded shuffle applied later) the same on
    # every machine and every run.

    # Extract labels for google_images
    google_images_xml_paths = sorted(google_images_path.glob('*.xml'))
    print("Google Images XML files:", len(google_images_xml_paths))
    google_images_labels = extract_labels(google_images_xml_paths)

    # Extract labels for State-wise OLX (one sub-folder level below the root)
    statewise_olx_xml_paths = sorted(statewise_olx_path.glob('*/*.xml'))
    print("State-wise OLX XML files:", len(statewise_olx_xml_paths))
    statewise_olx_labels = extract_labels(statewise_olx_xml_paths)

    # Extract labels for video_images
    video_images_xml_paths = sorted(video_images_path.glob('*.xml'))
    print("Video Images XML files:", len(video_images_xml_paths))
    video_images_labels = extract_labels(video_images_xml_paths)

    # Print a summary of the label counts
    print("Label counts:")
    print(f"Google Images: {len(google_images_labels['filepath'])}")
    print(f"State-wise OLX: {len(statewise_olx_labels['filepath'])}")
    print(f"Video Images: {len(video_images_labels['filepath'])}")

Files in datasets directory: [PosixPath('/Users/daksh/Desktop/ANPR/datasets/google_images'), PosixPath('/Users/daksh/Desktop/ANPR/datasets/State-wise_OLX'), PosixPath('/Users/daksh/Desktop/ANPR/datasets/video_images')]
Google Images XML files: 440
State-wise OLX XML files: 603
Video Images XML files: 654
Label counts:
Google Images: 440
State-wise OLX: 603
Video Images: 654


In [17]:
# Function to parse XML files for filename, width, and height
def parsing(path: str, images_dir: str = '/Users/daksh/Desktop/ANPR/datasets/google_images'):
    """Read the image file name and image size from one annotation XML file.

    Parameters
    ----------
    path : str
        Annotation XML file to read (passed straight to ``xet.parse``).
    images_dir : str, optional
        Directory that is prefixed to the ``<filename>`` value. Defaults to
        the google_images dataset folder, so existing one-argument calls
        behave as before.

    Returns
    -------
    tuple
        ``(filename, width, height)`` where ``filename`` is
        ``images_dir + '/' + <filename>`` and ``width``/``height`` are the
        integers stored under ``<size>``.
    """
    root = xet.parse(path).getroot()
    name = root.find('filename').text
    filename = f'{images_dir}/{name}'

    # width and height as recorded in the annotation (the image is not opened)
    size_node = root.find('size')
    width = int(size_node.find('width').text)
    height = int(size_node.find('height').text)

    return filename, width, height

def newparsing(path: str, images_root: str = '/Users/daksh/Desktop/ANPR/datasets/State-wise_OLX'):
    """Read the image file name and image size from a State-wise OLX annotation.

    The image path is built as ``images_root/<first two characters of the
    file name>/<file name>``.

    Parameters
    ----------
    path : str
        Annotation XML file to read (passed straight to ``xet.parse``).
    images_root : str, optional
        Root folder containing the two-character sub-folders. Defaults to the
        State-wise_OLX dataset folder, so existing one-argument calls behave
        as before.

    Returns
    -------
    tuple
        ``(filename, width, height)`` with ``width``/``height`` taken from the
        ``<size>`` element as integers.
    """
    root = xet.parse(path).getroot()
    name = root.find('filename').text
    # Sub-folder name is the first two characters of the image file name.
    filename = f'{images_root}/{name[:2]}/{name}'

    # width and height as recorded in the annotation (the image is not opened)
    size_node = root.find('size')
    width = int(size_node.find('width').text)
    height = int(size_node.find('height').text)

    return filename, width, height

# One DataFrame per dataset, built from the label dictionaries
df_google_images, df_statewise_olx, df_video_images = (
    pd.DataFrame(labels)
    for labels in (google_images_labels, statewise_olx_labels, video_images_labels)
)

# Add image path, width and height columns by parsing each row's XML file:
# google images use `parsing`, State-wise OLX uses `newparsing`.
for frame, xml_parser in ((df_google_images, parsing), (df_statewise_olx, newparsing)):
    parsed = frame['filepath'].apply(xml_parser)
    frame[['filename', 'width', 'height']] = parsed.apply(pd.Series)

# Parsing filenames and dimensions for video images
def video_parsing(path: str, images_dir: str = '/Users/daksh/Desktop/ANPR/datasets/video_images'):
    """Read the image file name and image size from a video_images annotation.

    Parameters
    ----------
    path : str
        Annotation XML file to read (passed straight to ``xet.parse``).
    images_dir : str, optional
        Directory that is prefixed to the ``<filename>`` value. Defaults to
        the video_images dataset folder, so existing one-argument calls
        behave as before.

    Returns
    -------
    tuple
        ``(filename, width, height)`` where ``filename`` is
        ``images_dir + '/' + <filename>`` and ``width``/``height`` are the
        integers stored under ``<size>``.
    """
    root = xet.parse(path).getroot()
    name = root.find('filename').text
    filename = f'{images_dir}/{name}'

    # width and height as recorded in the annotation (the image is not opened)
    size_node = root.find('size')
    width = int(size_node.find('width').text)
    height = int(size_node.find('height').text)

    return filename, width, height

video_meta = df_video_images['filepath'].apply(video_parsing)
df_video_images[['filename', 'width', 'height']] = video_meta.apply(pd.Series)

# Stack the three per-dataset frames into one, renumbering the rows from 0
per_dataset_frames = [df_google_images, df_statewise_olx, df_video_images]
final_df = pd.concat(per_dataset_frames, ignore_index=True)

# Show the first rows of the combined frame
print(final_df.head())

                                            filepath  xmin  xmax  ymin  ymax  \
0  /Users/daksh/Desktop/ANPR/datasets/google_imag...     1   702    46   401   
1  /Users/daksh/Desktop/ANPR/datasets/google_imag...   393   595   452   500   
2  /Users/daksh/Desktop/ANPR/datasets/google_imag...   231   431   252   308   
3  /Users/daksh/Desktop/ANPR/datasets/google_imag...   606   793   392   469   
4  /Users/daksh/Desktop/ANPR/datasets/google_imag...   129   206   131   156   

                                            filename  width  height  
0  /Users/daksh/Desktop/ANPR/datasets/google_imag...    750     562  
1  /Users/daksh/Desktop/ANPR/datasets/google_imag...   1024     685  
2  /Users/daksh/Desktop/ANPR/datasets/google_imag...    551     455  
3  /Users/daksh/Desktop/ANPR/datasets/google_imag...    930     768  
4  /Users/daksh/Desktop/ANPR/datasets/google_imag...    350     196  


In [19]:
# NOTE: plain assignment, not a copy -- `df` and `final_df` refer to the same
# DataFrame object, so columns added through `df` also appear on `final_df`.
df = final_df

In [20]:
# Box centre and box size, each divided by the image width or height
# (columns added in this order: center_x, center_y, bb_width, bb_height)
df['center_x'] = df['xmax'].add(df['xmin']).div(df['width'].mul(2))
df['center_y'] = df['ymax'].add(df['ymin']).div(df['height'].mul(2))

df['bb_width'] = df['xmax'].sub(df['xmin']).div(df['width'])
df['bb_height'] = df['ymax'].sub(df['ymin']).div(df['height'])
df.head()

Unnamed: 0,filepath,xmin,xmax,ymin,ymax,filename,width,height,center_x,center_y,bb_width,bb_height
0,/Users/daksh/Desktop/ANPR/datasets/google_imag...,1,702,46,401,/Users/daksh/Desktop/ANPR/datasets/google_imag...,750,562,0.468667,0.397687,0.934667,0.631673
1,/Users/daksh/Desktop/ANPR/datasets/google_imag...,393,595,452,500,/Users/daksh/Desktop/ANPR/datasets/google_imag...,1024,685,0.482422,0.694891,0.197266,0.070073
2,/Users/daksh/Desktop/ANPR/datasets/google_imag...,231,431,252,308,/Users/daksh/Desktop/ANPR/datasets/google_imag...,551,455,0.600726,0.615385,0.362976,0.123077
3,/Users/daksh/Desktop/ANPR/datasets/google_imag...,606,793,392,469,/Users/daksh/Desktop/ANPR/datasets/google_imag...,930,768,0.752151,0.560547,0.201075,0.10026
4,/Users/daksh/Desktop/ANPR/datasets/google_imag...,129,206,131,156,/Users/daksh/Desktop/ANPR/datasets/google_imag...,350,196,0.478571,0.732143,0.22,0.127551


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 20% of the rows of `df` for testing; random_state is fixed at 42
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Report the number of rows in each split
print(
    f"Training set size: {len(df_train)}",
    f"Testing set size: {len(df_test)}",
    sep="\n",
)

Training set size: 1357
Testing set size: 340


In [26]:
import os
from shutil import copy

# Define folder paths
train_folder = './yolo/data/images/train'
test_folder = './yolo/data/images/test'
train_label_folder = './yolo/data/labels/train'
test_label_folder = './yolo/data/labels/test'

# Ensure the folders exist
for folder in (train_folder, test_folder, train_label_folder, test_label_folder):
    os.makedirs(folder, exist_ok=True)


def _export_yolo_split(split_df, image_folder, label_folder):
    """Copy each row's image and write its one-line label file.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Rows with ``filename``, ``center_x``, ``center_y``, ``bb_width`` and
        ``bb_height`` columns.
    image_folder : str
        Destination folder for the copied images.
    label_folder : str
        Destination folder for the ``<image stem>.txt`` label files.

    Notes
    -----
    A row that fails (for example because the source image is missing) is
    reported with ``print`` and skipped; processing continues with the next
    row. The label file is written only after the image copy succeeds.
    """
    for _, row in split_df.iterrows():
        # Read the name before entering the try block so the error message
        # below always refers to the row that actually failed.
        fname = row['filename']
        try:
            image_name = os.path.basename(fname)
            txt_name = os.path.splitext(image_name)[0]

            dst_image_path = os.path.join(image_folder, image_name)
            dst_label_file = os.path.join(label_folder, txt_name + '.txt')

            # Copy each image into the folder
            copy(fname, dst_image_path)

            # Generate .txt which has label info (class id 0, then the box)
            label_txt = (
                f"0 {row['center_x']} {row['center_y']} "
                f"{row['bb_width']} {row['bb_height']}"
            )
            with open(dst_label_file, mode='w') as f:
                f.write(label_txt)

        except Exception as e:
            print(f"Error processing {fname}: {e}")


# Process training data
_export_yolo_split(df_train, train_folder, train_label_folder)

# Process testing data
_export_yolo_split(df_test, test_folder, test_label_folder)

Error processing /Users/daksh/Desktop/ANPR/datasets/State-wise_OLX/MH/MH5.jpg: [Errno 2] No such file or directory: '/Users/daksh/Desktop/ANPR/datasets/State-wise_OLX/MH/MH5.jpg'
Error processing /Users/daksh/Desktop/ANPR/datasets/State-wise_OLX/NL/NL1.jpg: [Errno 2] No such file or directory: '/Users/daksh/Desktop/ANPR/datasets/State-wise_OLX/NL/NL1.jpg'
