In [49]:
###### ✅ Mount Google Drive and Import Required Libraries
from google.colab import drive
import os

drive.mount('/content/drive')
print("📂 Google Drive mounted successfully.")

###### ✅ Sub-task 1.1.2: Define Base Directory
base_path = '/content/drive/MyDrive/CustomOCR'
print(f"📁 Base directory set to: {base_path}")

# ✅ Defining directory structure in Drive
base_path = '/content/drive/MyDrive/Custom_OCR_Project'
dataset_path = os.path.join(base_path, 'datasets')
model_dir = os.path.join(base_path, 'models')
results_path = os.path.join(base_path, 'results')

# ✅ Creating folders if not present
os.makedirs(dataset_path, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)
os.makedirs(results_path, exist_ok=True)

# ✅ Displaying folder structure status
print("\n📦 Folder paths set up successfully:")
print(f"   📁 Dataset Directory : {dataset_path}")
print(f"   📁 Model Directory   : {model_dir}")
print(f"   📁 Results Directory : {results_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📂 Google Drive mounted successfully.
📁 Base directory set to: /content/drive/MyDrive/CustomOCR

📦 Folder paths set up successfully:
   📁 Dataset Directory : /content/drive/MyDrive/Custom_OCR_Project/datasets
   📁 Model Directory   : /content/drive/MyDrive/Custom_OCR_Project/models
   📁 Results Directory : /content/drive/MyDrive/Custom_OCR_Project/results


In [None]:
# Step 1: Importing required libraries
# I'm importing libraries needed for file handling, parsing XML, and data manipulation
import os
from glob import glob  # extracting paths of each XML file
import pandas as pd  # preparing structured DataFrame
from xml.etree import ElementTree as et  # parsing XML files
from functools import reduce  # flattening nested lists

In [None]:
# Step 2: Suppressing warnings for clean output
import warnings
warnings.filterwarnings('ignore')

# Step 3: Collecting paths of all XML files from dataset folder
xmlfiles = glob(base_path + '/data_images/*.xml')  # fetching all .xml files from drive
xmlfiles = list(map(lambda x: x.replace('\\', '/'), xmlfiles))  # normalizing path separators
print(f'Total XML files found: {len(xmlfiles)}')

# Step 4: Displaying few sample XML paths
print("\nXML file paths:")
for i, path in enumerate(xmlfiles[:5], 1):
    print(f"{i}. {path}")

Total XML files found: 50

XML file paths:
1. /content/drive/MyDrive/CustomOCR/data_images/thyrocare_0_4618.xml
2. /content/drive/MyDrive/CustomOCR/data_images/thyrocare_0_421.xml
3. /content/drive/MyDrive/CustomOCR/data_images/thyrocare_0_1915.xml
4. /content/drive/MyDrive/CustomOCR/data_images/thyrocare_0_447.xml
5. /content/drive/MyDrive/CustomOCR/data_images/thyrocare_0_4748.xml


In [None]:
# Step 5: Defining function to parse XML content
# I'm extracting: filename, image size, object class and bounding box coordinates
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    image_name = root.find('filename').text
    width = root.find('size/width').text
    height = root.find('size/height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
    return parser

# Step 6: Parsing all XML files
# I'm calling the extract_text function on every XML file
parser_all = list(map(extract_text, xmlfiles))
print(f'Total parsed entries (file-wise): {len(parser_all)}')

print("\nShowing 4 annotations from first file:")
for i, annotation in enumerate(parser_all[0][:4], 1):
    print(f"{i}. {annotation}")

Total parsed entries (file-wise): 50

Showing 4 annotations from first file:
1. ['thyrocare_0_4618.jpg', '939', '631', 'Test Name', '77', '356', '401', '458']
2. ['thyrocare_0_4618.jpg', '939', '631', 'Value', '494', '559', '391', '448']
3. ['thyrocare_0_4618.jpg', '939', '631', 'Units', '582', '641', '387', '445']
4. ['thyrocare_0_4618.jpg', '939', '631', 'Reference Range', '641', '701', '390', '445']


In [None]:
# Step 8: Flattening the parsed list into a single list of annotations
data = reduce(lambda x, y: x + y, parser_all)
print(f'Total number of bounding boxes extracted: {len(data)}')

# Step 9: Previewing a few annotations
print("\nPreview of parsed bounding boxes:")
for i, bbox in enumerate(data[:3], 1):
    print(f"{i}. {bbox}")

Total number of bounding boxes extracted: 200

Preview of parsed bounding boxes:
1. ['thyrocare_0_4618.jpg', '939', '631', 'Test Name', '77', '356', '401', '458']
2. ['thyrocare_0_4618.jpg', '939', '631', 'Value', '494', '559', '391', '448']
3. ['thyrocare_0_4618.jpg', '939', '631', 'Units', '582', '641', '387', '445']


In [None]:
# Step 10: Creating DataFrame from the extracted annotations
df = pd.DataFrame(data, columns=['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])
print('✅ DataFrame created successfully!\n')
df.head()

✅ DataFrame created successfully!



Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,thyrocare_0_4618.jpg,939,631,Test Name,77,356,401,458
1,thyrocare_0_4618.jpg,939,631,Value,494,559,391,448
2,thyrocare_0_4618.jpg,939,631,Units,582,641,387,445
3,thyrocare_0_4618.jpg,939,631,Reference Range,641,701,390,445
4,thyrocare_0_421.jpg,1068,635,Test Name,79,427,343,560


# 🔍 Key Findings after Step 10

- ✅ Successfully parsed all XML files and created a structured DataFrame
- 📄 Each row contains:
  - 📁 Filename
  - 🖼️ Image dimensions
  - 🏷️ Object name
  - 📐 Bounding box coordinates
- 🚀 Ready for type conversion and normalization

In [None]:
# Step 11: Checking dataset information before conversion
print("DataFrame shape:")
print(df.shape)

print("\nValue counts of 'name' column:")
print(df['name'].value_counts())

print("\nDataFrame info:")
df.info()

DataFrame shape:
(200, 8)

Value counts of 'name' column:
name
Test Name          50
Value              50
Units              50
Reference Range    50
Name: count, dtype: int64

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  200 non-null    object
 1   width     200 non-null    object
 2   height    200 non-null    object
 3   name      200 non-null    object
 4   xmin      200 non-null    object
 5   xmax      200 non-null    object
 6   ymin      200 non-null    object
 7   ymax      200 non-null    object
dtypes: object(8)
memory usage: 12.6+ KB


In [None]:
# Step 12: Converting numeric columns from string to integer
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
print("\n✅ Successfully converted width/height/bbox to integers")
print(df.info())


✅ Successfully converted width/height/bbox to integers
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  200 non-null    object
 1   width     200 non-null    int64 
 2   height    200 non-null    int64 
 3   name      200 non-null    object
 4   xmin      200 non-null    int64 
 5   xmax      200 non-null    int64 
 6   ymin      200 non-null    int64 
 7   ymax      200 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 12.6+ KB
None


# 🔍 Key Findings after Step 12

- 🔢 All coordinate fields are now integers
- ✅ Data is ready for normalization

In [None]:
# Step 13: Normalizing bounding box coordinates
# I'm calculating center_x, center_y, width and height relative to image size
df['center_x'] = ((df['xmax'] + df['xmin']) / 2) / df['width']
df['center_y'] = ((df['ymax'] + df['ymin']) / 2) / df['height']
df['w'] = (df['xmax'] - df['xmin']) / df['width']
df['h'] = (df['ymax'] - df['ymin']) / df['height']

print("✅ Normalization completed\n")
# df[['filename', 'name', 'center_x', 'center_y', 'w', 'h']].head()
df.head()

✅ Normalization completed



Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,thyrocare_0_4618.jpg,939,631,Test Name,77,356,401,458,0.230564,0.680666,0.297125,0.090333
1,thyrocare_0_4618.jpg,939,631,Value,494,559,391,448,0.560703,0.664818,0.069223,0.090333
2,thyrocare_0_4618.jpg,939,631,Units,582,641,387,445,0.651225,0.659271,0.062833,0.091918
3,thyrocare_0_4618.jpg,939,631,Reference Range,641,701,390,445,0.71459,0.661648,0.063898,0.087163
4,thyrocare_0_421.jpg,1068,635,Test Name,79,427,343,560,0.236891,0.711024,0.325843,0.341732


# 🔍 Key Findings after Step 13

- 📏 Bounding boxes are normalized between 0 and 1
- 🤖 Suitable format for training with YOLO

In [None]:
# Step 14: Splitting data into train and test based on filenames
from sklearn.model_selection import train_test_split
train_files, test_files = train_test_split(df['filename'].unique(), test_size=0.2, random_state=42)

In [None]:
train_df = df[df['filename'].isin(train_files)]
test_df = df[df['filename'].isin(test_files)]

print(f"\nTraining samples: {len(train_df)}")
print(f"Testing samples: {len(test_df)}")


Training samples: 160
Testing samples: 40


# 🔍 Key Findings after Step 14

- 🔀 Dataset split into train and test using an 80/20 ratio
- 🗂️ Unique filenames used to avoid data leakage

In [None]:
# Step 15: Assign ID to object names
def label_encoding(x):
    labels = {'Test Name':0, 'Value':1, 'Units':2, 'Reference Range':3}
    return labels[x]

train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,thyrocare_0_4618.jpg,939,631,Test Name,77,356,401,458,0.230564,0.680666,0.297125,0.090333,0
1,thyrocare_0_4618.jpg,939,631,Value,494,559,391,448,0.560703,0.664818,0.069223,0.090333,1
2,thyrocare_0_4618.jpg,939,631,Units,582,641,387,445,0.651225,0.659271,0.062833,0.091918,2
3,thyrocare_0_4618.jpg,939,631,Reference Range,641,701,390,445,0.71459,0.661648,0.063898,0.087163,3
4,thyrocare_0_421.jpg,1068,635,Test Name,79,427,343,560,0.236891,0.711024,0.325843,0.341732,0
5,thyrocare_0_421.jpg,1068,635,Value,678,738,342,552,0.662921,0.703937,0.05618,0.330709,1
6,thyrocare_0_421.jpg,1068,635,Units,788,868,345,550,0.775281,0.704724,0.074906,0.322835,2
7,thyrocare_0_421.jpg,1068,635,Reference Range,898,979,347,550,0.878745,0.706299,0.075843,0.319685,3
8,thyrocare_0_1915.jpg,635,1043,Test Name,2,172,233,263,0.137008,0.237776,0.267717,0.028763,0
9,thyrocare_0_1915.jpg,635,1043,Value,404,437,233,271,0.662205,0.241611,0.051969,0.036433,1


In [None]:
# Step 16: Save image and labels to text
from shutil import move
train_folder = base_path + '/data_images/train'
test_folder = base_path + '/data_images/test'
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

print(f"Train folder created at: {train_folder}")
print(f"Test folder created at: {test_folder}")

print(f"\nNumber of training files: {len(groupby_obj_train)}")
print(f"Number of testing files: {len(groupby_obj_test)}")

Train folder created at: /content/drive/MyDrive/CustomOCR/data_images/train
Test folder created at: /content/drive/MyDrive/CustomOCR/data_images/test

Number of training files: 40
Number of testing files: 10


In [None]:
# Step 17: Save each image in train/test folder and repective labels in .txt

def save_data(filename, folder_path, group_obj):
    src = os.path.join(base_path, 'datasets', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

pd.Series(groupby_obj_train.groups.keys()).apply(save_data, args=(train_folder, groupby_obj_train))
pd.Series(groupby_obj_test.groups.keys()).apply(save_data, args=(test_folder, groupby_obj_test))

print("✅ All images and label files saved successfully.")
print("🎯 Unique labels:", train_df['name'].unique())

✅ All images and label files saved successfully.
🎯 Unique labels: ['Test Name' 'Value' 'Units' 'Reference Range']


In [None]:
print(f"✅ Saved {len(groupby_obj_train)} training images and label files to {train_folder}")
print(f"✅ Saved {len(groupby_obj_test)} testing images and label files to {test_folder}")

✅ Saved 40 training images and label files to /content/drive/MyDrive/CustomOCR/data_images/train
✅ Saved 10 testing images and label files to /content/drive/MyDrive/CustomOCR/data_images/test
