# Date extraction notebook




In [1]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

!pip install pytesseract        
!sudo apt install tesseract-ocr 

Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.00~git2288-10f4998a-2).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [2]:
import numpy as np

#SOURCE CODE : https://becominghuman.ai/how-to-automatically-deskew-straighten-a-text-image-using-opencv-a0c30aed83df

# Calculate skew angle of an image
def getSkewAngle(cvImage) -> float:
    """Estimate the skew angle (in degrees) of the text in an image.

    The image is converted to gray scale, blurred, thresholded (inverted,
    Otsu) and dilated with a wide kernel so that characters merge into
    line/paragraph blobs. The minimum-area rectangle around the largest blob
    provides the angle.

    Args:
        cvImage: image accepted by cv2.cvtColor with COLOR_BGR2GRAY
            (presumably a BGR array as returned by cv2.imread).

    Returns:
        The negated, normalised rectangle angle, within [-45, 45].
        0.0 is returned when no contour is found (e.g. a blank image).
    """
    # Prep image: convert to gray scale, blur, and threshold.
    # cvtColor returns a new array, so the input is left untouched.
    gray = cv2.cvtColor(cvImage, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Apply dilate to merge text into meaningful lines/paragraphs.
    # Use larger kernel on X axis to merge characters into single line, cancelling out any spaces.
    # But use smaller kernel on Y axis to separate between different blocks of text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=5)

    # Find all contours. findContours returns (contours, hierarchy) on
    # OpenCV 2.x/4.x but (image, contours, hierarchy) on 3.x; indexing from
    # the end picks the contours in both cases.
    contours = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Nothing to measure: report "no skew" instead of failing on an empty list.
    if len(contours) == 0:
        return 0.0

    # Find largest contour and surround in min area box
    largestContour = max(contours, key=cv2.contourArea)
    minAreaRect = cv2.minAreaRect(largestContour)

    # Determine the angle. Convert it to the value that was originally used to obtain skewed image
    angle = minAreaRect[-1]
    if angle < -45:
        # Legacy convention: angle reported in [-90, 0).
        angle = 90 + angle
    elif angle > 45:
        # NOTE(review): newer OpenCV releases (4.5.1+, to be confirmed against
        # the installed version) report the angle in (0, 90]; fold it back so a
        # nearly horizontal block does not yield a ~90 degree correction.
        angle = angle - 90
    return -1.0 * angle
    
def rotate_image(image, angle):
  """Rotate an image around its centre, keeping the original canvas size.

  Args:
    image: array whose first two shape entries are (height, width).
    angle: rotation angle passed to cv2.getRotationMatrix2D (scale fixed at 1.0).

  Returns:
    The result of cv2.warpAffine with linear interpolation.
  """
  # shape is (height, width[, channels]); OpenCV wants (width, height)
  width_height = image.shape[1::-1]
  pivot = tuple(np.array(width_height) / 2)
  transform = cv2.getRotationMatrix2D(pivot, angle, 1.0)
  return cv2.warpAffine(image, transform, width_height, flags=cv2.INTER_LINEAR)

## Date extraction function

In [97]:
import re
from datetime import datetime

def readDate(text):
  """Extract the date assumed to be the expiry date from OCR text.

  The text is normalised (anything that is not an ASCII letter, a digit or a
  space becomes a space), scanned with a fixed set of date patterns, and
  every match that datetime.strptime accepts is kept.

  Supported layouts after normalisation:
    "dd mm yy", "dd mm yyyy", "dd MMM" (current year assumed),
    "MMM yyyy" (day defaults to 1) and "dd MMM yyyy".

  Args:
    text: raw text (e.g. OCR output). None or an empty string is accepted.

  Returns:
    A datetime, or None when no valid date is found. When several dates are
    found, the last one of the last pattern (in the order listed below) that
    produced a valid date is returned.

  Note:
    Month abbreviations are parsed by strptime's "%b", which follows the
    active locale.
  """
  if not text:
    return None

  # NORMALISING TEXT
  # Remove punctuation, so that "14/03/2021" or "14.03.2021" read "14 03 2021"
  textNormalised = re.sub(r"[^a-zA-Z\d ]", " ", text)

  # If the year is missing we assume it is the current year
  currentYear = datetime.now().year

  # SEARCHING FOR DATES
  # (regex, strptime format, year missing from the match)
  # See possible formats: https://www.programiz.com/python-programming/datetime/strptime
  # The (?<!\d) / (?!\d) guards keep a pattern from matching inside a longer
  # run of digits: without them "2021 03 14" is read as "21 03 14" and
  # "14 03 2021" also yields the bogus "14 03 20".
  datePatterns = [
      (r"(?<!\d)\d\d \d\d \d\d(?!\d)", "%d %m %y", False),
      (r"(?<!\d)\d\d \d\d \d\d\d\d(?!\d)", "%d %m %Y", False),
      # Only when no four-digit year follows, otherwise the real year would
      # be replaced by the current one
      (r"(?<!\d)\d\d [a-z]{3}(?! \d\d\d\d(?!\d))", "%d %b %Y", True),
      (r"[a-z]{3} \d\d\d\d(?!\d)", "%b %Y", False),
      # Listed last so that a full "dd MMM yyyy" wins over its "MMM yyyy" tail
      (r"(?<!\d)\d\d [a-z]{3} \d\d\d\d(?!\d)", "%d %b %Y", False),
  ]

  dateList = []
  for regex, dateFormat, yearMissing in datePatterns:
    for candidate in re.findall(regex, textNormalised, re.IGNORECASE):
      if yearMissing:
        candidate = candidate + " " + str(currentYear)

      # Try to convert the match to a datetime...
      try:
        dateList.append(datetime.strptime(candidate, dateFormat))
      # ... and discard invalid dates that can't be converted
      except ValueError:
        continue

  # Return None if no date found
  if not dateList:
    return None

  # We assume the last date found corresponds to the expiry date.
  # NOTE(review): "last" follows the pattern order above, then the position
  # in the text, not the position in the text alone.
  return dateList[-1]


## Testing

In [98]:
import glob
import cv2
import pytesseract
import numpy as np

# Process the images in a stable (alphabetical) order
files = sorted(glob.glob('filesSelection/*.jpg'))

for file in files:
  image = cv2.imread(file)

  # cv2.imread does not raise on a missing or corrupt file, it returns None;
  # report it instead of crashing inside getSkewAngle
  if image is None:
    print(file + ": could not be read, skipped")
    continue

  # Straighten the text before handing the image to the OCR engine
  deskewed = rotate_image(image, -getSkewAngle(image))

  extractedInformation = pytesseract.image_to_string(deskewed)

  print(readDate(extractedInformation))


None
None
None
None
2021-03-14 00:00:00
None
None
None
None
None
None
None
None
2021-03-21 00:00:00
None
None
None
None
None
None
None
None
None
None
2011-10-04 00:00:00
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
2020-12-30 00:00:00
None
None
None
None
None
None
None
None
None
None
None
None
None
