In [13]:
    import json
    import pytesseract
    import cv2
    import numpy as np
    import sys
    import re
    import os
    from PIL import Image, ImageOps
    import ftfy
    import io

In [14]:
def _text_blob_mask(image):
    """Return a single-channel uint8 mask of the mid-sized contours in ``image``.

    The image is converted to grey, lightly blurred and adaptively thresholded
    (inverted), then every contour whose area lies strictly between 20 and
    45000 pixels is filled with 255 on an otherwise black mask.

    Args:
        image: image array accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``,
            e.g. the result of ``cv2.imread``.

    Returns:
        2-D ``numpy.uint8`` array with the same height and width as ``image``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    adaptive = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 15, 4
    )

    # findContours returns either (contours, hierarchy) or
    # (image, contours, hierarchy) depending on the OpenCV version.
    found = cv2.findContours(adaptive, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Drawing straight onto a one-channel mask gives the same non-zero pixels
    # as filling a 3-channel canvas with white and converting it to grey.
    mask = np.zeros(gray.shape, dtype=np.uint8)
    for contour in contours:
        if 20 < cv2.contourArea(contour) < 45000:
            cv2.drawContours(mask, [contour], -1, 255, -1)
    return mask


def _angle_from_mask(mask, strips):
    """Pick an angle by comparing blob pixels in the two outermost strips.

    The longer side of the mask is divided into ``strips`` equal parts and only
    the first and the last part are compared.

    Args:
        mask: 2-D mask as produced by ``_text_blob_mask``.
        strips: number of parts the longer side is divided into (4 compares
            the outer quarters, 2 compares the two halves).

    Returns:
        For a mask wider than tall: 0 if the left strip has no more non-zero
        pixels than the right strip, otherwise 180.
        Otherwise: 90 if the top strip has no more non-zero pixels than the
        bottom strip, otherwise 270.
    """
    h, w = mask.shape

    # Horizontal
    if w > h:
        left_pixels = cv2.countNonZero(mask[:, :w // strips])
        right_pixels = cv2.countNonZero(mask[:, (strips - 1) * w // strips:])
        return 0 if left_pixels <= right_pixels else 180

    # Vertical
    top_pixels = cv2.countNonZero(mask[:h // strips, :])
    bottom_pixels = cv2.countNonZero(mask[(strips - 1) * h // strips:, :])
    return 90 if top_pixels <= bottom_pixels else 270


def detect_angle_front(image):
    """Estimate the rotation angle of the card's front image.

    Compares the blob-pixel counts of the outer quarters of the image
    (left/right for landscape, top/bottom for portrait).

    Args:
        image: BGR image array, e.g. from ``cv2.imread``.

    Returns:
        0 or 180 for an image wider than tall, 90 or 270 otherwise.
    """
    return _angle_from_mask(_text_blob_mask(image), strips=4)


def detect_angle_back(image):
    """Estimate the rotation angle of the card's back image.

    Same procedure as ``detect_angle_front`` but the two halves of the image
    are compared instead of the outer quarters.

    Args:
        image: BGR image array, e.g. from ``cv2.imread``.

    Returns:
        0 or 180 for an image wider than tall, 90 or 270 otherwise.
    """
    return _angle_from_mask(_text_blob_mask(image), strips=2)



In [15]:
def validate_aadhaar_numbers(candidates):
    """Return the first candidate that passes the Verhoeff check-digit test.

    Each candidate is a string of digits, optionally separated by spaces. The
    check digit is recomputed from all digits except the last one and compared
    with the candidate's final character.

    Args:
        candidates: iterable of number strings (e.g. ``"1234 5678 9010"``).

    Returns:
        The first candidate (unchanged) whose last digit matches the computed
        check digit, or ``None`` when no candidate matches.
    """
    # Verhoeff multiplication table.
    multiply = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
        [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
        [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
        [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
        [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
        [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
        [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
        [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
        [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
    ]
    # Verhoeff permutation table (one row per position modulo 8).
    permute = [
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
        [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
        [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
        [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
        [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
        [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
        [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
    ]
    # Inverse table: maps the running checksum to the expected check digit.
    inverse = [0, 4, 3, 2, 1, 5, 6, 7, 8, 9]

    for number in candidates:
        check_char = number[-1]
        payload = [int(ch) for ch in number if ch != ' ']
        del payload[-1]  # the check digit itself is not part of the checksum

        # Walk the payload right-to-left; positions start at 1 because
        # position 0 is reserved for the (removed) check digit.
        checksum = 0
        for position, digit in enumerate(reversed(payload), start=1):
            checksum = multiply[checksum][permute[position % 8][digit]]

        if int(check_char) == inverse[checksum]:
            return number

    return None
        

In [16]:
def clean_words(name):
    """Replace the digits 8, 0, 6, 1 and 5 in ``name`` with B, D, G, I and S.

    Presumably used to undo digit-for-letter OCR mistakes in a person's name;
    every other character is returned unchanged.

    Args:
        name: text to clean.

    Returns:
        A new string with the five digits replaced by their letters.
    """
    digit_to_letter = str.maketrans({
        "8": "B",
        "0": "D",
        "6": "G",
        "1": "I",
        "5": "S",
    })
    return name.translate(digit_to_letter)
        

In [25]:
def adhaar_read_data(textFront,textBack):
    """Parse the OCR text of an Aadhaar card into a dictionary of fields.

    Args:
        textFront: OCR text of the front side of the card.
        textBack: OCR text of the back side of the card.

    Returns:
        dict with the keys ``'Name'``, ``'DOB'``, ``'Aadhaar Number'``,
        ``'Sex'``, ``'ID Type'``, ``'Address'``, ``'Fathers Name'``,
        ``'Enrollment'`` and ``'VID'``. A field that could not be found is
        ``None`` (``'Enrollment'`` and ``'VID'`` are always ``None``;
        ``'Sex'`` is always ``"FEMALE"`` or ``"MALE"``).
    """
    # Strip every run of non-ASCII characters before matching.
    textFront = re.sub(r'[^\x00-\x7F]+', '', textFront)
    textBack = re.sub(r'[^\x00-\x7F]+', '', textBack)

    # Address: everything after an "Address" label (or one of its common
    # misreads) up to and including the last 6-digit number on the back side.
    address = None
    address_match = re.search(
        r'(?:(Address|Adress|Addres|Adres|Addre55):?)(.*(\d{6}))',
        textBack,
        re.M | re.S,
    )
    if address_match is not None:
        address = address_match.group(2)

    # Name and date of birth: the first line containing "DOB" carries the
    # date, the line directly above it is taken as the name.
    name = None
    dob = None
    lines = textFront.split('\n')
    for index, line in enumerate(lines):
        if 'DOB' in line:
            # Without this guard a DOB on the very first line would wrap
            # around and pick the *last* line of the text as the name.
            if index > 0:
                name = lines[index - 1]
            dob = re.sub(r'.*DOB:?', '', line)
            break

    aadhaar_candidates = re.findall(r'\d{4} \d{4} \d{4}', textFront + textBack)
    aadharNumber = validate_aadhaar_numbers(aadhaar_candidates)

    # NOTE: any text that does not contain "female" is reported as MALE,
    # including text in which no gender word was recognised at all.
    sex = "FEMALE" if 'female' in textFront.lower() else "MALE"

    if name is not None:
        name = clean_words(name)

    if dob is not None:
        dob = dob.strip()
        # Characters that the OCR produces in place of the date separator.
        for misread_slash in ('l', 'L', 'I', 'i', '|'):
            dob = dob.replace(misread_slash, '/')
        dob = dob.replace('"', '/1')
        dob = dob.replace(" ", "")

    # Father's name: the text between the first ':' and the following ','
    # inside the address, when both an address and such a span exist.
    fathersName = None
    if address is not None:
        father_match = re.search(r"(?::)(.*?)(?:,)", address)
        if father_match is not None:
            fathersName = father_match.group(1).strip()

    data = {}
    data['Name'] = name
    data['DOB'] = dob
    data['Aadhaar Number'] = aadharNumber
    data['Sex'] = sex
    data['ID Type'] = 'Aadhaar'
    data['Address'] = address
    data['Fathers Name'] = fathersName
    data['Enrollment'] = None
    data['VID'] = None
    return data


In [22]:

def _ocr_and_dump(image, dump_path, **ocr_kwargs):
    """OCR ``image``, save the raw text to ``dump_path`` and return the fixed text.

    Args:
        image: image array passed to ``pytesseract.image_to_string``.
        dump_path: file that receives the raw OCR text (UTF-8, overwritten).
        **ocr_kwargs: extra keyword arguments for ``image_to_string``
            (e.g. ``config='--psm 3'``).

    Returns:
        The OCR text after ``ftfy.fix_text`` and ``ftfy.fix_encoding``.
    """
    text = pytesseract.image_to_string(image, lang='kan+eng+hin', **ocr_kwargs)
    # The context manager guarantees the handle is closed; the text is already
    # in memory, so there is no need to read the file back.
    with open(dump_path, 'w', encoding='utf-8') as dump:
        dump.write(text)
    text = ftfy.fix_text(text)
    return ftfy.fix_encoding(text)


def get_details_img(pathF,pathB):
    """Run OCR on both sides of an Aadhaar card and return the parsed fields.

    The back image is EXIF-transposed and rotated by the detected angle before
    OCR. The front image's angle is detected and printed but the image itself
    is OCR'd unrotated. The raw OCR text is written to ``output1.txt`` (front)
    and ``output2.txt`` (back) in the current working directory.

    Args:
        pathF: path of the front-side image.
        pathB: path of the back-side image.

    Returns:
        The dictionary produced by ``adhaar_read_data``.

    Raises:
        OSError: if OpenCV cannot read one of the two images.
    """
    image_front = cv2.imread(pathF)
    image_back = cv2.imread(pathB)
    # cv2.imread reports failure by returning None instead of raising; fail
    # here with a clear message rather than deep inside the angle detection.
    for path, image in ((pathF, image_front), (pathB, image_back)):
        if image is None:
            raise OSError("could not read image: {}".format(path))

    angle_front = detect_angle_front(image_front)
    angle_back = detect_angle_back(image_back)
    print(angle_front)
    print(angle_back)

    # --- back side ---
    im = ImageOps.exif_transpose(Image.open(pathB))
    if angle_back != 0:
        im = im.rotate(angle_back, Image.NEAREST, expand=1)
    # The colour denoiser needs a multi-channel image; greyscale or palette
    # files are converted, RGB/RGBA images are passed through untouched.
    if im.mode not in ("RGB", "RGBA"):
        im = im.convert("RGB")
    back_denoised = cv2.fastNlMeansDenoisingColored(np.array(im), None, 10, 10, 7, 15)
    extractedbacktext = _ocr_and_dump(back_denoised, 'output2.txt')

    # --- front side ---
    # The array decoded above is reused; the angle detection does not modify it.
    front_denoised = cv2.fastNlMeansDenoisingColored(image_front, None, 10, 10, 7, 15)
    extractedfronttext = _ocr_and_dump(front_denoised, 'output1.txt', config='--psm 3')

    data = adhaar_read_data(extractedfronttext, extractedbacktext)
    return data
#     try:
#         to_unicode = unicode
#     except NameError:
#         to_unicode = str
#     with io.open('info.json', 'w', encoding='utf-8') as outfile:
#         data = json.dumps(data, indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False)
#         outfile.write(to_unicode(data))

#     with open('info.json', encoding = 'utf-8') as data:
#         data_loaded = json.load(data)

#     print("\n---------- ADHAAR Details ----------")
#     print("\nADHAAR Number: ",data_loaded['Adhaar Number'])
#     print("\nName: ",data_loaded['Name'])
#     print("\nDate Of Birth: ",data_loaded['Date of Birth'])
#     print("\nSex: ",data_loaded['Sex'])
#     print("\nAddress:",data_loaded['Address'])
#     print("\n------------------------------------")