In [None]:
# Environment setup (notebook shell escapes): refresh the apt package index,
# then install two system packages (poppler-utils, tesseract-ocr) and two
# Python packages (pytesseract, pdf2image). pytesseract and pdf2image are
# imported by later cells; the system packages are presumably their native
# backends — verify against the pdf2image / pytesseract documentation.
# NOTE(review): no versions are pinned, so a re-run may pick up newer releases.
!apt-get update
!apt-get install poppler-utils
!pip install pytesseract
!apt-get install tesseract-ocr
!pip install pdf2image


from PIL import Image
import pdf2image
import io
import pandas as pd


In [None]:
import re
# OCR output can contain stray punctuation that is not part of a course name.
def correct_course(text):
  """Return ``text`` with OCR noise characters removed.

  Hyphens, backslashes, apostrophes, pipes, semicolons, square brackets,
  equals signs, underscores and the left curly quote are dropped; every other
  character is kept in its original order.
  """
  noise = set("-\\'|;[]=_‘")
  return "".join(ch for ch in text if ch not in noise)

string = "sdlf--\\|;dfd'df["
print(correct_course(string))

In [None]:
import re
# OCR sometimes drops the decimal point of a credit value, e.g. "1.0" is read as "10".
def correct_credit(text):
  """Re-insert a lost decimal point in front of the last character of ``text``.

  Strings that already contain ``.`` or that are at most one character long
  are returned unchanged.
  """
  if '.' in text or len(text) <= 1:
    return text
  whole, fraction = text[:-1], text[-1]
  return whole + '.' + fraction

print(correct_credit("20"))

In [None]:
import re
# Check whether a grade token was extracted correctly by OCR.
def correct_grads(text):
  """Return ``text`` if it is a valid grade, otherwise the sentinel ``"-1"``.

  A valid grade is either exactly one letter out of ``ABCFPN`` or a whole
  number between 0 and 100 (inclusive).

  Args:
    text: the grade token as a string.

  Returns:
    ``text`` unchanged when valid, ``"-1"`` otherwise.
  """
  valid_letters = "ABCFPN"
  # Require a single character: a plain substring test would also accept
  # "" and multi-letter fragments such as "AB" or "PN".
  is_letter_grade = len(text) == 1 and text in valid_letters
  # isdecimal() only accepts characters that int() can parse; isdigit() also
  # accepts e.g. superscript digits, for which int() raises ValueError.
  is_score = text.isdecimal() and 0 <= int(text) <= 100
  if is_letter_grade or is_score:
    return text
  return "-1"

print(correct_grads("120"))

In [None]:
import re
def preprocess_page(text):
    """Extract the course rows from the OCR text of one transcript page.

    A table row is expected to look like
    ``course_name hours credit grade semester`` (for example
    ``Math 36 1.0 | 99 1/2``), possibly with stray OCR characters such as
    ``|``, ``]`` or ``-`` between the fields. A page holds two tables that are
    scanned side by side, so one text line can contain up to two rows.

    Returns a list of ``[course, credit, grade]`` lists: first the rows taken
    from the first match of every line (left table), then the rows taken from
    the second match (right table).
    """
    # Trim every line, drop the blank ones, then remove the "|" separators.
    kept = [stripped for stripped in (raw.strip() for raw in text.splitlines()) if stripped]
    cleaned = "\n".join(kept).strip().replace("|", '')

    # Groups: course name, hours, credit, grade (number or one capital letter),
    # semester written as digit/digit.
    row_pattern = r"(\D+)(\d+)\D+(\d+\.\d|\d+)\D+(\d+|[A-Z])\D+(\d\/\d)"

    def build_row(fields):
        # fields = (course, hours, credit, grade, semester); hours and semester are not kept
        course = correct_course(fields[0])
        credit = correct_credit(fields[2])
        grads = correct_grads(fields[3])
        return [course, credit, grads]

    left_rows, right_rows = [], []
    for candidate in cleaned.split("\n"):
        # The pattern requires a "/" (semester field), so skip lines without one early.
        if "/" not in candidate:
            continue

        found = re.findall(row_pattern, candidate)
        if not found:
            continue

        print(found)
        left_rows.append(build_row(found[0]))
        # Exactly two matches means the line also carries a row of the right table.
        if len(found) == 2:
            right_rows.append(build_row(found[1]))

    return left_rows + right_rows

In [None]:
from sqlalchemy.engine.result import Result
import pytesseract
import cv2
import numpy as np
import imutils

def extracttxt_pdf(pdf_path):
  """OCR every page of a scanned PDF transcript and collect its course rows.

  The raw OCR text of all pages is also written to a ``.txt`` file with the
  same path as the PDF but with the extension replaced.

  Args:
    pdf_path: path of the PDF file to read.

  Returns:
    A list with the rows returned by ``preprocess_page`` for every page,
    concatenated in page order.
  """
  import os  # local import: keeps this cell independent of other cells' imports

  # splitext strips only the final extension, so "./scans/Transcript.pdf" and
  # "my.transcript.pdf" map to the expected .txt path (cutting at the first
  # "." would produce ".txt" and "my.txt").
  txt_file = os.path.splitext(pdf_path)[0] + ".txt"
  # Use pdf2image to convert the PDF file to a list of PIL images
  pages = pdf2image.convert_from_path(pdf_path)

  # Contrast enhancement settings are the same for every page, so build the
  # CLAHE object once instead of once per page.
  clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

  result = []
  # OCR output can contain non-ASCII characters; write the dump as UTF-8 so it
  # does not depend on the platform's default encoding.
  with open(txt_file, 'w', encoding='utf-8') as file:
    for page in pages:
      # PIL image -> NumPy array (BGR channel order) -> grayscale
      image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
      gray_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

      # NOTE(review): a 5x5 Gaussian blur used to be computed here, but its
      # result was never passed on, so it has been removed. Binarization,
      # median-blur denoising, deskewing and contrast stretching are likewise
      # disabled. Confirm whether any of them should feed the OCR step.
      enhanced_image = clahe.apply(gray_img)

      # Use pytesseract to extract text from the page
      text = pytesseract.image_to_string(enhanced_image)
      result += preprocess_page(text)

      # keep the raw OCR text for inspection
      file.write(text)
  return result

res = extracttxt_pdf("Transcript.pdf")


In [None]:
def generate_excel(path, data):
  """Write the extracted rows to ``path`` as a CSV file.

  ``data`` is a sequence of 3-item rows that are stored under the columns
  ``Course``, ``Credit`` and ``Grads``. Despite the function name the output
  is CSV (written with ``DataFrame.to_csv``), without the index column.
  """
  table = pd.DataFrame(data, columns=["Course", "Credit", "Grads"])
  table.to_csv(path, index=False)

In [None]:
def to_nor(row):
  """Map one transcript grade to the Norwegian A-F letter scale.

  Args:
    row: a mapping or DataFrame row whose ``'Grads'`` entry is either a letter
      grade (``A``, ``B``, ``C``, ``F``, ``P``, ``N``) or a score given as a
      number or numeric string.

  Returns:
    One of ``'A'``, ``'B'``, ``'C'``, ``'D'``, ``'E'``, ``'F'``.

  Raises:
    ValueError: if the grade is neither a known letter nor convertible to float.
  """
  grade = row['Grads']
  if isinstance(grade, str):
    # tolerate padded cells such as "         B"
    grade = grade.strip()

  if grade == 'P':
    return 'A'
  if grade == 'N':
    return 'F'
  # 'F' has to be handled as a letter: float('F') would raise ValueError.
  if grade == 'A' or grade == 'B' or grade == 'C' or grade == 'F':
    return grade

  # Numeric score: convert once, then pick the first band whose lower bound it reaches.
  score = float(grade)
  bands = ((89.5, 'A'), (77.5, 'B'), (65.5, 'C'), (53.5, 'D'), (41.5, 'E'))
  for lower_bound, letter in bands:
    if score >= lower_bound:
      return letter
  # Everything below 41.5 fails, including a negative score such as "-1".
  return 'F'

In [None]:
# Pad a cell value on the left so that it is right-aligned in a 10-character field
def right_align(val):
    """Return ``str(val)`` right-justified with spaces to a minimum width of 10."""
    return format(str(val), ">10")



In [None]:
def compute_nor_grad(path):
  """Add Norwegian letter and numeric grades to the CSV at ``path``.

  Reads the CSV, derives the ``Nor_Grad`` column by calling ``to_nor`` on
  every row, maps it to ``Nor_Grad_Num`` (A=6 down to F=1), passes every cell
  through ``right_align`` and writes the table back to the same path.

  Args:
    path: path of the CSV file; it is overwritten in place.
  """
  df = pd.read_csv(path)
  # Build the column row by row instead of df.apply(axis=1): for a table
  # without rows, apply returns a DataFrame, which cannot be assigned to a
  # single column.
  df['Nor_Grad'] = [to_nor(row) for _, row in df.iterrows()]
  # Define a dictionary to map Norwegian grades to numerical values
  norwegian_grades_map = {'A': 6, 'B': 5, 'C': 4, 'D': 3, 'E': 2, 'F': 1}

  # Compute the numerical value of each Norwegian grade
  numerical_grades = [norwegian_grades_map[grade] for grade in df['Nor_Grad']]
  print(numerical_grades)
  df['Nor_Grad_Num'] = numerical_grades
  print(df.head())
  # Apply right_align to all cells. DataFrame.applymap is deprecated in favour
  # of DataFrame.map (pandas 2.1+); use whichever this pandas version provides.
  elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
  df = elementwise(right_align)
  # Save the DataFrame back as CSV (no index column)
  df.to_csv(path, index=False)

In [None]:
# Generate a CSV file from the scanned bachelor transcript and compute the Norwegian grades
def processTranscript(filepath):
  """Run the whole pipeline for one scanned transcript PDF.

  Extracts the course rows with ``extracttxt_pdf``, writes them with
  ``generate_excel`` to a CSV that has the same path as the PDF but a
  ``.csv`` extension, then lets ``compute_nor_grad`` add the Norwegian grades
  to that CSV.

  Args:
    filepath: path of the PDF transcript.
  """
  import os  # local import: keeps this cell independent of other cells' imports

  # Replace only the final extension; cutting at the first "." breaks paths
  # such as "./Transcript.pdf" or "my.transcript.pdf".
  csv_path = os.path.splitext(filepath)[0] + ".csv"
  res = extracttxt_pdf(filepath)
  generate_excel(csv_path, res)
  compute_nor_grad(csv_path)

In [None]:
# Process the bachelor transcript: runs OCR on Transcript.pdf and writes
# Transcript.csv (course, credit, grade plus the Norwegian grade columns).
processTranscript("Transcript.pdf")

# New Section