### A Few Things to Know...
- Tesseract NEEDS to be installed for this to work. I've included the installer in the repo and the link to find it is here:
    - https://github.com/UB-Mannheim/tesseract/wiki 
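    - If Tesseract is not found when you run the notebook, point `tesseract_cmd` (set in the Imports cell) at the location of your own `tesseract.exe`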
- Try to use the same resolution as the image provided in the repo

In [None]:
# --- --- --- --- ---
#  Imports
# --- --- --- --- ---

import os               # Standard library: managing files / the working directory

import cv2              # OpenCV: image processing
import numpy as np      # NumPy
import pytesseract      # Module for reading text from images

# Location of the Tesseract executable
#   Put the path to your own tesseract.exe here
#   You also might have to add it to your system's PATH
#   taskbar search > environment variables > PATH (both USER and SYSTEM)
# This is a raw string, so every path separator is written as ONE backslash
# (doubling them inside r"..." would put two backslashes into the path)
pytesseract.pytesseract.tesseract_cmd = r"E:\Tesseract\tesseract.exe"

# Folder containing all of the necessary files
os.chdir("E:/Projects/ClassFinder")     # Change this to the location of stuff

In [None]:
# --- --- --- --- ---
# Reading text off of the image
# --- --- --- --- ---

# Loading the image
file = "Schedule2.png"      # Schedule image to use
image = cv2.imread(file)    # Reads the image (BGR pixel array)

# cv2.imread does not raise when the file is missing or unreadable, it just
# returns None. Stop here with a clear message instead of letting the resize
# below fail with a confusing OpenCV error.
if image is None:
    raise FileNotFoundError("Could not read the schedule image: " + file)

# Upscales the image 4x in both directions before running the OCR
image = cv2.resize(image, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)

h, w = image.shape[:2]              # Gets the image shape (height, width)
image = image[:, :int(0.77 * w)]    # Keeps the left 77% of the width, cuts off the rest
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)      # Grayscales the image
text = pytesseract.image_to_string(gray)            # Gets the text from the image

# Prints the text that it read
# YOU CAN IGNORE THIS IF YOU WANT
print(text)

CMPSC 32 - OBJ ORIENT DESIGN

07047

Grading: L

4.0 Units NASIR N
T.B.A

CMPSC 64 - COMP ORGANIZATION

07161

Grading: L

4.0 Units MATNIZA
T.B.A

CMPSC 130A - DATA STRUCT ALGORTH

07229

Grading: L

4.0 Units NASIR N
T.B.A

CMPTGCS 191 - EXPLORING RESEARCH

75457

Grading: P

1.0 Units SHERWOOD T P

PSTAT 120B - PROB & STATISTICS

42234

Grading: L

4.0 Units QU P
T.B.A

Total Units: 17.0

9:30 AM-10:45 AM
10:00 AM-10:50 AM

12:30 PM-1:45 PM
4:00 PM-4:50 PM

2:00 PM-3:15 PM
11:00 AM-11:50 AM

4:00 PM-6:00 PM

11:00 AM-12:15 PM
3:00 PM-3:50 PM

Psychology Building, 1902
Phelps Hall, 3525

ILP, 1101
Phelps Hall, 3525

Harold Frank Hall, 1104
Phelps Hall, 1508

Creative Studies, 164B

ILP, 1302
ILP, 4209



In [None]:
# --- --- --- --- ---
# Class Definition
# --- --- --- --- ---

class classContainer:
    """Holds the details of one class read off of the schedule image.

    The constructor takes a list of strings in this order:
        [title, code, grading, units, professor]        (class without a section)
        [title, code, grading, units, professor, TA]    (class with a section)

    Times, locations and days are not part of that list. They start out as the
    string "None" and are filled in afterwards through the set* methods.
    """

    def __init__(self, inputList):
        """Builds the class from the list of values.

        inputList -- list of at least 5 strings (see the class description).
                     An IndexError is raised if it holds fewer than 5 entries.
        """
        # Using input list values to construct the class
        self.title = inputList[0]       # Title of the course
        self.code = inputList[1]        # Course code ID
        self.grading = inputList[2]     # Grading type of the course
        self.units = inputList[3]       # Unit value of the class
        self.professor = inputList[4]   # The name of the professor

        # Classes won't always have sections
        # A sixth entry is only present when there is a TA / section
        if len(inputList) == 6:
            self.TA = inputList[5]
        else:
            self.TA = "None"            # The string "None" marks "no section"

        # Values that get filled in later by the set* methods
        self.lecTime = "None"   # Time of the lecture
        self.secTime = "None"   # Time of the section
        self.lecLoc = "None"    # Lecture location
        self.secLoc = "None"    # Section location
        self.lecDay = "None"    # Day of the lecture
        self.secDay = "None"    # Day of the section

    # Misc. function to print values
    # YOU CAN IGNORE THIS IF YOU WANT
    def getClasses(self):
        """Prints every stored value, one labelled line per value."""
        # (label, value) pairs for the general class details
        details = (
            ("Title", self.title),
            ("Code", self.code),
            ("Grading", self.grading),
            ("Units", self.units),
            ("Professor", self.professor),
            ("TA", self.TA),
        )
        # (label, value) pairs for the lecture / section schedule
        # The last row shows the section DAYS, so it is labelled "Sec Days"
        # (the section time is already shown in the "TA Time" row)
        schedule = (
            ("Prof Time", self.lecTime),
            ("Lec Location", self.lecLoc),
            ("Lec Days", self.lecDay),
            ("TA Time", self.secTime),
            ("Sec Location", self.secLoc),
            ("Sec Days", self.secDay),
        )

        for label, value in details:
            print(label.ljust(15) + " -> " + value)
        print("---")
        for label, value in schedule:
            print(label.ljust(15) + " | " + value)
        print("---")

    # Misc. functions to set values
    # YOU CAN IGNORE THESE IF YOU WANT
    # --- --- --- --- ---

    def setProfTime(self, inputString):
        """Sets the lecture time."""
        self.lecTime = inputString

    def setTATime(self, inputString):
        """Sets the section time."""
        self.secTime = inputString

    def setLecLoc(self, inputString):
        """Sets the lecture location."""
        self.lecLoc = inputString

    def setSecLoc(self, inputString):
        """Sets the section location."""
        self.secLoc = inputString

    def setLecDay(self, inputString):
        """Sets the lecture day(s)."""
        self.lecDay = inputString

    def setSecDay(self, inputString):
        """Sets the section day(s)."""
        self.secDay = inputString
        


In [None]:
# --- --- --- --- ---
# List preprocessing
# --- --- --- --- ---
temp = text.replace("Units", "\n")      # Replaces "Units"
    # The values for units and professor names appear on the same line
    # The word "Units" is replaced with a newline so that they get split apart (see next line)
temp = temp.split("\n")             # Splits the text by the newline character "\n"
temp = list(filter(None, temp))     # Removes empty "" entries in the list
classes = text.count(" - ")         # Counts the total number of classes present in the image
    # All of the class titles have a - with a space on each side
    # There are other dashes present, but none have spaces around them
    # e.g. CMPSC 32 - OBJ ORIENT DESIGN

# Removing the total units
# We don't need the total amount of units, so it gets removed
# That line is also split by the word "Units", so it takes up 2 list entries:
# the "Total " part and the entry right after it
i = 0
while i < len(temp):                # Walks the whole list, including the last entries
    if "Total " in temp[i]:             # Checks if the word "Total" appears
        del temp[i:i + 2]                   # Deletes this entry and the one after it BY POSITION
            # (list.remove() deletes the first entry with an equal value,
            #  which is not necessarily the one at position i)
    else:
        i += 1                              # Only moves on when nothing was deleted

# Printing values
# YOU CAN IGNORE THIS IF YOU WANT
print(temp)
print(classes)


['CMPSC 32 - OBJ ORIENT DESIGN', '07047', 'Grading: L', '4.0 ', ' NASIR N', 'T.B.A', 'CMPSC 64 - COMP ORGANIZATION', '07161', 'Grading: L', '4.0 ', ' MATNIZA', 'T.B.A', 'CMPSC 130A - DATA STRUCT ALGORTH', '07229', 'Grading: L', '4.0 ', ' NASIR N', 'T.B.A', 'CMPTGCS 191 - EXPLORING RESEARCH', '75457', 'Grading: P', '1.0 ', ' SHERWOOD T P', 'PSTAT 120B - PROB & STATISTICS', '42234', 'Grading: L', '4.0 ', ' QU P', 'T.B.A', '9:30 AM-10:45 AM', '10:00 AM-10:50 AM', '12:30 PM-1:45 PM', '4:00 PM-4:50 PM', '2:00 PM-3:15 PM', '11:00 AM-11:50 AM', '4:00 PM-6:00 PM', '11:00 AM-12:15 PM', '3:00 PM-3:50 PM', 'Psychology Building, 1902', 'Phelps Hall, 3525', 'ILP, 1101', 'Phelps Hall, 3525', 'Harold Frank Hall, 1104', 'Phelps Hall, 1508', 'Creative Studies, 164B', 'ILP, 1302', 'ILP, 4209']
5
[]
Title           -> CMPSC 32 - OBJ ORIENT DESIGN
Code            -> 07047
Grading         -> Grading: L
Units           -> 4.0 
Professor       ->  NASIR N
TA              -> T.B.A
---
Prof Time       | 9:30 A

In [None]:
# --- --- --- --- ---
# Processing
# --- --- --- --- ---

classList = []          # Initializes the list of class objects
tempClass = []          # Temporary list holding the entries of the class being built
counter = len(temp)     # Index where the time section starts (end of the list if there is none)

# Remember that the RAW list is separated into sections
# > CLASSES
# > LECTURE / SECTION TIMES
# > LECTURE / SECTION LOCATIONS
# The logic for classes is different than the logic for lecture / section times
# Keep this in mind when looking through the logic below

# Adding main class details, loops over the text entries
for index, entry in enumerate(temp):
    # Checks if the loop has reached the time section (e.g. "9:30 AM-10:45 AM")
    if (("AM" in entry) or ("PM" in entry)) and (":" in entry):
        counter = index     # Everything from here on is times / locations
        break

    # A " - " marks the title line of a new class
    # If a class is already being collected, it is finished and gets stored first
    if " - " in entry and tempClass:
        classList.append(classContainer(tempClass))
        tempClass = []

    tempClass.append(entry)     # Adds the entry to the class being built

# Stores the class that was still being built when the loop ended
# This happens both when the time section was reached and when the list simply ran out
# Nothing is stored if no entries were collected at all
if tempClass:
    classList.append(classContainer(tempClass))

# Redefines the temporary list
temp = temp[counter:]   # Takes all of the stuff we just looked at out of the temp

# Adding times and locations
# The remaining values are already in class order, so they are taken from the
# front of the list one at a time
# A class without a TA has no section, so it only uses one time and one location
# If the list runs out of values early, pop(0) raises an IndexError

# Reading the times of the classes
for course in classList:
    course.setProfTime(temp.pop(0))         # Adds the lecture time
    if course.TA != "None":                 # Checks if there is a TA
        course.setTATime(temp.pop(0))           # Adds the section time

# Reading the locations of the classes
for course in classList:
    course.setLecLoc(temp.pop(0))           # Adds the lecture location
    if course.TA != "None":                 # Checks if there is a TA
        course.setSecLoc(temp.pop(0))           # Adds the section location

# Printing Information
# YOU CAN IGNORE THIS IF YOU WANT
print(temp)             # prints whatever is still in the temporary variable
for item in classList:  # Loops for each class object
    item.getClasses()       # Calls the function that prints everything

In [None]:
# --- --- --- --- ---
# Resizing 2
# --- --- --- --- ---

# Second OCR pass: the days each class takes place
# The first image-to-text run doesn't pick up any of the M T W R F day letters,
# so the column holding them is cropped out and read separately

# Cropping
h, w = image.shape[:2]      # Height / width of the current image
new = image[:, int(0.45 * w):int(0.5 * w)]
    # Keeps the vertical strip between 45% and 50% of the width

# Processing the image
gray = cv2.cvtColor(new, cv2.COLOR_BGR2GRAY)    # Grayscale version of the strip
newImage = cv2.adaptiveThreshold(
    gray,
    255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY,
    31,
    5,
)
    # Adaptive (Gaussian) thresholding to a binary image, block size 31, constant 5

# OCR settings: page segmentation mode 6, only the letters M W F T R are allowed
config = r"--psm 6 -c tessedit_char_whitelist=MWFTR"
text = pytesseract.image_to_string(newImage, config=config)

# Preparing the unsorted list: one entry per non-empty line of the OCR result
temp = [line for line in text.split("\n") if line]

# Reading the days of classes
# Values are taken from the front of the list in class order
for i in range(classes):
    course = classList[i]
    course.setLecDay(temp.pop(0))           # Lecture days
    if course.TA != "None":                 # Only classes with a TA have a section
        course.setSecDay(temp.pop(0))           # Section days

# Printing Class Contents
# YOU CAN IGNORE THIS IF YOU WANT
for item in classList:
    item.getClasses()


Title           -> CMPSC 32 - OBJ ORIENT DESIGN
Code            -> 07047
Grading         -> Grading: L
Units           -> 4.0 
Professor       ->  NASIR N
TA              -> T.B.A
---
Prof Time       | 9:30 AM-10:45 AM
Lec Location    | Psychology Building, 1902
Lec Days        | MW
TA Time         | 10:00 AM-10:50 AM
Sec Location    | Phelps Hall, 3525
Sec Time        | F
---
Title           -> CMPSC 64 - COMP ORGANIZATION
Code            -> 07161
Grading         -> Grading: L
Units           -> 4.0 
Professor       ->  MATNIZA
TA              -> T.B.A
---
Prof Time       | 12:30 PM-1:45 PM
Lec Location    | ILP, 1101
Lec Days        | MW
TA Time         | 4:00 PM-4:50 PM
Sec Location    | Phelps Hall, 3525
Sec Time        | F
---
Title           -> CMPSC 130A - DATA STRUCT ALGORTH
Code            -> 07229
Grading         -> Grading: L
Units           -> 4.0 
Professor       ->  NASIR N
TA              -> T.B.A
---
Prof Time       | 2:00 PM-3:15 PM
Lec Location    | Harold Frank Hall,