In [1]:
from llama_cpp import Llama
import torch

device = torch.device("cuda")
# Dynamically determine optimal number of threads and GPU layers
n_threads = max(1, torch.get_num_threads() - 1)  # Leave one thread for OS and other processes
print(f"Using {n_threads} CPU threads.")
n_gpu_layers = 0  # Default to no GPU layers if not using CUDA
if device.type == 'cuda':
    n_gpu_layers = torch.cuda.get_device_capability(device.index)[0]  # Use number of SMs as a heuristic
    print(f"Offloading {n_gpu_layers} layers to GPU.")

model_path = r"C:\Users\Declan Bracken\MEng_Project\dolphin-2.1-mistral-7b.Q5_K_M.gguf"
# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = Llama(
  model_path= model_path,  # Download the model file first
  n_ctx=4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=n_threads,            # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=n_gpu_layers         # The number of layers to offload to GPU, if you have GPU acceleration available
)


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from C:\Users\Declan Bracken\MEng_Project\dolphin-2.1-mistral-7b.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = ehartford_dolphin-2.1-mistral-7b
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - k

Using 3 CPU threads.
Offloading 5 layers to GPU.


llm_load_tensors:        CPU buffer size =  4893.00 MiB
...................................................................................................
llama_new_context_with_model: n_ctx      = 4096
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:        CPU KV buffer size =   512.00 MiB
llama_new_context_with_model: KV self size  =  512.00 MiB, K (f16):  256.00 MiB, V (f16):  256.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:        CPU compute buffer size =   296.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 1
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WAS

In [3]:
text = """
APSC 221 Economic And Business Practice 3.00 A 12.0
ELEC 210 Intro Elec Circuits & Machines 4.00 A- 14.8
MINE 244 Underground Mining 3.00 Bt 9.9
MINE 267 App Chem/Instrument Meth Mine 4.00 A- 14.8
MTHE 272 Applications Numerical Methods. 3.50 At 15.0
MTHE 367 Engineering Data Analysis 4.00 B- 10.8
APSC 100A Engineering Practice 0.00 NG 0.0
APSC 111 Mechanics 3.50 A 14.0
APSC 131 Chemistry And Materials 3.50 A 14.0
APSC 151 Earth Systems And Engineering 4.00 At 17.2
APSC 161 Engineering Graphics 3.50 A 14.0
APSC 171 Calculus | 3.50 At 15.0
APSC 200 Engr Design & Practice 4.00 At 17.2
APSC 293 Engineering Communications 1.00 At 4.
CIVL 230 Solid Mechanics | 4.25 At 18.
MECH 230 Applied Thermodynamics | 3.50 At 15.
MINE 201 Intro To Mining/Mineral Proces 4.00 A 16.(
MINE 202 Comp Apps/Instrumntn In Mining 1.50 A 6.(
MTHE 225 Ordinary Differential Equation 3.50 A 14.(
APSC  100B Engineering Practice 11.00 A- 40.7
APSC 112 Electricity And Magnetism 3.50 B+ 11.6
APSC 132 Chemistry And The Environment 3.50 B 10.5
APSC 142 Intro Computer Program Engrs 3.00 A- 11.1
APSC 172 Calculus II 3.50 A- 13.0
APSC 174 Introduction To Linear Algebra 3.50 At 15.0
CLST 201 Roman History 3.00 At 12.¢
ECON 111 Introductory Microeconomics 3.00 A- 11.1
MINE 321 Drilling & Blasting 4.50 A- 16.6
MINE 331 Methods Of Mineral Separation 4.50 A- 16.€
MINE 339 Mine Ventilation 4.50 Ct 10.4
MINE 341 Open Pit Mining 4.50 A- 16.6
Academic Program History
06/12/2012: Bachelor of Science Engineer Active in Program
Major in General Engineering
02/28/2013: Bachelor of Science Engineer Active in Program
Major in Mining Engineering
Option in Mining
12/09/2014: Bachelor of Arts Active in Program
Term GPA 3.51. Term Totals 24.00 24.00 84.3
Term GPA 3.60 Term Totals 21.50 21.50 774
Term GPA 4.18 Term Totals 21.75 21.75 90.8
Term GPA 3.64 Term Totals 28.00 28.00 101.8
Term GPA 4.13 Term Totals 18.00 18.00 74.2
"""

headers = """
Course Description Units Grade _— Points
Course Description Units Grade Points
Course Description Units Grade Points
Course Description Units Grade _— Points
Course Description Units Grade Points
"""


In [4]:
prompt = f'''Below is OCR text from a student transcript. This text contains a table, or multiple tables. Select data only relevant to student courses and grades from these tables and format the fields into a table in csv format. Some extracted table headers are given below to help with formatting. The csv you output should only have 3 columns: 'Course Code', 'Grade', and 'Credits', you must select which columns best fit these fields.
        
### Headers:
{headers}

### Text:
{text}

### CSV:

'''

system_message = "You are a table creation assistant"
prompt_template=f'''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''

max_tokens = 2048
temperature = 0
top_p = 0
echo = False
stop = ["</s>"]

# Define the parameters
model_output = llm(
       prompt_template,
       max_tokens=max_tokens,
       temperature=temperature,
       echo=echo,
       stop=stop,
   )
final_result = model_output["choices"][0]["text"].strip()

In [4]:
print(final_result)

Course Code,Grade,Credits
APSC 221,A,3.0
ELEC 210,A-,4.0
MINE 244,Bt,3.0
MINE 267,A-,4.0
MTHE 272,At,3.5
MTHE 367,B-,4.0
APSC 100A,NG,0.0
APSC 111,A,3.5
APSC 131,A,3.5
APSC 151,At,4.0
APSC 161,A,3.5
APSC 171,At,3.5
APSC 200,At,4.0
CIVL 230,At,4.25
MECH 230,At,3.5
MINE 201,A,4.0
MINE 202,A,1.5
MTHE 225,A,3.5
APSC  100B,A-,11.0
APSC 112,B+,3.5
APSC 132,B,3.5
APSC 142,A-,3.0
APSC 172,A-,3.5
APSC 174,At,3.5
CLST 201,At,3.0
ECON 111,A-,3.0
MINE 321,A-,4.5
MINE 331,A-,4.5
MINE 339,Ct,4.5
MINE 341,A-,4.5


In [5]:
import csv
from io import StringIO

def filter_unidentified_courses(mistral_output, original_text):
    # Step 1: Extract course codes from Mistral's output
    mistral_courses = set()
    csv_reader = csv.reader(StringIO(mistral_output))
    next(csv_reader)  # Skip headers if they exist
    for row in csv_reader:
        if row:  # Make sure row is not empty
            mistral_courses.add(row[0].strip())  # Assume course code is in the first column

    # Step 2: Split the original text by newline and filter
    original_lines = original_text.strip().split('\n')
    filtered_lines = []

    for line in original_lines:
        if not any(course_code in line for course_code in mistral_courses):
            filtered_lines.append(line)

    # Step 3: Join the filtered lines back into a single string
    filtered_text = '\n'.join(filtered_lines)
    return filtered_text

filtered_text = filter_unidentified_courses(final_result, text)
print(filtered_text)

APSC 293 Engineering Communications 1.00 At 4.
Academic Program History
06/12/2012: Bachelor of Science Engineer Active in Program
Major in General Engineering
02/28/2013: Bachelor of Science Engineer Active in Program
Major in Mining Engineering
Option in Mining
12/09/2014: Bachelor of Arts Active in Program
Term GPA 3.51. Term Totals 24.00 24.00 84.3
Term GPA 3.60 Term Totals 21.50 21.50 774
Term GPA 4.18 Term Totals 21.75 21.75 90.8
Term GPA 3.64 Term Totals 28.00 28.00 101.8
Term GPA 4.13 Term Totals 18.00 18.00 74.2


In [9]:
import sys
import os
class SuppressOutput:
    def __enter__(self):
        # Redirect stdout and stderr to os.devnull
        self._original_stdout = sys.stdout
        self._original_stderr = sys.stderr
        sys.stdout = open(os.devnull, 'w')
        sys.stderr = open(os.devnull, 'w')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close the devnull handlers and restore original handlers
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = self._original_stdout
        sys.stderr = self._original_stderr

prompt2 = f'''Below is OCR text from a student transcript. This text maight contain student grade data. Determine if there is course information and corresponding grades from this data. If there is, select lines only relevant to student courses and grades and format the fields into a table in csv format. Some extracted table headers are given below to help with formatting. The csv you output should only have 3 columns: 'Course Code', 'Grade', and 'Credits', you must select which columns best fit these fields. The data could be one line long, several lines long, or if you determine that there is no grade data, simply respon with "None".

### Headers:
{headers}

### Text:
{filtered_text}

### CSV:
'''

system_message2 = "You are a table creation assistant"
prompt_template2=f'''<|im_start|>system
{system_message2}<|im_end|>
<|im_start|>user
{prompt2}<|im_end|>
<|im_start|>assistant
'''

max_tokens2 = 128
temperature = 0
echo = False
stop = ["</s>"]

# Define the parameters
with SuppressOutput():
    model_output2 = llm(
        prompt_template2,
        max_tokens=max_tokens2,
        temperature=temperature,
        echo=echo,
        stop=stop,
    )
final_result2 = model_output2["choices"][0]["text"].strip()
print(final_result2)


Course Code,Grade,Credits
APSC 293,A+,1.00


In [10]:
print(prompt2)

Below is OCR text from a student transcript. This text maight contain student grade data. Determine if there is course information and corresponding grades from this data. If there is, select lines only relevant to student courses and grades and format the fields into a table in csv format. Some extracted table headers are given below to help with formatting. The csv you output should only have 3 columns: 'Course Code', 'Grade', and 'Credits', you must select which columns best fit these fields. The data could be one line long, several lines long, or if you determine that there is no grade data, simply respon with "None".

### Headers:

Course Description Units Grade _— Points
Course Description Units Grade Points
Course Description Units Grade Points
Course Description Units Grade _— Points
Course Description Units Grade Points


### Text:
APSC 293 Engineering Communications 1.00 At 4.
Academic Program History
06/12/2012: Bachelor of Science Engineer Active in Program
Major in General

In [7]:
llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs student course grades in JSON format.",
        },
        {
            "role": "user",
            "content": f"""Below is OCR text from a student transcript. This text contains a table, or multiple tables. Select data only relevant to student courses and grades from these tables and format the fields into a JSON object. The JSON you output should only have fields for 'Course Code', 'Grade', and 'Credits'. You must select which columns best fit these fields.
            
            ### Headers:
            {headers}
            
            ### Text:
            {text}"""
        }
    ],
    response_format={
        "type": "json_object",
        "schema": {
            "type": "object",
            "properties": {
                "courses": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "Course Code": {"type": "string"},
                            "Grade": {
                                "type": ["string", "number"]  # Allows both strings and numbers
                            },
                            "Credits": {"type": "number"}
                        },
                        "required": ["Course Code", "Grade", "Credits"]
                    }
                }
            },
            "required": ["courses"]
        },
    },
    temperature=0,
)


from_string grammar:
char ::= [^"\] | [\] char_1 
char_1 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
courses ::= [[] space courses_8 []] space 
space ::= space_86 
courses_4 ::= courses-item courses_7 
courses-item ::= [{] space courses-item-Course-Code-kv [,] space courses-item-Grade-kv [,] space courses-item-Credits-kv [}] space 
courses_6 ::= [,] space courses-item 
courses_7 ::= courses_6 courses_7 | 
courses_8 ::= courses_4 | 
courses-item-Course-Code-kv ::= ["] [C] [o] [u] [r] [s] [e] [ ] [C] [o] [d] [e] ["] space [:] space string 
courses-item-Grade-kv ::= ["] [G] [r] [a] [d] [e] ["] space [:] space courses-item-Grade 
courses-item-Credits-kv ::= ["] [C] [r] [e] [d] [i] [t] [s] ["] space [:] space number 
string ::= ["] string_87 ["] space 
number ::= number_78 number_81 number_84 space 
courses-item-Grade ::= string | number 
courses-kv ::= ["] [c] [o] [u] [r] [s] [e] [s] ["] space [:] space courses 
decimal-part ::= [0-9] decimal-part_46 
decimal-part

KeyboardInterrupt: 