In [21]:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from google.auth.transport.requests import Request
import os


class GoogleDriveManager:
    def __init__(self, credentials_file='/Users/declanbracken/Development/UofT_Projects/Meng_Project/code_base/web_scraping/client_secret_809384080547-4sfr7l9u8a618keak7b11qan4o63nvh3.apps.googleusercontent.com.json', token_file='token.json', scopes=None):
        self.credentials_file = credentials_file
        self.token_file = token_file
        self.scopes = ['https://www.googleapis.com/auth/drive']
        self.service = self.authenticate_google_drive()

    def authenticate_google_drive(self):
        """Authenticate and return a Google Drive service object."""
        creds = None
        if os.path.exists(self.token_file):
            creds = Credentials.from_authorized_user_file(self.token_file, self.scopes)
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(self.credentials_file, self.scopes)
                creds = flow.run_local_server(port=0)
            with open(self.token_file, 'w') as token:
                token.write(creds.to_json())
        return build('drive', 'v3', credentials=creds)
    
    def list_files_in_folder(self, folder_id):
        query = f"'{folder_id}' in parents and mimeType contains 'image/'"
        results = self.service.files().list(q=query, pageSize=1000, fields="nextPageToken, files(id, name)").execute()
        items = results.get('files', [])
        return items

    def get_file_urls(self, files):
        file_urls = []
        for file in files:
            file_id = file['id']
            file_url = f"https://drive.google.com/uc?id={file_id}"
            file_urls.append(file_url)
        return file_urls

# Path to your service account key file
SERVICE_ACCOUNT_FILE = '/Users/declanbracken/Development/UofT_Projects/Meng_Project/code_base/web_scraping/client_secret_809384080547-4sfr7l9u8a618keak7b11qan4o63nvh3.apps.googleusercontent.com.json'

# ID of the folder containing the images
FOLDER_ID = '14zyq0BXTYrYj81bGlKtYpEG-KL59oNnM'

# Create manager instace
manager = GoogleDriveManager(credentials_file=SERVICE_ACCOUNT_FILE)#, token_file=TOKEN_FILE)

# Get folder items
folder_items = manager.list_files_in_folder(FOLDER_ID)

# List urls
image_urls = manager.get_file_urls(folder_items)
        

In [6]:
# Print or use the list of image URLs
for url in image_urls:
    print(url)

https://drive.google.com/uc?id=1QrUWNZv5f7QFKjExPCF6gcso5jsmJYGq
https://drive.google.com/uc?id=1M_8OgVIUafvwupnvRVkjCiFtw9F3LZ4R
https://drive.google.com/uc?id=1SkhPNjaVKRTz4XGiEULl9ukwgqMTrVF-
https://drive.google.com/uc?id=11u_oNNGCOSciEEgaefHkYLRDHSPUvVGz
https://drive.google.com/uc?id=1a27GpxxEBqAL5X1QYcoHcenfaVLXyVgG
https://drive.google.com/uc?id=17G7k-uJgDe94yGv4a99t4EUAtj463eMO
https://drive.google.com/uc?id=1nw0Elt2qwdbg3tmHr9S7nKOIF65K7aN0
https://drive.google.com/uc?id=1SOsmFJzIjcPx7vhHc2iH_ZcDk1Wz1MRv
https://drive.google.com/uc?id=1NiNGMa_erAA75qmuN8EQxkhf_reWep5J
https://drive.google.com/uc?id=1qC9yQaEJYELkosM2wv7VmDtyMLLwpZKR
https://drive.google.com/uc?id=1PaxJGQ3egf1W2a-7pCsmdmqsVQBIJi7v
https://drive.google.com/uc?id=1w0AuTOfeNgzQcQ10QeFcUsG5Z8nexSOc
https://drive.google.com/uc?id=1dtMhvdMM9SPDA8iTtDg8HK4Lbhv7xN4m
https://drive.google.com/uc?id=1ZRimkmpXljp-sYfOJDMd6RibjPjML69E
https://drive.google.com/uc?id=1_1cGpVa8Dn8BY8AnhybCBQY-o-QbJmEI
https://drive.google.com/

In [7]:
len(image_urls)

200

In [13]:
import json
import random

def write_to_jsonl(prompts, image_urls, max_tokens = 1200):

    # Generate JSONL content
    jsonl_content = []
    for i, url in enumerate(image_urls):
        prompt = random.choice(prompts)
        request = {
            "custom_id": f"request-{i+1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"{prompt}"
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"{url}"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": max_tokens
            }
        }
        jsonl_content.append(request)

    # Write to JSONL file
    with open("batchinput.jsonl", "w") as f:
        for entry in jsonl_content:
            f.write(json.dumps(entry) + "\n")

    print("batchinput.jsonl file created.")

# Define the prompts for different parts of the extraction
prompts = [
    "Extract only the course information from the attached transcript image, including but not limited to subjects/course codes, grades, credits, and other information. Please then structure it into a table in CSV format. If the image is not a transcript, respond with 'not a transcript'.",
    "Please transcribe only the grade data from the attached transcript image into a CSV format with the appropriate fields. If the image is not a transcript, respond with 'not a transcript'.",
    "From the attached transcript image, extract and organize the course and grade data, along with any other columns, into CSV format. No need to include term data or student basic information. Ensure all data is accurately transcribed and formatted. If the image is not a transcript, respond with 'not a transcript'.",
    "Extract comprehensive course details from the attached transcript image, such as subject codes, course names, grades, and credits. If there are more or other column headers, include them aswell. Organize this data into a table in CSV format. Only extract data relevant to the course information. If the image does not contain a student transcript, respond with 'not a transcript'.",
    "Identify and extract the tabular grade information from the attached transcript image. Transcribe this information into a table in CSV format. If the image is not a transcript, respond with 'not a transcript'.",
    "Extract and transcribe just the course information from the transcript image into a table in CSV format. Include specific fields for the courses, grades, and any other columns available. Ensure all data is formatted correctly. If the image is not a transcript, respond with 'not a transcript'."
]
# Image urls already defined

write_to_jsonl(prompts, image_urls)

batchinput.jsonl file created.


In [14]:
# Since there is a 90 000 token limit, we need to split our 200 samples into chunks
# Function to split a list into smaller chunks
def split_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

# Path to the existing JSONL file
input_file = "batchinput.jsonl"

# Read the existing JSONL file
with open(input_file, "r") as f:
    lines = f.readlines()

# Number of chunks (in this case, 4)
num_chunks = 4
chunk_size = len(lines) // num_chunks

# Split the lines into chunks
chunks = list(split_list(lines, chunk_size))

# Write each chunk to a new JSONL file
for idx, chunk in enumerate(chunks):
    filename = f"batchinput_part_{idx + 1}.jsonl"
    with open(filename, "w") as f:
        for line in chunk:
            f.write(line)
    print(f"{filename} file created with {len(chunk)} lines.")

print(f"Total lines: {sum(len(chunk) for chunk in chunks)}")

batchinput_part_1.jsonl file created with 50 lines.
batchinput_part_2.jsonl file created with 50 lines.
batchinput_part_3.jsonl file created with 50 lines.
batchinput_part_4.jsonl file created with 50 lines.
Total lines: 200


In [19]:
import openai
# Set your OpenAI API key
api_key = 'sk-proj-WMUAiU7RSVuQyXru8yn7T3BlbkFJbqBfbs2i7he2nuGBiqi5'

client = openai.OpenAI(api_key=api_key)

# Upload JSONL file
with open("Input_JSONL/batchinput_part_4.jsonl", "rb") as f:
    batch_input_file = client.files.create(file=f, purpose='batch')

batch_input_file_id = batch_input_file.id
print(f"Batch input file uploaded with ID: {batch_input_file_id}")

# Create batch job
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "Transcript image processing batch job, part 1."
    }
)

print("Batch job created.")
print(batch)

Batch input file uploaded with ID: file-bYZ3j6BghSm0hMlGBdIwoXRZ
Batch job created.
Batch(id='batch_taS6V1TJIjAdEak0AcWvRuuF', completion_window='24h', created_at=1717447199, endpoint='/v1/chat/completions', input_file_id='file-bYZ3j6BghSm0hMlGBdIwoXRZ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1717533599, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Transcript image processing batch job, part 1.'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [12]:
# Check status
client.batches.retrieve("batch_ttxxW6YmmlYT7jkZadQvoNRv")

Batch(id='batch_ttxxW6YmmlYT7jkZadQvoNRv', completion_window='24h', created_at=1717442895, endpoint='/v1/chat/completions', input_file_id='file-DzstArLd1SKLy10KCW68Skfc', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o in organization org-LyCWzHEWOMkZKVa87GQimuOJ. Limit: 90,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1717529295, failed_at=1717442917, finalizing_at=None, in_progress_at=None, metadata={'description': 'Transcript image processing batch job'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))