In [31]:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from google.auth.transport.requests import Request
from googleapiclient.http import MediaIoBaseUpload
from PIL import Image
import os
import io

class GoogleDriveManager:
    def __init__(self, credentials_file='/Users/declanbracken/Development/UofT_Projects/Meng_Project/code_base/web_scraping/client_secret_809384080547-4sfr7l9u8a618keak7b11qan4o63nvh3.apps.googleusercontent.com.json', token_file='token.json', scopes=None):
        self.credentials_file = credentials_file
        self.token_file = token_file
        self.scopes = ['https://www.googleapis.com/auth/drive']
        self.service = self.authenticate_google_drive()

    def authenticate_google_drive(self):
        """Authenticate and return a Google Drive service object."""
        creds = None
        if os.path.exists(self.token_file):
            creds = Credentials.from_authorized_user_file(self.token_file, self.scopes)
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(self.credentials_file, self.scopes)
                creds = flow.run_local_server(port=0)
            with open(self.token_file, 'w') as token:
                token.write(creds.to_json())
        return build('drive', 'v3', credentials=creds)
    
    def list_files_in_folder(self, folder_id):
        query = f"'{folder_id}' in parents and mimeType contains 'image/'"
        results = self.service.files().list(q=query, pageSize=1000, fields="nextPageToken, files(id, name)").execute()
        items = results.get('files', [])
        return items

    def get_file_urls(self, files):
        file_urls = []
        for file in files:
            file_id = file['id']
            file_url = f"https://drive.google.com/uc?id={file_id}"
            file_urls.append(file_url)
        return file_urls

    def convert_images_to_jpeg_and_upload(self, input_folder_id, output_folder_id):
        """
        Convert all images in the input Google Drive folder to JPEG format and upload them to the output folder.
        Necessary for gpt-4o inference if there are .jpg files or other formats which are unsupported.

        Args:
        - input_folder_id (str): The ID of the Google Drive folder containing the original images.
        - output_folder_id (str): The ID of the Google Drive folder where the JPEG images will be uploaded.
        """
        # Retrieve image files from the input folder
        items = self.list_files_in_folder(input_folder_id)

        if not items:
            print("No image files found in the specified folder.")
            return

        for item in items:
            file_id = item['id']
            file_name = item['name']

            # Download the file content
            request = self.service.files().get_media(fileId=file_id)
            file_data = io.BytesIO(request.execute())

            try:
                # Open the image and convert it to JPEG
                with Image.open(file_data) as img:
                    img = img.convert("RGB")  # Ensure the image is in RGB mode for JPEG

                    # Prepare JPEG image in memory
                    jpeg_io = io.BytesIO()
                    jpeg_name = os.path.splitext(file_name)[0] + ".jpeg"
                    img.save(jpeg_io, "JPEG")
                    jpeg_io.seek(0)

                    # Upload the JPEG image back to Google Drive
                    media = MediaIoBaseUpload(jpeg_io, mimetype='image/jpeg')
                    file_metadata = {
                        'name': jpeg_name,
                        'parents': [output_folder_id]
                    }
                    uploaded_file = self.service.files().create(body=file_metadata, media_body=media, fields='id').execute()
                    print(f"Converted and uploaded: {jpeg_name} (File ID: {uploaded_file.get('id')})")

            except Exception as e:
                print(f"Error converting {file_name}: {e}")

# Path to your service account key file
SERVICE_ACCOUNT_FILE = '/Users/declanbracken/Development/UofT_Projects/Meng_Project/code_base/web_scraping/client_secret_809384080547-4sfr7l9u8a618keak7b11qan4o63nvh3.apps.googleusercontent.com.json'

# ID of the folder containing the images
# FOLDER_ID = '14zyq0BXTYrYj81bGlKtYpEG-KL59oNnM'
IN_FOLDER_ID = '1KInbF1MOyQ-RSLWYcnLIoaCo3D9ieuOL'
OUT_FOLDER_ID = '1qRtXRctr7jXr5cpxKiI_vxw0Bj-Ryp07'

# Create manager instace
manager = GoogleDriveManager(credentials_file=SERVICE_ACCOUNT_FILE)#, token_file=TOKEN_FILE)

# Convert images in the input folder to JPEG and upload them to the output folder
manager.convert_images_to_jpeg_and_upload(IN_FOLDER_ID, OUT_FOLDER_ID)
        

Converted and uploaded: SD_Trans_University_of_Chicago_-_Copy.jpeg (File ID: 1pI3toNgtVMDZ-HEKqGUqyr0z_6Tj46VC)
Converted and uploaded: Sd-The-University-of-Alabama-Page-1.jpeg (File ID: 1LTl-rZAATpqczCM3Ene5C6tXsxVLIpxK)
Converted and uploaded: SD_UTEPsn_-_Copy.jpeg (File ID: 1kSkSCW9j9yNISQJri3SB2_ifNhDr-6QF)
Converted and uploaded: SD_Queens-College-Transcript-Page-1_-_Copy.jpeg (File ID: 12hqqW4o6-tLF9Tm4UDZmnZC5T-bHd7cO)
Converted and uploaded: SD_Indiana_University_Trans_Pg_1_-_Copy.jpeg (File ID: 1a2SqGFdFGGX81zAHb9-9r7vi2r_SnkXb)
Converted and uploaded: SD_Humber_College_Trans_Match_Sample.jpeg (File ID: 1BzQykPTXrYDo8qm45-Kqs5CwJy0i2kgL)
Converted and uploaded: SD_Concordia_University-Page_1_-_Copy.jpeg (File ID: 1gDraQhEah7I9VX1FHSzm-Sinj_HBQhNs)
Converted and uploaded: SD_Concordia_Transcript_HOrizontal_1_-_Copy.jpeg (File ID: 1T5P769wOOhsdY4sQ17tUFag7ReKyj3Zb)
Converted and uploaded: SD_Pennsylvania-State-University.jpeg (File ID: 1K1eTxzaYPiP68_GS_AXOTm1nJ9BEoniq)
Converte

In [35]:
# Get folder items
folder_items = manager.list_files_in_folder(OUT_FOLDER_ID)

# List urls
image_urls = manager.get_file_urls(folder_items)

In [36]:
# Print or use the list of image URLs
for url in image_urls:
    print(url)

https://drive.google.com/uc?id=1poCyiTUbBH5uarIOtECYKyC2SBbxUOOY
https://drive.google.com/uc?id=1P8LvUcCM1NoIU2qo_cS4hCzRgfdFb_nL
https://drive.google.com/uc?id=1jtQjFKztmz2l5DAk1KPRq9YtYNayU4V6
https://drive.google.com/uc?id=1vKUpnIE73KoQslO-7cw_XhR4areynUP0
https://drive.google.com/uc?id=1WrHrCzvwxk5IyXpj2LPNBXqxkCl5Hmql
https://drive.google.com/uc?id=1kqW27F0vtWl99s-u0XOYHMaOyya6ivsK
https://drive.google.com/uc?id=1qXufiu8cLDN5CCxRI69T867qZSsxOd2X
https://drive.google.com/uc?id=1K1eTxzaYPiP68_GS_AXOTm1nJ9BEoniq
https://drive.google.com/uc?id=1T5P769wOOhsdY4sQ17tUFag7ReKyj3Zb
https://drive.google.com/uc?id=1gDraQhEah7I9VX1FHSzm-Sinj_HBQhNs
https://drive.google.com/uc?id=1BzQykPTXrYDo8qm45-Kqs5CwJy0i2kgL
https://drive.google.com/uc?id=1a2SqGFdFGGX81zAHb9-9r7vi2r_SnkXb
https://drive.google.com/uc?id=12hqqW4o6-tLF9Tm4UDZmnZC5T-bHd7cO
https://drive.google.com/uc?id=1kSkSCW9j9yNISQJri3SB2_ifNhDr-6QF
https://drive.google.com/uc?id=1LTl-rZAATpqczCM3Ene5C6tXsxVLIpxK
https://drive.google.com/

In [37]:
len(image_urls)

16

In [38]:
import json
import random

def write_to_jsonl(prompts, image_urls, jsonl_path, max_tokens = 1200):
    # Ensure the directory exists
    # os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)

    # Generate JSONL content
    jsonl_content = []
    for i, url in enumerate(image_urls):
        prompt = random.choice(prompts)
        request = {
            "custom_id": f"request-{i+1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"{prompt}"
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"{url}"
                                }
                            }
                        ]
                    }
                ],
                "max_tokens": max_tokens
            }
        }
        jsonl_content.append(request)

    # Write to JSONL file
    with open(jsonl_path, "w") as f:
        for entry in jsonl_content:
            f.write(json.dumps(entry) + "\n")
    try:
        print(f"{jsonl_path.split('/')[-1]} file created.")
    except:
        print("jsonl file created.")

# Define the prompts for different parts of the extraction
prompts = [
    "Extract only the course information from the attached transcript image, including but not limited to subjects/course codes, grades, credits, and other information. Please then structure this information into a table in CSV format. If the image is not a transcript, respond with 'not a transcript'.",
]
# JSONL path
jsonl_path = 'Test_Data/Input_JSONL/input.jsonl'

write_to_jsonl(prompts, image_urls, jsonl_path)

input.jsonl file created.


In [27]:
# Since there is a 90 000 token limit, we need to split our 200 samples into chunks
# Function to split a list into smaller chunks
def split_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

# Path to the existing JSONL file
input_file = "Test_Data/Input_JSONL/input.jsonl"

# Read the existing JSONL file
with open(input_file, "r") as f:
    lines = f.readlines()

# Number of chunks (in this case, 4)
num_chunks = 2
chunk_size = len(lines) // num_chunks

# Split the lines into chunks
chunks = list(split_list(lines, chunk_size))

# Write each chunk to a new JSONL file
for idx, chunk in enumerate(chunks):
    filename = f"batchinput_part_{idx + 1}.jsonl"
    with open(filename, "w") as f:
        for line in chunk:
            f.write(line)
    print(f"{filename} file created with {len(chunk)} lines.")

print(f"Total lines: {sum(len(chunk) for chunk in chunks)}")

batchinput_part_1.jsonl file created with 8 lines.
batchinput_part_2.jsonl file created with 8 lines.
Total lines: 16


In [42]:
import openai
api_key_path = "openai_api_key.txt"
# Set your OpenAI API key
with open(api_key_path, "r") as f:
    api_key = f.read()

client = openai.OpenAI(api_key=api_key)

# Upload JSONL file
with open(jsonl_path, "rb") as f:
    batch_input_file = client.files.create(file=f, purpose='batch')

batch_input_file_id = batch_input_file.id
print(f"Batch input file uploaded with ID: {batch_input_file_id}")

# Create batch job
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "Transcript image processing batch job, part 1."
    }
)

print("Batch job created.")
print(batch)

Batch input file uploaded with ID: file-A7lIsvehdNTwNdqpnnesIWt4
Batch job created.
Batch(id='batch_mNvrFzYMUglQNiBy9OYZoMpY', completion_window='24h', created_at=1724947688, endpoint='/v1/chat/completions', input_file_id='file-A7lIsvehdNTwNdqpnnesIWt4', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725034088, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Transcript image processing batch job, part 1.'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [41]:
# Check status
client.batches.retrieve("batch_uTtjmaVn2mfulzyoNKgkIhmJ")

Batch(id='batch_uTtjmaVn2mfulzyoNKgkIhmJ', completion_window='24h', created_at=1724947388, endpoint='/v1/chat/completions', input_file_id='file-hu6HcZ5dOI76sFZ8hMnpOxC5', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725033788, failed_at=None, finalizing_at=None, in_progress_at=1724947388, metadata={'description': 'Transcript image processing batch job, part 1.'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=10, total=16))

In [43]:
# Function to check batch status
def check_batch_status(batch_id):
    """Check the status of a batch job and return the batch object."""
    batch = openai.Batch.retrieve(batch_id)
    return batch

# Function to download file from OpenAI ad save locally
def download_file(file_id, save_path):
    """Download a file from OpenAI and save it to the specified directory."""
    file_content = openai.File.download(file_id)
    
    # Write the file content to the specified path
    with open(save_path, "wb") as f:
        f.write(file_content)
    print(f"File downloaded and saved to: {save_path}")

# Check batch status until it's completed
batch_id = batch.id
print(f"Checking status of batch job with ID: {batch_id}...")

batch_status = check_batch_status(batch_id)

# Wait for batch completion
while batch_status.status not in ['completed', 'failed']:
    print(f"Current batch status: {batch_status.status}")
    batch_status = check_batch_status(batch_id)

# Once the batch is completed, download the output file
if batch_status.status == 'completed' and batch_status.output_file_id:
    output_file_id = batch_status.output_file_id
    output_directory = "Test_Data/Output_JSONL"  # Specify your output directory here
    os.makedirs(output_directory, exist_ok=True)
    output_file_path = os.path.join(output_directory, "output_file.jsonl")
    
    # Download the output file
    download_file(output_file_id, output_file_path)
else:
    print(f"Batch job failed or there is no output file available. Status: {batch_status.status}")

Checking status of batch job with ID: batch_mNvrFzYMUglQNiBy9OYZoMpY...


AttributeError: module 'openai' has no attribute 'Batch'