In [1]:
import os
import openai
import numpy as np
import base64
import psycopg2
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from datetime import datetime, timedelta

In [2]:
from dotenv import load_dotenv # used to load env variables
load_dotenv()

True

In [3]:
# Define your time range
START_DATE = "2025-02-10"  # Change to your desired start date (YYYY-MM-DD)
END_DATE = "2025-02-18"    # Change to your desired end date (YYYY-MM-DD)

# Gmail API scope
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# Connect to Gmail API
def authenticate_gmail():
    creds = None
    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)
    if not creds or not creds.valid:
        flow = InstalledAppFlow.from_client_secrets_file("google_creds.json", SCOPES)
        creds = flow.run_local_server(port=0)
        with open("token.json", "w") as token:
            token.write(creds.to_json())
    return build("gmail", "v1", credentials=creds)

def date_to_unix(date_str):
    dt = datetime.strptime(date_str, "%Y-%m-%d")
    return int(dt.timestamp())  # Convert to Unix timestamp (seconds)

# Get emails within the time range
def get_emails(service):
    query = f"after:{date_to_unix(START_DATE)} before:{date_to_unix(END_DATE)}"
    results = service.users().messages().list(userId="me", q=query, maxResults=10).execute()
    messages = results.get("messages", [])
        
    emails = []
    for msg in messages:
        msg_data = service.users().messages().get(userId="me", id=msg["id"]).execute()
        payload = msg_data.get("payload", {})
        
        # Extract headers
        headers = {header["name"]: header["value"] for header in payload.get("headers", [])}
        sender = headers.get("From", "Unknown Sender")
        subject = headers.get("Subject", "No Subject")
        date = headers.get("Date", "Unknown Date")
        
        # Extract email content
        body = ""
        if "parts" in payload:
            for part in payload["parts"]:
                if part["mimeType"] == "text/plain":
                    body = base64.urlsafe_b64decode(part["body"]["data"]).decode("utf-8")
        elif "body" in payload and "data" in payload["body"]:
            body = base64.urlsafe_b64decode(payload["body"]["data"]).decode("utf-8")
    
        emails.append((sender, subject, date, body))
    return emails

service = authenticate_gmail()
emails = get_emails(service)

In [4]:
len(emails)

10

In [None]:
# 1) brew install postgresql
# 2) brew services start postgresql

# 3) now that postgres is running in the terminal create table 
# CREATE DATABASE emails_db;
# CREATE USER your_user WITH PASSWORD 'your_password';
# GRANT ALL PRIVILEGES ON DATABASE emails_db TO your_user;


In [28]:
from openai import OpenAI
client = OpenAI()

# DB Configuration (Assuming you already have this)
DB_CONFIG = {
    'dbname': 'emails_db',
    'user': 'alexanderbarriga03',
    'password': 'water03',
    'host': 'localhost',
    'port': '5432'
}

# Function to vectorize text using OpenAI API
def vectorize_text(text):
    response = client.embeddings.create(
        model="text-embedding-3-small",  
        input=text
    )
    return response.data[0].embedding

def store_emails_in_db(emails):
    conn = psycopg2.connect(**DB_CONFIG)
    cursor = conn.cursor()
    
    # Modify table to store vector as BYTEA (binary data)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS emails (
            id SERIAL PRIMARY KEY,
            sender TEXT,
            subject TEXT,
            timestamp TEXT,
            body TEXT,
            body_vector BYTEA  -- To store the vector
        );
    """)
    conn.commit()

    for email in emails:
        # Vectorize the email body using OpenAI's model
        body_vector = vectorize_text(email[-1])  # Vectorize the body of the email
        body_vector_bytes = np.array(body_vector, dtype=np.float32).tobytes()  # Convert to bytes
        
        # Insert the email data into the database, including the vector
        cursor.execute("""
            INSERT INTO emails (sender, subject, timestamp, body, body_vector)
            VALUES (%s, %s, %s, %s, %s)
        """, (email['sender'], email['subject'], email['timestamp'], email['body'], body_vector_bytes))
        
    conn.commit()
    cursor.close()
    conn.close()       

In [None]:
store_emails_in_db(emails)

TypeError: tuple indices must be integers or slices, not str

In [10]:
emails[5]

('LinkedIn Job Alerts <jobalerts-noreply@linkedin.com>',
 '“data scientist”: Eczacıbaşı Bilişim - DATA SCIENTIST and more',
 'Tue, 18 Feb 2025 04:57:26 +0000 (UTC)',
 'Your job alert for data scientist\r\n30+ new jobs match your preferences.\r\n          \r\nDATA SCIENTIST\r\nEczacıbaşı Bilişim\r\nTürkiye\r\nView job: https://www.linkedin.com/comm/jobs/view/4152720719/?trackingId=Xz%2FAp4wX%2BzG6a4eksFbUdw%3D%3D&refId=ByteString%28length%3D16%2Cbytes%3De16ba638...9079663b%29&lipi=urn%3Ali%3Apage%3Aemail_email_job_alert_digest_01%3BXzBVG9w3R8Wg5eP%2FBYaagg%3D%3D&midToken=AQEsx4nyGTXpHQ&midSig=0EKQFWn239tXE1&trk=eml-email_job_alert_digest_01-job_card-0-view_job&trkEmail=eml-email_job_alert_digest_01-job_card-0-view_job-null-5bx38z~m7a0j5c6~6p-null-null&eid=5bx38z-m7a0j5c6-6p&otpToken=MTEwNzFjZTMxNzJlY2NjNWIxMjQwNGVkNDUxN2VmYjE4YmM4ZDQ0NDkwYWQ4ODYxNzBjNTA5NmM0NjUzNWZmNmY2ZDJkZmEwNmRmOWYwZmI1MGIzY2NiNDJkM2RmOWNkNjJlNWJmZGYyMzI4YTkyMzNkYjQzOSwxLDE%3D\r\n\r\n---------------------------------

In [None]:
        
    conn.commit()
    cursor.close()
    conn.close()


store_emails_in_db(emails)
print(f"Stored {len(emails)} emails in the database.")

In [46]:
vectorize_text(emails[0]['body'])

TypeError: tuple indices must be integers or slices, not str

In [47]:
response = openai.Embedding.create(
    model="text-embedding-3-small",  
    input=emails[0]['body']
)
response['data'][0]['embedding']

TypeError: tuple indices must be integers or slices, not str

In [60]:
emails[0][-1]

'<!DOCTYPE html\n  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<html\n  style="width:100%;font-family:helvetica, \'helvetica neue\', arial, verdana, sans-serif;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;padding:0;Margin:0;">\n\n<head>\n  <meta charset="UTF-8">\n  <meta content="width=device-width, initial-scale=1" name="viewport">\n  <meta name="x-apple-disable-message-reformatting">\n  <meta http-equiv="X-UA-Compatible" content="IE=edge">\n  <meta content="telephone=no" name="format-detection">\n  <title>Amelia Mink posted in Data Science for Sustainability</title>\n  <!--[if (mso 16)]>\n  <style type="text/css">\n    a {text-decoration: none;}\n  </style>\n  <![endif]-->\n  <!--[if gte mso 9]>\n  <style>sup { font-size: 100% !important; }</style>\n  <![endif]-->\n  <style type="text/css" data-start-index="742" data-end-index="765">\n    @font-face {\n      font-family: \'Graphik Medium\';\n      src: url(\