In [3]:
from imapclient import IMAPClient
from dotenv import load_dotenv
import os
import email
from email.header import decode_header
from bs4 import BeautifulSoup
from huggingface_hub import login
from datetime import datetime
load_dotenv()
login(token=os.getenv('TOKEN'))
e_pass = os.getenv('APP_PASS')


In [None]:
import os
import email
import re
import hashlib
from datetime import datetime
from imapclient import IMAPClient
from email.header import decode_header
from bs4 import BeautifulSoup
import short_url  # Library for encoding numeric IDs to short strings
from app import db
from models import Link

def format_link(url):
    """
    Shorten the URL using short_url.
    This function checks for an existing Link record for the URL; if not found, it creates one.
    Then it encodes the Link id to a short base62 string.
    """
    # Remove common trailing punctuation
    url = url.strip('.,;:()[]{}\'"')
    link = Link.query.filter_by(link=url).first()
    if not link:
        link = Link(link=url)
        db.session.add(link)
        db.session.commit()  # Commit so that link.id is generated
    short_code = short_url.encode_url(42)
    link.short = short_code
    return short_code

def decode_email_body(part):
    if not hasattr(part, 'get_payload'):
        return None

    payload = part.get_payload(decode=True)
    if not payload:
        return None

    encodings = [part.get_content_charset(), 'utf-8', 'iso-8859-1', 'windows-1252', 'ascii', 'latin1']
    encodings = [e for e in encodings if e]

    for encoding in encodings:
        try:
            return payload.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue

    try:
        return payload.decode(encodings[0], errors='replace')
    except (UnicodeDecodeError, LookupError, IndexError):
        return payload.decode('utf-8', errors='replace')

def safe_decode_header(header_value):
    if not header_value:
        return ""

    try:
        decoded_parts = decode_header(header_value)
        decoded_string = ""
        for part, charset in decoded_parts:
            if isinstance(part, bytes):
                try:
                    decoded_string += part.decode(charset or 'utf-8', errors='replace')
                except (UnicodeDecodeError, LookupError):
                    decoded_string += part.decode('utf-8', errors='replace')
            else:
                decoded_string += str(part)
        return decoded_string.strip()
    except Exception:
        return ""

def clean_email_text(text):
    if not text:
        return ""

    # Remove extraneous Unicode whitespace characters
    text = text.replace('\u200c', '').replace('\xa0', ' ')
    
    # Remove templating artifacts (e.g., "raw", "{% endraw", and "%}")
    text = re.sub(r'\s*raw\s*', ' ', text)
    text = re.sub(r'\{%\s*endraw\s*%}', ' ', text)
    text = re.sub(r'%}', ' ', text)
    
    # Collapse multiple spaces and newlines into a single space/newline
    text = re.sub(r'\s*\n\s*', '\n', text)  # clean up newlines
    text = re.sub(r'\n+', '\n', text)        # collapse multiple newlines
    text = re.sub(r'[ ]+', ' ', text)         # collapse extra spaces
    text = text.strip()

    # Replace any raw URLs not already converted into the [LINK: ...] format
    url_pattern = r'https?://[^\s<>"\']+'
    text = re.sub(url_pattern, lambda match: f"[LINK: {format_link(match.group(0))}]", text)

    # Optionally, remove non-ASCII characters (you can adjust this as needed)
    text = ''.join(char for char in text if ord(char) < 128)

    return text

def extract_email_content(msg):
    body = None
    html_content = None

    try:
        if msg.is_multipart():
            for part in msg.walk():
                content_type = part.get_content_type()
                content_disposition = str(part.get("Content-Disposition"))
                if content_type == "text/plain" and "attachment" not in content_disposition:
                    print('plain')
                    decoded_content = decode_email_body(part)
                    if decoded_content:
                        body = decoded_content
                        break  # Prefer plain text over HTML
                elif content_type == "text/html" and not body:
                    decoded_content = decode_email_body(part)
                    if decoded_content:
                        html_content = decoded_content
        else:
            body = decode_email_body(msg)

        # If no plain text version, process the HTML version.
        if not body and html_content:
            print('html')
            soup = BeautifulSoup(html_content, "html.parser")
            
            # Remove non-content elements
            for tag in soup(["script", "style", "head", "meta"]):
                tag.decompose()

            # Process images: replace with their alt text or src to preserve context.
            for img in soup.find_all("img"):
                alt_text = img.get("alt", "").strip()
                src = img.get("src", "").strip()
                replacement = f"[IMAGE: {alt_text or src or 'N/A'}]"
                img.replace_with(replacement)

            # Replace <br> tags with newline characters.
            for br in soup.find_all("br"):
                br.replace_with("\n")
                
            # Ensure block elements add line breaks.
            for tag in soup.find_all(["p", "div", "li", "h1", "h2", "h3", "h4", "h5", "h6"]):
                text = tag.get_text(separator=" ", strip=True)
                tag.clear()
                tag.append(text + "\n")
                
            # Process links: replace <a> tags with the anchor text plus a shortened URL.
            for a in soup.find_all("a"):
                href = a.get("href")
                anchor_text = a.get_text().strip()
                if href:
                    short_code = format_link(href)
                    replacement = f"{anchor_text} [LINK: {short_code}]" if anchor_text else f"[LINK: {short_code}]"
                else:
                    replacement = anchor_text
                a.replace_with(replacement)
                
            body = soup.find_all(text=True)

        return clean_email_text(body) if body else "No readable content found"
    except Exception as e:
        return f"Error extracting content: {str(e)}"

def get_emails(host, user_email, token, after_date, since_time=None, old=None):
    try:
        if host == 'gmail':
            host = 'imap.gmail.com'
            folder = 'INBOX'
        else:
            raise ValueError('Invalid host')

        try:
            parsed_date = datetime.strptime(after_date, "%m-%d-%y")
            since_date = parsed_date.strftime("%d-%b-%Y")
            if since_time:
                parsed_datetime = datetime.strptime(after_date + " " + since_time, "%m-%d-%y %H:%M:%S")
            else:
                parsed_datetime = None  # No time filtering
        except ValueError as e:
            raise ValueError(f"Invalid date format: {str(e)}")

        emails = []
        with IMAPClient(host) as client:
            client.login(user_email, token)
            client.select_folder(folder)
            messages = client.search(['SINCE', since_date])  # Fetch emails since the given date

            batch_size = 50
            for i in range(0, len(messages), batch_size):
                batch = messages[i:i+batch_size]
                if not batch:
                    continue

                response = client.fetch(batch, ["RFC822", "INTERNALDATE"])  # Fetch email and timestamp
                for msgid, data in response.items():
                    try:
                        raw_email = data[b"RFC822"]
                        msg = email.message_from_bytes(raw_email)
                        
                        # Extract the email's received time
                        internal_date = data[b"INTERNALDATE"]  # IMAP stores this as a datetime object
                        
                        # If since_time is provided, filter out emails before it
                        if parsed_datetime and internal_date < parsed_datetime:
                            continue  # Skip emails before the specified time

                        subject = safe_decode_header(msg["Subject"])
                        from_email = safe_decode_header(msg["From"])
                        print(msg)
                        body = extract_email_content(msg)
                        print(body)
                        new_email = {
                            'from': from_email,
                            'subject': subject,
                            'body': body
                        }
                        print(new_email)
                        if old is not None:
                            if new_email not in old:
                                emails.append(new_email)
                        else:
                            emails.append(new_email)
                    except Exception as e:
                        print(f"Error processing message {msgid}: {str(e)}")
                        continue
        return emails
    except Exception as e:
        raise Exception(f"Email fetching failed: {str(e)}")

e_pass = os.getenv('APP_PASS')
emails = get_emails('gmail', 'pautomas55@gmail.com', token=e_pass, after_date='2-28-25', since_time='18:30:00')

Delivered-To: pautomas55@gmail.com
Received: by 2002:a05:7301:160f:b0:15f:8810:7276 with SMTP id lz15csp1204917dyb;
        Fri, 28 Feb 2025 15:49:27 -0800 (PST)
X-Google-Smtp-Source: AGHT+IFOb78wkdLZMTjNZJx7rq61xHX4DEajEBlpZuo6NxUvtUsTg7RMj5877tMigzu5+kkU7156
X-Received: by 2002:a05:6214:e88:b0:6d4:1bad:740c with SMTP id 6a1803df08f44-6e8a0c87a97mr87289356d6.4.1740786566991;
        Fri, 28 Feb 2025 15:49:26 -0800 (PST)
ARC-Seal: i=1; a=rsa-sha256; t=1740786566; cv=none;
        d=google.com; s=arc-20240605;
        b=K0Dw5BUrBLkrh9CElROova1Y5VSx86VPU0I1MZuci9P/qV+8YVdjWgG6Dd9F6t1QSh
         CQN1J4k11HwRR8Iv019dfzTwJG7fbbVghnBeMoYVuHbt2YMcIA7lPDmApY8NlLW3G0l5
         1cIUKFNyBhtuefzey0J7foilGvvbxox/GRuQQsk4b9i6EgqD5EoNcXO94X7B+F7z/553
         p4FzvkTP+zajeQ/PP1IDH5SmKrZmMUyY+qaKFzV7C4Iy3HHQ/EhSnEO0KHbsAL9S4yZf
         JGF/YTrL/0turyIMLiUGD/iiY60npC6XrZxSxqvbeTbdtBLDX4kakRtfc+9rTFUuteuw
         sPPA==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc

In [6]:
print(emails)

[{'from': 'Mentality <hello@mentality.com>', 'subject': 'FLASH BUNDLE: 3 Tees & 1 Gym Bottle = $60...', 'body': '[24 Hours Left]([LINK: cftgg]\ntwitter.com/mentalityog\ninstagram.com/mentality\nNo longer want to receive these emails? [Unsubscribe]([LINK: cftgg]\nMentality'}, {'from': '"Caleb Hammer" <calebhammer@calebhammer.com>', 'subject': '👓 They’re Coming for Your Wallet…', 'body': 'Here\'s how to keep the IRS off your back this tax season\nF*ck.\n**Tax season is here, Paul.**\nIf you\'re working a regular 9-5 job\n**without** making side income and you\'re NOT claiming any tax deductions, then\n**this isn\'t a huge deal for you.**\n**Chances are your taxes are automatically taken out of each paycheck, so all you have to do is:**\n*\nFile your W-2\n* Twiddle your thumbs like a good little boy/girl\n* Wait for that sweet tax refund\n**As for everyone else who:**\n* Plans to claim tax deductions (anyone can do this),\n* Makes some side income separate from main job, or\n* Runs a bizn

'body': "Your daily roundup from The Washington Post.\n[[LINK: wezdg]\nAdvertisement [[LINK: jy7yj] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26]\nPUBLISHED BY\n[[LINK: 
y9cgp]\nVISIT MY POST\nRecommended For You\n[[LINK: rfp3r] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=static] [[LINK: zmff9] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=default&li_coord=desktop&collapse_width=600]\nAdvertisement [[LINK: zmff9] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=default&li_coord=mobile&collapse_width=600]\nAre you still interested in this newsletter?\nSince you haven't read in a while, we'll pause sending it to you. Let us know if\nyou still would like to keep receiving it.\nYes, keep this newsletter\nThis is your daily roundup\nof stories recommended just for you, based on your interests and reading\nhistory.\n[[LINK: mz7z5]\nPolitics\nLive updates: Trump cancels news conference with Zelensky after contentious Oval\nOffice meeting\nEurope\nTears and shock in Ukraine and Europe after heated Zelensky-Trump meeting\nNational\nGene Hackman probably died Feb. 17 based on pacemaker data, sheriff says\n[[LINK: 8sc7t] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=static] [[LINK: cjy6s] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=default&li_coord=desktop&collapse_width=600]\nAdvertisement [[LINK: cjy6s] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=default&li_coord=mobile&collapse_width=600]\nAnalysis Aaron Blake\n4 takeaways from the Trump-Zelensky meeting that devolved into a shouting match\nOpinion Jason Willick\nPam Bondis Jeffrey Epstein stunt reveals populisms limits\nADVERTISEMENT\nAdvertisement [[LINK: wcqj3] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26]\nPolitics\nFeds to start getting weekly emails asking what they did. Bosses will see if it\nfits Trump goals.\nNational\nWhat we know so far about the deaths of Gene Hackman and Betsy Arakawa\nADVERTISEMENT\nAdvertisement [[LINK: 6xqx7] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26]\nNational\nTrump officials start dismantling civil rights offices, as part of DOGEs secret\nplan\nEconomic Policy\nIRS rejected request for addresses of people believed to be in U.S. illegally\nOpinion Dana Milbank\nHeres the real threat to personal liberties and free markets\nManage your interests\nHow was today's newsletter?\nOption #1\n[[LINK: p3zuu]\nLike\nOption #2\n[[LINK: rptba]\nNot sure\nOption #3\n[[LINK: 5wppq]\nDislike\n[[LINK: ndtdz]\nYou received this email because you signed up for Recommended For You or because\nit is included in your subscription. Manage newsletters | Unsubscribe 1301 K St\nNW, Washington DC 20071\nDOWNLOAD OUR APP Apple App Store\n[[LINK: btpax]\nGoogle Play\n[[LINK: 4r74d]\n[[LINK: 9bcrf] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=pixel][[LINK: jqcfh] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=pixel][[LINK: gneqk] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=pixel][[LINK: v9qwe] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=pixel][[LINK: z7z5m] 13eea4e47739cbdd6327547632e4d412&p=67c24e01733acb32f293cc26{% end &stpe=pixel]\n[[LINK: wezdg]\nLiveIntent Logo\n[[LINK: mvttv]\nAdChoices Logo\n[[LINK: 8ppx5]\n2025 The Washington Post | Privacy Policy | Help & Contact",



In [None]:
import requests 
import json
import re
import ast
def extract_dict(text):
    # Regular expression to capture a dictionary in the string
    match = re.search(r'\{.*?\}', text, re.DOTALL)
    
    if match:
        dict_str = match.group(0)
        try:
            return ast.literal_eval(dict_str)  # Safely convert to a Python dict
        except (SyntaxError, ValueError):
            return None  # If parsing fails, return None
    
    return None  # No dictionary found
def process_email(email_content):
    system_prompt = """You are an email processing assistant. Your task is to clean up and reformat an email 
    into a human-readable format. Follow these steps:
    1. Simplify all links in the email body by replacing them with `[LINK: <URL>]`.
    2. Clean up the email body by removing excessive line breaks, spaces, and non-ASCII characters.
    3. Summarize the email in 1-2 sentences.
    4. Provide 1-3 action items, if applicable.
    5. Format the output as a Python dictionary with the following keys: 
    `from`, `subject`, `body`, `summary`, `action_items`, `sample_response`"""
    user_prompt = f"{email_content}\\n{system_prompt}"

    data = {
        'model': 'deepseek-r1',
        'messages': [{"role" : "user", "content": user_prompt}],
        "stream": False 
    } 
    url = "http://localhost:11434/api/chat"
    response = requests.post(url, json=data)
    parsed = json.loads(response.text)
    final = parsed['message']['content']
    print(final)
    final = extract_dict(final)
    return final

process_email({'from': '"Cleveland.com" <newsletters@update.cleveland.com>', 'subject': 'New❗ ALL ACCESS Offer: $1 for the first month',
 'body': """$1 for the first month\r\nSubscribe Now https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9kaWdpdGFsc3Vic2NyaXB0aW9uL2VtYWlsMi8_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620Bec2226d4\nThe perfect pair!\nUnlimited Digital Access to \nclevelandcom\n& \nThe Plain Dealer\n online newspaper\n 7 days a week\r\n$1 for the first month\r\nSubscribe Now https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9kaWdpdGFsc3Vic2NyaXB0aW9uL2VtYWlsMi8_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620Cec2226d4\nCancel Anytime\n https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9kaWdpdGFsc3Vic2NyaXB0aW9uL2VtYWlsMi8_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620Dec2226d4\nHaving trouble viewing this email? \nView in your browser https://link.cleveland.com/view/616a3a347350c918c70e4620mtzpq.49jl/3fbfcf69\nThis is an introductory offer not available to current subscribers At the end of your introductory period you will be charged per the terms of 
the offer unless you cancel your subscription Additional terms and conditions may apply\r\nTo ensure receipt of our emails please add \nnewsletters@update.cleveland.com mailto:newsletters@update.cleveland.com\n to your address book or safe sender list You received this email because you have opted in to a newsletter or are a former subscriber to \ncleveland.com https://link.cleveland.com/click/38350862.198993/aHR0cDovL2NsZXZlbGFuZC5jb20_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620B6e1320d0\n or The Plain Dealer\r\nMore ways to connect with \nclevelandcom\nContact us https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9jb250YWN0dXMvP3V0bV9zb3VyY2U9c2FpbHRocnUmdXRtX21lZGl1bT1lbWFpbCZ1dG1fY2FtcGFpZ249ZmViX2FsbGFjY2VzczI1X2RjcjIwMjUmdXRtX2NvbnRlbnQ9QUNRLUFESDAtNjAwMmEmbGlzdD1CQ19kY3JfcXBfZmViMjU/616a3a347350c918c70e4620Be4eb31a3\nDownload our app https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9tb2JpbGUtZGV2aWNlLz91dG1fc291cmNlPXNhaWx0aHJ1JnV0bV9tZWRpdW09ZW1haWwmdXRtX2NhbXBhaWduPWZlYl9hbGxhY2Nlc3MyNV9kY3IyMDI1JnV0bV9jb250ZW50PUFDUS1BREgwLTYwMDJhJmxpc3Q9QkNfZGNyX3FwX2ZlYjI1/616a3a347350c918c70e4620B38f9d624\nSign up for 
Custom Alerts https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9jdXN0b21hbGVydHM_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620Bd136f1f0\nPrivacy policy https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS9wcml2YWN5LXBvbGljeS8_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620B93c2c54b\nMore newsletters https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly9zdWJzY3JpcHRpb24uY2xldmVsYW5kLmNvbS9uZXdzbGV0dGVycy8_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620B4b59fbf2\nManage subscription https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly9jbGV2ZWxhbmQuY29tL215YWNjb3VudD91dG1fc291cmNlPXNhaWx0aHJ1JnV0bV9tZWRpdW09ZW1haWwmdXRtX2NhbXBhaWduPWZlYl9hbGxhY2Nlc3MyNV9kY3IyMDI1JnV0bV9jb250ZW50PUFDUS1BREgwLTYwMDJhJmxpc3Q9QkNfZGNyX3FwX2ZlYjI1/616a3a347350c918c70e4620B50ec5b8f\nUnsubscribe https://link.cleveland.com/click/38350862.198993/aHR0cDovL2xpbmsuY2xldmVsYW5kLmNvbS9tYW5hZ2UvNmZlL29wdG91dC1tYXJrZXRpbmc_ZW1haWw9cGF1dG9tYXM1NSU0MGdtYWlsLmNvbSZ1dG1fc291cmNlPXNhaWx0aHJ1JnV0bV9tZWRpdW09ZW1haWwmdXRtX2NhbXBhaWduPWZlYl9hbGxhY2Nlc3MyNV9kY3IyMDI1JnV0bV9jb250ZW50PUFDUS1BREgwLTYwMDJhJmxpc3Q9QkNfZGNyX3FwX2ZlYjI1/616a3a347350c918c70e4620B0d2c145a\n4800 Tiedeman Rd\nBrooklyn OH 44144\r\n https://link.cleveland.com/click/38350862.198993/aHR0cHM6Ly93d3cuY2xldmVsYW5kLmNvbS8_dXRtX3NvdXJjZT1zYWlsdGhydSZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1mZWJfYWxsYWNjZXNzMjVfZGNyMjAyNSZ1dG1fY29udGVudD1BQ1EtQURIMC02MDAyYSZsaXN0PUJDX2Rjcl9xcF9mZWIyNQ/616a3a347350c918c70e4620B4b84d17b'"""})


In [None]:
@app.route('/upload', methods=["GET", "POST"])
def upload_file():
    if request.method == "POST":
        if 'file' not in request.files:
            return render_template('upload.html', text='No file part')
        file = request.files['file']
        if file.filename == '':
            return render_template('upload.html', text='No selected file')
        
        if file:
            filename = secure_filename(file.filename)
            path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(path)
            return render_template('upload.html', text="Upload completed successfully")
    else:
        return render_template('upload.html', text='you alr know bro')

def process_mbox(file):
    with open(file, "r", encoding="utf-8") as f:
        content = f.read()
    return render_template('upload.html', text=content[:500])

In [None]:
if session.get('final_emails', False):
    last_load = session.get('last_load', datetime.now(timezone.utc))
    emails = session['final_emails']
    after_date = last_load.strftime("%m-%d-%y")  
    since_time = last_load.strftime("%H:%M:%S")  
    access_token = refresh(current_user)
    summaries = []
    new_emails = get_emails("gmail", current_user.email, access_token, after_date=after_date, 
            since_time=since_time)
    new_summaries = batch_get_summaries(new_emails)
    print(new_summaries)
    for email in emails:
        new_dict = {"summary": email["summary"], "action_items": email["action_items"]}
        summaries.append(new_dict)
    if new_emails:
        for i in range(0, len(new_emails), 5):
            print(i)
            processed_batch = process_email_batch(new_emails[i:i+5])
            for email in processed_batch:
                new_dict = {"summary": email["summary"], "action_items": email["action_items"]}
                summaries.append(new_dict)

    final = get_final_summary(summaries)
    print(final)
    final = textwrap.dedent(final)
    return render_template('summary.html', summary = final)
    

    
    
        


In [None]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time

def click_unsub(unsubscribe_url):
    """
    Attempts to unsubscribe from an email list by handling GET, POST, and JavaScript interactions automatically.
    """
    try:
        # 1. Attempt GET request (simple unsubscribe links)
        print(f"Attempting to unsubscribe via GET request: {unsubscribe_url}")
        response = requests.get(unsubscribe_url, allow_redirects=True)
        if response.status_code == 200 and "unsubscribed" in response.text.lower():
            print("Successfully unsubscribed via GET request!")
            return True
        
        # 2. Attempt POST request (if GET doesn't work)
        print("Attempting to unsubscribe via POST request...")
        post_data = {"unsubscribe": "yes", "confirm": "true", "action": "unsubscribe"}  # More comprehensive payload
        response = requests.post(unsubscribe_url, data=post_data)
        if response.status_code == 200 and "unsubscribed" in response.text.lower():
            print("Successfully unsubscribed via POST request!")
            return True
        
        # 3. If the page requires JavaScript, use Selenium
        print("Attempting to unsubscribe using Selenium (JavaScript interaction)...")
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
        
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(unsubscribe_url)
        
        # Wait for page to load with a timeout
        wait = WebDriverWait(driver, 10)
        
        # Look for various unsubscribe elements with a more comprehensive approach
        try:
            # Try buttons with "unsubscribe" text
            unsubscribe_elements = wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'unsubscribe') or contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'opt out') or contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remove me')]")
                )
            )
            
            if unsubscribe_elements:
                print(f"Found {len(unsubscribe_elements)} potential unsubscribe elements")
                unsubscribe_elements[0].click()
                print("Clicked unsubscribe element!")
                
                # Wait for confirmation or success message
                time.sleep(5)
                
                # Check for confirmation buttons if needed
                confirm_buttons = driver.find_elements(By.XPATH, "//*[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'confirm') or contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'yes') or contains(@value, 'confirm')]")
                if confirm_buttons:
                    confirm_buttons[0].click()
                    print("Clicked confirmation button!")
                    time.sleep(3)
                
                driver.quit()
                return True
        except TimeoutException:
            print("No unsubscribe buttons found with text search.")
            
            # Try finding form elements or inputs related to unsubscribing
            try:
                form_elements = driver.find_elements(By.XPATH, "//form[contains(@action, 'unsubscribe') or contains(@id, 'unsubscribe')]")
                if form_elements:
                    submit_buttons = form_elements[0].find_elements(By.XPATH, ".//input[@type='submit'] | .//button[@type='submit']")
                    if submit_buttons:
                        submit_buttons[0].click()
                        print("Clicked form submit button!")
                        time.sleep(3)
                        driver.quit()
                        return True
            except NoSuchElementException:
                pass
                
        # Last resort: try to find any checkbox and submit button combination
        try:
            checkboxes = driver.find_elements(By.XPATH, "//input[@type='checkbox']")
            for checkbox in checkboxes:
                if not checkbox.is_selected():
                    checkbox.click()
                    print("Clicked a checkbox!")
            
            submit_buttons = driver.find_elements(By.XPATH, "//button[@type='submit'] | //input[@type='submit']")
            if submit_buttons:
                submit_buttons[0].click()
                print("Clicked submit button after checkbox!")
                time.sleep(3)
                driver.quit()
                return True
        except:
            pass
            
        print("Page content for debugging:")
        print(driver.page_source[:500] + "...")  # Print first 500 chars for debugging
        
        driver.quit()
        print("Automated unsubscribe attempt failed. Manual action may be required.")
        return False
        
    except Exception as e:
        print(f"Error while unsubscribing: {e}")
        return False

click_unsub("https://s2.washingtonpost.com/wp-unsubscribe/newsletters?nlsendid=67ceb9636809892566f4da91&trackId=6309817ba08ec439dc9b0673")

Attempting to unsubscribe via GET request: https://s2.washingtonpost.com/wp-unsubscribe/newsletters?nlsendid=67ceb9636809892566f4da91&trackId=6309817ba08ec439dc9b0673


In [None]:
def process_mbox(file):
    print('Processing')
    mbox = mailbox.mbox(file)
    i = 0
    print(len(mbox.items()))
    senders = []
    for key, message in mbox.items():
        '''
        email_data = {
            'from': safe_decode_header(message.get('From', '')),
            'to': safe_decode_header(message.get('To', '')),
            'subject': safe_decode_header(message.get('Subject', '')),
            'date': message.get('Date', ''),
            'body': extract_email_content(message)
        }
        '''
        print(i)
        i += 1
        if 'spam' in message.get('X-Spam-Status', ''):
            print('spam') 
            continue 
        sender = safe_decode_header(message.get('From', ''))
        if sender not in senders:
            senders.append({sender: 1})
        else:
            senders[sender] = senders[sender] + 1
        for k in sorted(senders, key=senders.get, reverse=True):
            print(k, senders[k])
    
    # return render_template('upload.html', text='No emails found')

In [None]:
@app.route('/delete_sender', methods=["POST"])
def delete_sender():
    senders = request.json.get("senders") #??
    senders = list(senders) #??
    if session.get('filtered_mbox', False):
        orig_mbox_path = session['filtered_mbox']
    else:
        orig_mbox_path = session['mbox_path']

    if not os.path.exists(orig_mbox_path):
        return jsonify({"error": "Mbox file not found. Please try reuploading"}), 404
    
    print(orig_mbox_path)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mbox")
    temp_mbox_path = temp_file.name 

    old_mbox = mailbox.mbox(orig_mbox_path)
    new_mbox = mailbox.mbox(temp_mbox_path)

    remaining_senders = {}
    i = 0
    for message in old_mbox:
        print(i)
        i += 1
        sender = safe_decode_header(message.get('From', ''))
        if sender.lower() not in senders:
            new_mbox.add(message)
            if sender in remaining_senders:
                remaining_senders[sender] += 1 
            else:
                print(sender)
                remaining_senders[sender] = 1
    new_mbox.flush()

    session['filtered_mbox'] = temp_mbox_path 


    return jsonify({"senders": sorted(remaining_senders.items(), key=lambda x: x[1], reverse=True)})

In [None]:
def get_message_body(parts):
    """Extract the email body from message parts recursively"""
    body = ""
    for part in parts:
        if part.get('mimeType') == 'text/plain' and 'data' in part.get('body', {}):
            # Decode the body data
            data = part['body']['data']
            text = base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
            body += text
        elif part.get('mimeType') == 'text/html' and 'data' in part.get('body', {}):
            # If we have HTML, decode it too
            data = part['body']['data']
            html = base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
            body += html
        elif 'parts' in part:
            # Recursively process nested parts
            body += get_message_body(part['parts'])
    return body

In [4]:
from transformers import GemmaForCausalLM, GemmaTokenizer

# Load from local directory
model = GemmaForCausalLM.from_pretrained("MailMindSummarization")
tokenizer = GemmaTokenizer.from_pretrained("MailMindSummarization")

You are using a model of type gemma3 to instantiate a model of type gemma. This is not supported for all configurations of models and can yield errors.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


AttributeError: 'dict' object has no attribute 'to_dict'