Connected to Python 3.13.6

In [None]:
# collect_emails.py - FIXED VERSION
import imaplib
import email
from email import policy
from email.parser import BytesParser
from datetime import datetime, timedelta
import os
import re
import argparse
from bs4 import BeautifulSoup
import lxml
from dotenv import load_dotenv

# Runs at import time, before main() looks up GMAIL_USERNAME, GMAIL_PASSWORD,
# OUTLOOK_USERNAME and OUTLOOK_PASSWORD via os.getenv().
load_dotenv()  # Load environment variables from .env file

def chunk_text(text, max_length=1000):
    """
    Clean raw email text and split it into sentence-aligned chunks.

    The text is reduced to ASCII, stripped of quoting markers (runs of '>'),
    separator rules (runs of '-' or '_') and URLs, and has all whitespace
    collapsed to single spaces. It is then split after '.', '!' or '?' and the
    sentences are packed greedily into chunks.

    Args:
        text: The text to clean and split. Anything that is not a non-empty
            str yields an empty list.
        max_length: Upper bound used when packing sentences into a chunk. A
            single sentence longer than this is still emitted as its own
            (oversized) chunk; sentences are never cut.

    Returns:
        A list of non-empty chunk strings with single spaces between sentences.
    """
    if not text or not isinstance(text, str):
        return []

    # Imported locally so the function does not depend on file-level imports.
    import unicodedata

    # NFKD decomposes accented characters into base letter + combining mark,
    # so dropping non-ASCII afterwards keeps the base letter ("café" -> "cafe")
    # instead of deleting the whole character.
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove sequences of '>' used in email threads
    text = re.sub(r'\s*(?:>\s*){2,}', ' ', text)

    # Remove separator rules made of dashes or underscores
    text = re.sub(r'-{3,}', ' ', text)
    text = re.sub(r'_{3,}', ' ', text)

    # Drop URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Normalize whitespace to single spaces, strip leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Split text into sentences while preserving punctuation
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 < max_length:
            # Keep the trailing space: it is the separator between sentences.
            current_chunk += sentence + " "
        else:
            if current_chunk.strip():  # Only add non-empty chunks
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk.strip():  # Add final chunk if not empty
        chunks.append(current_chunk.strip())

    return chunks

def save_chunks_to_vault(chunks, vault_path="vault.txt"):
    """
    Append text chunks to the vault file, one chunk per line.

    Args:
        chunks: Iterable of strings. Blank chunks are skipped and surrounding
            whitespace is stripped before writing.
        vault_path: File to append to. Defaults to "vault.txt" in the current
            working directory, which is what existing callers rely on.

    Errors (unwritable path, non-string chunk, ...) are reported on stdout and
    swallowed so one bad email does not abort the whole collection run.
    """
    if not chunks:
        return

    try:
        # Build the lines first so the file is written in one batched call.
        lines = [chunk.strip() + "\n" for chunk in chunks if chunk.strip()]
        with open(vault_path, "a", encoding="utf-8") as vault_file:
            vault_file.writelines(lines)
    except Exception as e:
        print(f"Error saving chunks to vault: {e}")

def get_text_from_html(html_content):
    """
    Extract the human-readable text from an HTML document.

    Args:
        html_content: HTML markup as a string. Falsy input yields "".

    Returns:
        The document text with a space between the text of adjacent elements,
        so that e.g. "<p>Hi</p><p>there</p>" does not become "Hithere".
        Contents of <script> and <style> elements are excluded.

    If the HTML parser fails for any reason, a crude regex-based tag stripper
    is used instead and the error is reported on stdout.
    """
    if not html_content:
        return ""

    try:
        soup = BeautifulSoup(html_content, 'lxml')
        # Script and style bodies are code, not message text.
        for tag in soup(['script', 'style']):
            tag.decompose()
        return soup.get_text(separator=' ')
    except Exception as e:
        print(f"Error parsing HTML content: {e}")
        # Fallback to basic text extraction; tags become spaces so words that
        # were only separated by markup stay separated.
        return re.sub(r'<[^>]+>', ' ', html_content)

def get_charset_with_fallback(part):
    """
    Return the charset(s) to try when decoding a message part.

    Args:
        part: An email message (part) object exposing get_content_charset().

    Returns:
        The declared charset as a str when the part declares one; otherwise a
        list of fallback charset names ordered from strictest to most
        permissive. decode_content_safely() accepts either form.
    """
    charset = part.get_content_charset()
    if charset:
        return charset

    # Order matters: iso-8859-1 can decode *any* byte sequence, so it must be
    # last or the entries after it would never be tried. utf-8 already covers
    # pure ASCII, so no separate 'ascii' entry is needed.
    fallback_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
    return fallback_charsets

def decode_content_safely(content, charsets):
    """
    Decode bytes by trying each candidate charset in turn.

    `charsets` may be a single charset name or a sequence of names. The first
    charset that decodes without error wins; every failed attempt is reported
    on stdout. When no candidate works, the content is decoded as UTF-8 with
    undecodable bytes dropped, and as a very last resort str() is applied.
    """
    candidates = [charsets] if isinstance(charsets, str) else charsets

    for name in candidates:
        try:
            decoded = content.decode(name)
        except (UnicodeDecodeError, LookupError) as e:
            print(f"Failed to decode with {name}: {e}")
        else:
            return decoded

    # None of the candidates worked: lossy UTF-8, then plain str().
    try:
        lossy = content.decode('utf-8', errors='ignore')
    except Exception:
        return str(content)
    return lossy

def _decode_text_part(part):
    """Decode one message part's payload to str; return "" when it has no payload."""
    payload = part.get_payload(decode=True)
    if not payload:
        return ""
    return decode_content_safely(payload, get_charset_with_fallback(part))


def save_plain_text_content(email_bytes, email_id):
    """
    Extract the text of a raw email, chunk it and append it to the vault.

    text/plain parts are preferred. text/html parts are only used when the
    message has no non-blank plain text, because multipart/alternative emails
    carry the same body in both forms and using both would store every
    message twice.

    Args:
        email_bytes: The raw RFC 822 message as bytes.
        email_id: Identifier used only in log messages.

    Returns:
        The extracted text, or "" when nothing usable was found or an error
        occurred (errors are reported on stdout, never raised).
    """
    try:
        msg = BytesParser(policy=policy.default).parsebytes(email_bytes)
        multipart = msg.is_multipart()

        plain_parts = []
        html_parts = []

        for part in (msg.walk() if multipart else [msg]):
            try:
                content_type = part.get_content_type()
                if content_type not in ('text/plain', 'text/html'):
                    continue
                decoded = _decode_text_part(part)
                if not decoded:
                    continue
                if content_type == 'text/plain':
                    plain_parts.append(decoded)
                else:
                    html_parts.append(get_text_from_html(decoded))
            except Exception as e:
                if multipart:
                    print(f"Error processing email part: {e}")
                else:
                    print(f"Error processing single-part email: {e}")
                continue

        # Newline-join so the last word of one part cannot fuse with the
        # first word of the next.
        text_content = "\n".join(plain_parts)
        if not text_content.strip():
            text_content = "\n".join(html_parts)

        if text_content.strip():  # Only process non-empty content
            chunks = chunk_text(text_content)
            save_chunks_to_vault(chunks)
            return text_content
        else:
            print(f"No text content found in email ID: {email_id}")
            return ""

    except Exception as e:
        print(f"Error processing email ID {email_id}: {e}")
        return ""

def search_and_process_emails(imap_client, email_source, search_keyword, start_date, end_date):
    """
    Search the selected mailbox and feed every matching email to the vault.

    Args:
        imap_client: A logged-in imaplib client with a mailbox selected.
        email_source: Human-readable account name used in log messages.
        search_keyword: Text to look for in message bodies; falsy to disable.
        start_date: IMAP-formatted date (e.g. "01-Jan-2024") for SINCE.
        end_date: IMAP-formatted date for BEFORE (exclusive, per IMAP).
            The date filter is applied only when both dates are given.

    All errors are reported on stdout; nothing is raised to the caller.
    """
    try:
        # Build search criteria more carefully
        search_parts = []

        if start_date and end_date:
            search_parts.append(f'(SINCE "{start_date}" BEFORE "{end_date}")')

        if search_keyword:
            # Inside an IMAP quoted string both backslash and double quote
            # must be escaped; backslash first so the escapes we add for the
            # quotes are not doubled.
            escaped_keyword = search_keyword.replace('\\', '\\\\').replace('"', '\\"')
            search_parts.append(f'BODY "{escaped_keyword}"')

        search_criteria = ' '.join(search_parts) if search_parts else 'ALL'

        print(f"Using search criteria for {email_source}: {search_criteria}")

        typ, data = imap_client.search(None, search_criteria)
        if typ != 'OK' or not data or not data[0]:
            print(f"No emails found matching criteria in {email_source}.")
            return

        email_ids = data[0].split()
        print(f"Found {len(email_ids)} emails matching criteria in {email_source}.")

        success_count = 0
        for num in email_ids:
            email_id = num.decode('utf-8', errors='replace')
            try:
                # BODY.PEEK[] returns the same full message as RFC822 but
                # does not set the \Seen flag, so collecting emails does not
                # mark the user's inbox as read.
                typ, email_data = imap_client.fetch(num, '(BODY.PEEK[])')
                # The message literal arrives as a (header, bytes) tuple;
                # servers may interleave plain bytes items, so look for it.
                raw_message = next(
                    (item[1] for item in (email_data or [])
                     if isinstance(item, tuple) and len(item) > 1),
                    None,
                )
                if typ == 'OK' and raw_message:
                    print(f"Processing email ID: {email_id} from {email_source}")
                    result = save_plain_text_content(raw_message, email_id)
                    if result:
                        success_count += 1
                else:
                    print(f"Failed to fetch email ID: {email_id} from {email_source}")
            except Exception as e:
                print(f"Error processing email {email_id}: {e}")
                continue

        print(f"Successfully processed {success_count}/{len(email_ids)} emails from {email_source}")

    except Exception as e:
        print(f"Error searching emails in {email_source}: {e}")
def connect_to_email_server(server, username, password, email_source):
    """
    Open an IMAP-over-SSL session, log in and select the inbox.

    Args:
        server: IMAP host name.
        username: Account login; falsy means "not configured".
        password: Account password; falsy means "not configured".
        email_source: Human-readable account name used in log messages.

    Returns:
        The ready-to-use imaplib.IMAP4_SSL client, or None when credentials
        are missing or any step fails (the reason is printed to stdout).
    """
    if not username or not password:
        print(f"Missing credentials for {email_source}. Skipping...")
        return None

    imap_client = None
    try:
        imap_client = imaplib.IMAP4_SSL(server)
        imap_client.login(username, password)
        # select() reports failure through its status, not an exception.
        typ, data = imap_client.select('inbox')
        if typ != 'OK':
            raise imaplib.IMAP4.error(f"could not select inbox: {data}")
        print(f"Successfully connected to {email_source}")
        return imap_client
    except Exception as e:
        print(f"Failed to connect to {email_source}: {e}")
        if imap_client is not None:
            # Do not leak the socket when login/select fails after connecting.
            try:
                imap_client.logout()
            except Exception:
                # Best effort: the connection may already be unusable.
                pass
        return None

def _format_imap_date(moment):
    """Format a date as DD-Mon-YYYY with English month names, as IMAP requires."""
    # strftime('%b') follows the process locale and would emit e.g. "Mär".
    months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    return f"{moment.day:02d}-{months[moment.month - 1]}-{moment.year:04d}"


def main():
    """
    Command-line entry point: collect emails from the configured accounts.

    Parses --keyword / --startdate / --enddate, connects to each account whose
    credentials are present in the environment (GMAIL_* and OUTLOOK_*), and
    runs the search on it. Each account is connected, processed and logged
    out before the next one is opened, so no session sits idle while another
    account is being processed.
    """
    parser = argparse.ArgumentParser(description="Search and process emails based on optional keyword and date range.")
    parser.add_argument("--keyword", help="The keyword to search for in the email bodies.", default="")
    parser.add_argument("--startdate", help="Start date in DD.MM.YYYY format.", required=False)
    parser.add_argument("--enddate", help="End date (inclusive) in DD.MM.YYYY format.", required=False)
    args = parser.parse_args()

    start_date = None
    end_date = None

    # Check if both start and end dates are provided and valid
    if args.startdate and args.enddate:
        try:
            range_start = datetime.strptime(args.startdate, "%d.%m.%Y")
            range_end = datetime.strptime(args.enddate, "%d.%m.%Y")
        except ValueError as e:
            print(f"Error: Date format is incorrect. Please use DD.MM.YYYY format. Details: {e}")
            return
        if range_start > range_end:
            print("Error: Start date must not be after end date.")
            return
        start_date = _format_imap_date(range_start)
        # IMAP BEFORE is exclusive; add a day so the end date itself is
        # included and a same-day range matches that day's mail.
        end_date = _format_imap_date(range_end + timedelta(days=1))
    elif args.startdate or args.enddate:
        print("Both start date and end date must be provided together.")
        return

    # (server, username, password, display name); credentials come from the
    # environment.
    accounts = [
        ('imap.gmail.com', os.getenv('GMAIL_USERNAME'), os.getenv('GMAIL_PASSWORD'), "Gmail"),
        ('imap-mail.outlook.com', os.getenv('OUTLOOK_USERNAME'), os.getenv('OUTLOOK_PASSWORD'), "Outlook"),
    ]

    connected = 0
    for server, username, password, source in accounts:
        client = connect_to_email_server(server, username, password, source)
        if not client:
            continue
        connected += 1
        try:
            search_and_process_emails(client, source, args.keyword, start_date, end_date)
        except Exception as e:
            print(f"Error processing {source} emails: {e}")
        finally:
            try:
                client.logout()
            except Exception as e:
                print(f"Error closing {source} connection: {e}")

    if not connected:
        print("No email connections established. Please check your credentials.")
        return

    print("Email processing completed!")

if __name__ == "__main__":
    main()


# emailrag2.py - FIXED VERSION
import torch
import ollama
import os
import json
from openai import OpenAI
import argparse
import yaml

# ANSI escape codes for colors
# Printing one of these switches the terminal's foreground color for the text
# that follows; RESET_COLOR must be printed afterwards to restore the default.
PINK = '\033[95m'  # not referenced in the visible part of this file
CYAN = '\033[96m'  # wraps the retrieved context printed by ollama_chat
YELLOW = '\033[93m'  # wraps the input prompt in main
NEON_GREEN = '\033[92m'  # wraps the model response printed by main
RESET_COLOR = '\033[0m'  # appended after every colored span

def load_config(config_file):
    """Load and validate the YAML configuration.

    Args:
        config_file: Path to the YAML configuration file.

    Returns:
        The parsed configuration as a dict.

    Raises:
        SystemExit: With exit code 1 when the file is missing, is not valid
            YAML, is not a mapping, or lacks a required key. The reason is
            printed to stdout first.
    """
    print("Loading configuration...")
    try:
        # YAML files are UTF-8; do not depend on the platform default.
        with open(config_file, 'r', encoding='utf-8') as file:
            config = yaml.safe_load(file)
    except FileNotFoundError:
        print(f"Configuration file '{config_file}' not found.")
        raise SystemExit(1)
    except yaml.YAMLError as e:
        print(f"Error parsing YAML configuration: {e}")
        raise SystemExit(1)

    # An empty file parses to None and a scalar/list is not a usable config.
    if not isinstance(config, dict):
        print(f"Configuration file '{config_file}' must contain a YAML mapping.")
        raise SystemExit(1)

    # Validate required config keys; 'ollama_api' is read by main() when it
    # creates the chat client, so check it here rather than failing later.
    required_keys = ['vault_file', 'embeddings_file', 'ollama_model', 'top_k',
                     'system_message', 'ollama_api']
    missing = [key for key in required_keys if key not in config]
    for key in missing:
        print(f"Missing required configuration key: {key}")
    if missing:
        raise SystemExit(1)

    api_settings = config['ollama_api']
    for key in ('base_url', 'api_key'):
        if not isinstance(api_settings, dict) or key not in api_settings:
            print(f"Missing required configuration key: ollama_api.{key}")
            raise SystemExit(1)

    return config

def open_file(filepath):
    """Return the UTF-8 text of `filepath`, or None when it cannot be read."""
    print(f"Opening file: {filepath}")
    try:
        with open(filepath, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        # Missing file gets its own, friendlier message.
        print(f"File '{filepath}' not found.")
        return None
    except Exception as e:
        print(f"Error reading file '{filepath}': {e}")
        return None
    return contents

def load_or_generate_embeddings(vault_content, embeddings_file):
    """Return the vault embeddings, using the on-disk cache when it is usable.

    The cache is used only when it parses, is non-empty and has exactly one
    row per vault item. A cache with a different row count is stale (the
    vault changed since it was written) and would pair embeddings with the
    wrong text, so embeddings are regenerated and the cache rewritten.

    Args:
        vault_content: Sequence of text items, one per embedding.
        embeddings_file: Path of the JSON cache file.

    Returns:
        A torch tensor of embeddings, or an empty tensor when none could be
        loaded or generated.
    """
    expected_rows = len(vault_content) if vault_content else 0

    if os.path.exists(embeddings_file):
        print(f"Loading embeddings from '{embeddings_file}'...")
        try:
            with open(embeddings_file, "r", encoding="utf-8") as file:
                embeddings_data = json.load(file)
            if not embeddings_data:
                print("Embeddings file is empty. Generating new embeddings...")
            elif len(embeddings_data) != expected_rows:
                print(
                    f"Cached embeddings ({len(embeddings_data)}) do not match "
                    f"vault content ({expected_rows} items). Regenerating..."
                )
            else:
                return torch.tensor(embeddings_data)
        except json.JSONDecodeError as e:
            print(f"Invalid JSON format in embeddings file '{embeddings_file}': {e}")
        except Exception as e:
            # Includes torch.tensor() rejecting malformed (e.g. ragged) data.
            print(f"Error loading embeddings: {e}")

    print("Generating new embeddings...")
    embeddings = generate_embeddings(vault_content)
    if embeddings:
        save_embeddings(embeddings, embeddings_file)
        return torch.tensor(embeddings)
    else:
        return torch.empty(0)

def generate_embeddings(vault_content, model='mxbai-embed-large'):
    """Generate one embedding per vault item, keeping list positions aligned.

    Retrieval looks text up by embedding index, so row i of the result must
    belong to vault_content[i]. Items that are blank or whose embedding
    request fails therefore get an all-zero placeholder vector instead of
    being dropped, which would shift every later row.

    Args:
        vault_content: Sequence of text items to embed.
        model: Name of the Ollama embedding model to use.

    Returns:
        A list with len(vault_content) embedding vectors, or [] when there is
        no content or not a single embedding could be generated.
    """
    print("Generating embeddings...")

    if not vault_content:
        print("No vault content to generate embeddings for.")
        return []

    # One slot per content item; None marks "no embedding available".
    slots = []
    for i, content in enumerate(vault_content):
        text = content.strip()
        if not text:  # Nothing to embed
            slots.append(None)
            continue

        vector = None
        try:
            response = ollama.embeddings(model=model, prompt=text)
            if 'embedding' in response:
                vector = response["embedding"]
            else:
                print(f"No embedding returned for content item {i}")
        except Exception as e:
            print(f"Error generating embedding for content item {i}: {e}")
        slots.append(vector)

    generated = [vector for vector in slots if vector is not None]
    print(f"Generated {len(generated)} embeddings from {len(vault_content)} content items")

    if not generated:
        return []

    # The placeholder needs the model's dimensionality, which is only known
    # once at least one real embedding has come back.
    dimension = len(generated[0])
    return [vector if vector is not None else [0.0] * dimension for vector in slots]

def save_embeddings(embeddings, embeddings_file):
    """Write the embeddings to `embeddings_file` as JSON, creating its folder if needed."""
    print(f"Saving embeddings to '{embeddings_file}'...")
    try:
        # A bare file name has no directory component; use the current one.
        target_dir = os.path.dirname(embeddings_file)
        if not target_dir:
            target_dir = '.'
        os.makedirs(target_dir, exist_ok=True)

        with open(embeddings_file, "w", encoding="utf-8") as out:
            json.dump(embeddings, out)
    except Exception as e:
        print(f"Error saving embeddings: {e}")
    else:
        print(f"Successfully saved {len(embeddings)} embeddings")

def get_relevant_context(rewritten_input, vault_embeddings, vault_content, top_k):
    """Return the vault entries most similar to the query, best match first.

    The query is embedded with the 'mxbai-embed-large' model and compared to
    every row of `vault_embeddings` by cosine similarity. At most `top_k`
    stripped, non-blank entries of `vault_content` are returned. Any problem
    (blank query, no embeddings, no content, embedding failure) yields [].
    """
    print("Retrieving relevant context...")

    query_is_blank = not (rewritten_input and rewritten_input.strip())
    if query_is_blank:
        print("Empty input provided")
        return []

    if vault_embeddings.nelement() == 0:
        print("No embeddings available")
        return []

    if not vault_content:
        print("No vault content available")
        return []

    try:
        reply = ollama.embeddings(model='mxbai-embed-large', prompt=rewritten_input)
        if 'embedding' not in reply:
            print("Failed to generate input embedding")
            return []

        # Shape the query as a single-row batch so it broadcasts over the vault.
        query_vector = torch.tensor(reply["embedding"]).unsqueeze(0)
        scores = torch.cosine_similarity(query_vector, vault_embeddings)

        # Never ask topk for more rows than exist.
        k = min(top_k, len(scores))
        if k <= 0:
            return []

        best_rows = torch.topk(scores, k=k).indices.tolist()
        content_count = len(vault_content)
        return [
            vault_content[row].strip()
            for row in best_rows
            if row < content_count and vault_content[row].strip()
        ]
    except Exception as e:
        print(f"Error getting relevant context: {e}")
        return []

def ollama_chat(user_input, system_message, vault_embeddings, vault_content, ollama_model, conversation_history, top_k, client):
    """Answer one user question, grounding it in retrieved vault context.

    Args:
        user_input: The user's question.
        system_message: System prompt sent ahead of the conversation.
        vault_embeddings: Tensor of vault embeddings used for retrieval.
        vault_content: Vault text items matching the embedding rows.
        ollama_model: Name of the chat model to query.
        conversation_history: List of message dicts; on success the user turn
            (with context prepended) and the assistant turn are appended to
            it in place. On failure it is left exactly as it was.
        top_k: Maximum number of context items to retrieve.
        client: OpenAI-compatible client whose chat.completions.create is called.

    Returns:
        The assistant's reply, or an "Error in Ollama chat: ..." string when
        anything fails (the error is also printed).
    """
    # Remember where this turn starts so a failed request can be rolled back.
    history_length = len(conversation_history)
    try:
        relevant_context = get_relevant_context(user_input, vault_embeddings, vault_content, top_k)

        if relevant_context:
            context_str = "\n".join(relevant_context)
            print("Context Pulled from Documents: \n\n" + CYAN + context_str + RESET_COLOR)
            user_input_with_context = context_str + "\n\n" + user_input
        else:
            print("No relevant context found.")
            user_input_with_context = user_input

        conversation_history.append({"role": "user", "content": user_input_with_context})
        messages = [{"role": "system", "content": system_message}] + conversation_history

        response = client.chat.completions.create(
            model=ollama_model,
            messages=messages,
            max_tokens=2000,
            temperature=0.7
        )

        # The content may be None; callers concatenate the result with
        # strings, so normalize it to "".
        assistant_response = response.choices[0].message.content or ""
        conversation_history.append({"role": "assistant", "content": assistant_response})

        return assistant_response

    except Exception as e:
        # Drop the unanswered user turn; otherwise the next request would
        # carry two consecutive user messages.
        del conversation_history[history_length:]
        error_msg = f"Error in Ollama chat: {e}"
        print(error_msg)
        return error_msg

def main():
    """Command-line entry point for the interactive email RAG chat.

    Loads the configuration and the vault, loads or builds the embeddings,
    then answers questions in a loop until the user types quit/exit/q,
    presses Ctrl+C, or standard input is closed.
    """
    parser = argparse.ArgumentParser(description="Ollama Chat with Email RAG")
    parser.add_argument("--config", default="config.yaml", help="Path to the configuration file")
    parser.add_argument("--clear-cache", action="store_true", help="Clear the embeddings cache")
    # Only the chat model is overridden here; embeddings use their own model.
    parser.add_argument("--model", help="Chat model to use for responses (overrides ollama_model in the config)")

    args = parser.parse_args()

    try:
        config = load_config(args.config)

        if args.clear_cache and os.path.exists(config["embeddings_file"]):
            print(f"Clearing embeddings cache at '{config['embeddings_file']}'...")
            os.remove(config["embeddings_file"])

        if args.model:
            config["ollama_model"] = args.model

        if not os.path.exists(config["vault_file"]):
            print(f"Vault file '{config['vault_file']}' not found. Please run collect_emails.py first.")
            return

        print(f"Loading content from vault '{config['vault_file']}'...")
        with open(config["vault_file"], "r", encoding='utf-8') as vault_file:
            # One vault entry per non-blank line.
            vault_content = [line for line in vault_file if line.strip()]

        if not vault_content:
            print("No content found in vault file. Please run collect_emails.py first.")
            return

        vault_embeddings_tensor = load_or_generate_embeddings(vault_content, config["embeddings_file"])

        # Initialize OpenAI client for Ollama
        try:
            client = OpenAI(
                base_url=config["ollama_api"]["base_url"],
                api_key=config["ollama_api"]["api_key"]
            )
        except Exception as e:
            print(f"Error initializing Ollama client: {e}")
            print("Please ensure Ollama is running on the specified base_url")
            return

        conversation_history = []
        system_message = config["system_message"]

        print("Email RAG system initialized successfully!")
        print("You can now ask questions about your emails.")

        while True:
            try:
                user_input = input(YELLOW + "Ask a question about your emails (or type 'quit' to exit): " + RESET_COLOR)
                command = user_input.strip().lower()
                if command in ('quit', 'exit', 'q'):
                    break

                if not command:
                    print("Please enter a valid question.")
                    continue

                response = ollama_chat(
                    user_input,
                    system_message,
                    vault_embeddings_tensor,
                    vault_content,
                    config["ollama_model"],
                    conversation_history,
                    config["top_k"],
                    client
                )
                print(NEON_GREEN + "Response: \n\n" + response + RESET_COLOR)

            except (KeyboardInterrupt, EOFError):
                # EOFError means stdin was closed (Ctrl+D or end of piped
                # input); retrying input() would fail forever, so stop.
                print("\nExiting...")
                break
            except Exception as e:
                print(f"Error during conversation: {e}")
                continue

    except Exception as e:
        print(f"Fatal error: {e}")
        raise SystemExit(1)

if __name__ == "__main__":
    main()