In [None]:
pip install imapclient

: 

In [None]:
pip install sentence-transformers



In [None]:
import email
import imaplib
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [24]:

# ✅ Step 2: Load Dataset
# Read the labelled emails and keep only rows that have both the text and its label.
df = (
    pd.read_csv("/content/emails_content_classification_extended.csv")  # Replace with actual dataset file
    .dropna(subset=['Email Body', 'Category'])
)


In [28]:
# ✅ Step 3: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Email Body'], df['Category'], test_size=0.2, random_state=42
)

# ✅ Step 4: TF-IDF Feature Extraction
# The vectorizer is bound to `tfidf_vectorization` because that is the name the save step pickles.
# It is fitted on the training split only and then reused, unchanged, on the test split.
tfidf_vectorization = TfidfVectorizer(stop_words='english', max_features=2000)
X_train_tfidf = tfidf_vectorization.fit_transform(X_train)
X_test_tfidf = tfidf_vectorization.transform(X_test)

# ✅ Step 5: Train RandomForest Model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train_tfidf, y_train)


In [29]:
# ✅ Step 6: Evaluate Model
# Score the held-out split: overall accuracy first, then the per-category breakdown.
# NOTE(review): the captured output shows 1.00 for every class — check the dataset for
# duplicated or templated rows shared between the train and test splits before trusting it.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"🎯 Model Accuracy: {accuracy * 100:.2f}%")
print("\n📊 Classification Report:\n", report)

🎯 Model Accuracy: 100.00%

📊 Classification Report:
                              precision    recall  f1-score   support

        E-commerce & Orders       1.00      1.00      1.00        29
       Education & Learning       1.00      1.00      1.00        33
          Finance & Banking       1.00      1.00      1.00        40
       Healthcare & Medical       1.00      1.00      1.00        25
         Legal & Government       1.00      1.00      1.00        37
Newsletters & Subscriptions       1.00      1.00      1.00        39
                   Personal       1.00      1.00      1.00        38
                 Promotions       1.00      1.00      1.00        22
 Social Media Notifications       1.00      1.00      1.00        27
          Technical Support       1.00      1.00      1.00        27
       Travel & Hospitality       1.00      1.00      1.00        41
              Work/Business       1.00      1.00      1.00        42

                   accuracy                     

In [None]:
# ✅ Step 7: Save Model & Vectorizer
# Serialize the fitted classifier and the fitted TF-IDF vectorizer to the working directory.
with open("email_classification.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(model))

with open("tfidf_vectorization.pkl", "wb") as vectorizer_file:
    vectorizer_file.write(pickle.dumps(tfidf_vectorization))

print("✅ Model Training & Saved!")

# ----------------------------- #
# ✅ Step 8: Email Fetch & Classification
# ----------------------------- #
# Credentials are read from the environment so they are never stored in the notebook.
#   GMAIL_ADDRESS       - the Gmail account to read
#   GMAIL_APP_PASSWORD  - an app password generated from the Google Account
# SECURITY: an app password was previously committed in this cell; revoke it and create a new one.
EMAIL = os.environ.get("GMAIL_ADDRESS", "")
APP_PASSWORD = os.environ.get("GMAIL_APP_PASSWORD", "")
if not (EMAIL and APP_PASSWORD):
    print("⚠️ Set GMAIL_ADDRESS and GMAIL_APP_PASSWORD before fetching emails.")

def _decode_header_text(raw_value, default=""):
    """Return a header value as readable text, decoding RFC 2047 encoded-words; `default` if absent."""
    from email.header import decode_header, make_header

    if not raw_value:
        return default
    try:
        return str(make_header(decode_header(raw_value)))
    except (LookupError, UnicodeError, ValueError):
        # Unknown charset or malformed encoded-word: fall back to the raw header text.
        return str(raw_value)


def _extract_plain_text_body(msg):
    """Return the text of the first text/plain part of a multipart message, or the sole payload otherwise."""
    if msg.is_multipart():
        candidates = (part for part in msg.walk() if part.get_content_type() == "text/plain")
    else:
        candidates = (msg,)

    for part in candidates:
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="ignore")
        except LookupError:
            # The message declared a charset Python does not know; decode as UTF-8 instead.
            return payload.decode("utf-8", errors="ignore")
    return ""


def fetch_latest_emails(n=50):
    """Fetch the last `n` messages of the Gmail inbox over IMAP.

    Logs in with the module-level EMAIL / APP_PASSWORD, searches the inbox for "ALL"
    and downloads the last `n` message ids the server returns.

    Args:
        n: Maximum number of messages to fetch. Values <= 0 yield an empty list.

    Returns:
        A list of ``(date, subject, body)`` tuples. ``date`` is the raw Date header
        (or None), ``subject`` is the decoded Subject header or "No Subject", and
        ``body`` is the decoded text (possibly ""). On any failure the error is
        printed and an empty list is returned.
    """
    mail = None
    try:
        # `[-0:]` would select every id, so handle non-positive counts explicitly.
        if n <= 0:
            return []

        mail = imaplib.IMAP4_SSL("imap.gmail.com")
        mail.login(EMAIL, APP_PASSWORD)
        mail.select("inbox")

        status, data = mail.search(None, "ALL")
        if status != "OK" or not data or not data[0]:
            return []
        email_ids = data[0].split()[-n:]  # Take the last 'n' ids returned by the search

        emails = []
        for email_id in email_ids:
            status, msg_data = mail.fetch(email_id, "(RFC822)")
            # A successful fetch yields a (envelope, raw_bytes) tuple first; skip anything else.
            if status != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
                continue

            msg = email.message_from_bytes(msg_data[0][1])
            subject = _decode_header_text(msg["Subject"], default="No Subject")
            body = _extract_plain_text_body(msg)
            emails.append((msg["Date"], subject, body))

        return emails

    except Exception as e:
        print(f"❌ Error fetching emails: {e}")
        return []

    finally:
        # Always release the connection, including when login/search/fetch raised.
        if mail is not None:
            try:
                mail.logout()
            except (imaplib.IMAP4.error, OSError):
                # Best effort: the connection may already be closed or broken.
                pass

def classify_emails(n=50, output_path="classified_emails1.csv"):
    """Classify the latest `n` emails and save them, ordered by category priority, to a CSV.

    Loads the classifier and vectorizer via `load_model()`, fetches messages via
    `fetch_latest_emails(n)`, predicts a category for each "subject + body" text,
    sorts the rows by the priority list below and writes the columns
    Date / Subject / Category to `output_path`.

    Args:
        n: Number of most recent emails to classify.
        output_path: Destination CSV file (defaults to the original file name).

    Returns:
        None. The result is written to `output_path` and a confirmation is printed.
    """
    clf, vectorizer = load_model()
    emails = fetch_latest_emails(n)

    classified_emails = []
    if emails:  # nothing to vectorize or predict when the fetch returned no messages
        # NOTE(review): the training cell fits on 'Email Body' only, while the subject is
        # prepended here — confirm this matches how the model is meant to be used.
        texts = [f"{subject} {body}" for _, subject, body in emails]
        predictions = clf.predict(vectorizer.transform(texts))
        classified_emails = [
            (date, subject, prediction)
            for (date, subject, _), prediction in zip(emails, predictions)
        ]

    # Sort by priority (earlier in the list = more important)
    priority_order = ["Work/Business", "Education & Learning", "Finance & Banking",
                      "Technical Support", "E-commerce & Orders", "Healthcare & Medical",
                      "Travel & Hospitality", "Legal & Government", "Newsletters & Subscriptions",
                      "Social Media Notifications", "Personal", "Promotions"]
    priority_rank = {category: rank for rank, category in enumerate(priority_order)}
    lowest_priority = len(priority_order)

    # Categories missing from the list sort last instead of raising ValueError.
    classified_emails.sort(key=lambda row: priority_rank.get(row[2], lowest_priority))

    # ✅ Save results to CSV
    df_result = pd.DataFrame(classified_emails, columns=["Date", "Subject", "Category"])
    df_result.to_csv(output_path, index=False)

    print(f"✅ Classified Emails saved to '{output_path}'!")


✅ Model Training & Saved!


In [31]:
# ✅ Step 9: Load Model & Run Classification
def load_model(model_path="email_classification.pkl", vectorizer_path="tfidf_vectorization.pkl"):
    """Load the pickled classifier and TF-IDF vectorizer.

    The defaults are the file names the training cell writes, so a fresh
    "Restart & Run All" loads the artifacts that were just saved.

    Args:
        model_path: Path of the pickled classifier.
        vectorizer_path: Path of the pickled vectorizer.

    Returns:
        A ``(clf, vectorizer)`` tuple.

    Raises:
        FileNotFoundError: If either artifact file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files this notebook produced.
    with open(model_path, "rb") as f:
        clf = pickle.load(f)
    with open(vectorizer_path, "rb") as f:
        vectorizer = pickle.load(f)
    return clf, vectorizer

# Run Email Classification
# Classifies up to 50 fetched inbox emails; classify_emails writes the rows to
# 'classified_emails1.csv' and prints a confirmation (it returns nothing).
classify_emails(n=50)

✅ Classified Emails saved to 'classified_emails.csv'!
