In [1]:
# NOTE(review): the code in this notebook uses `mailbox.mbox`, which is part of
# the Python standard library. The PyPI distribution named `mailbox` is a
# separate project, so this install is presumably unnecessary — confirm and
# remove it if nothing else depends on that package.
!pip install mailbox

Collecting mailbox
  Downloading mailbox-0.4.tar.gz (4.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: mailbox
  Building wheel for mailbox (setup.py): started
  Building wheel for mailbox (setup.py): finished with status 'done'
  Created wheel for mailbox: filename=mailbox-0.4-py3-none-any.whl size=4701 sha256=f9e9cfb45e1337c0dbaa5949bbeb06cb3497b3c867f525ae5bc688dde1f0c430
  Stored in directory: c:\users\predator\appdata\local\pip\cache\wheels\15\37\f6\2cfb527173fbb22432b22a292eb666c23228db1b6ab4ae2738
Successfully built mailbox
Installing collected packages: mailbox
Successfully installed mailbox-0.4


In [9]:
# extract_features.py

import mailbox
import pandas as pd
import re
import os
from config import *
import utils
from bs4 import BeautifulSoup


class FeatureFinder:
    """Common interface for the per-message feature extractors.

    Concrete finders override both methods; the versions defined here
    only raise ``NotImplementedError``.
    """

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        raise NotImplementedError()

    def getFeature(self, message):
        """Return the feature value computed for ``message``."""
        raise NotImplementedError()


class HTMLFormFinder(FeatureFinder):
    """Flags messages whose payload contains an HTML ``form`` tag."""

    # Matches an opening or closing form tag. The optional group allows the
    # tag to carry attributes (e.g. <form action="...">), so a form is still
    # detected when its closing tag is missing or malformed. Requiring
    # whitespace before the attributes keeps tags such as <formfoo> out.
    _FORM_TAG = re.compile(r'<\s*/?\s*form(?:\s[^>]*)?>')

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "HTML Form"

    def getFeature(self, message):
        """Return True if the lower-cased payload contains a form tag.

        The payload text comes from ``utils.getpayload(message)``; it is
        lower-cased first so the match is case-insensitive.
        """
        payload = utils.getpayload(message).lower()
        return self._FORM_TAG.search(payload) is not None


class IFrameFinder(FeatureFinder):
    """Flags messages whose payload contains an HTML ``iframe`` tag."""

    # Matches an opening or closing iframe tag. The optional group allows the
    # tag to carry attributes (e.g. <iframe src="...">), so an iframe is
    # still detected when its closing tag is missing or malformed.
    _IFRAME_TAG = re.compile(r'<\s*/?\s*iframe(?:\s[^>]*)?>')

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "HTML iFrame"

    def getFeature(self, message):
        """Return True if the lower-cased payload contains an iframe tag.

        The payload text comes from ``utils.getpayload(message)``; it is
        lower-cased first so the match is case-insensitive.
        """
        payload = utils.getpayload(message).lower()
        return self._IFRAME_TAG.search(payload) is not None


class FlashFinder(FeatureFinder):
    """Flags messages whose payload references Flash (``.swf``) content."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "Flash Content"

    def getFeature(self, message):
        """Return True when the lower-cased payload links or embeds Flash.

        Two checks run on the text from ``utils.getpayload(message)``:
        matches of the ``FLASH_LINKED_CONTENT`` pattern (defined outside
        this block), and an ``embed src="....swf"`` attribute.
        """
        text = utils.getpayload(message).lower()
        linked = re.findall(FLASH_LINKED_CONTENT, text)
        embedded = re.search(r'embed\s+src\s*=\s*".*\.swf"', text)
        # Any linked match is enough; otherwise fall back to the embed check.
        if linked:
            return True
        return embedded is not None


class AttachmentFinder(FeatureFinder):
    """Feature backed by ``utils.getAttachmentCount``."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "Attachments"

    def getFeature(self, message):
        """Return the result of ``utils.getAttachmentCount(message)`` unchanged."""
        attachment_count = utils.getAttachmentCount(message)
        return attachment_count


class HTMLContentFinder(FeatureFinder):
    """Feature backed by ``utils.ishtml``."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "HTML Content"

    def getFeature(self, message):
        """Return the result of ``utils.ishtml(message)`` unchanged."""
        html_flag = utils.ishtml(message)
        return html_flag


class URLsFinder(FeatureFinder):
    """Counts the items returned by ``utils.geturls_payload``."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "URL Count"

    def getFeature(self, message):
        """Return ``len()`` of what ``utils.geturls_payload(message)`` yields."""
        found_urls = utils.geturls_payload(message)
        return len(found_urls)


class ExternalResourcesFinder(FeatureFinder):
    """Counts the items returned by ``utils.getexternalresources``."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "External Resources"

    def getFeature(self, message):
        """Return ``len()`` of what ``utils.getexternalresources(message)`` yields."""
        resources = utils.getexternalresources(message)
        return len(resources)


class JavascriptFinder(FeatureFinder):
    """Counts the items returned by ``utils.getjavascriptusage``."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "JavaScript Usage"

    def getFeature(self, message):
        """Return ``len()`` of what ``utils.getjavascriptusage(message)`` yields."""
        script_hits = utils.getjavascriptusage(message)
        return len(script_hits)


class CssFinder(FeatureFinder):
    """Counts the items returned by ``utils.getcssusage``."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "CSS Usage"

    def getFeature(self, message):
        """Return ``len()`` of what ``utils.getcssusage(message)`` yields."""
        css_hits = utils.getcssusage(message)
        return len(css_hits)


class IPsInURLs(FeatureFinder):
    """Flags messages for which ``utils.getIPHrefs`` returns at least one item."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "IP in URLs"

    def getFeature(self, message):
        """Return True when ``utils.getIPHrefs(message)`` is non-empty."""
        ip_hrefs = utils.getIPHrefs(message)
        hit_count = len(ip_hrefs)
        return hit_count > 0


class AtInURLs(FeatureFinder):
    """Flags messages containing a URL with ``@`` (or its ``%40`` encoding)."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "@ in URLs"

    def getFeature(self, message):
        """Return True if any non-mail URL contains ``@`` or ``%40``.

        URLs come from ``utils.geturls_payload(message)``. A URL is ignored
        when it starts with ``mailto:`` (case-insensitive) or when the
        ``EMAILREGEX`` pattern (defined outside this block) matches it.
        """
        email_re = re.compile(EMAILREGEX)

        def is_mail_reference(link):
            # mailto: links and e-mail-shaped strings legitimately contain "@".
            return link.lower().startswith("mailto:") or email_re.search(link)

        return any(
            "@" in link or "%40" in link
            for link in utils.geturls_payload(message)
            if not is_mail_reference(link)
        )


class EncodingFinder(FeatureFinder):
    """Reports the message's ``content-transfer-encoding`` header as text."""

    def getFeatureTitle(self):
        """Return the name under which this feature is recorded."""
        return "Content Encoding"

    def getFeature(self, message):
        """Return the header value, stringified and lower-cased.

        The value is passed through ``str()`` before lower-casing, so a
        ``None`` result from ``message.get`` becomes the string ``"none"``.
        """
        raw_encoding = message.get('content-transfer-encoding')
        as_text = str(raw_encoding)
        return as_text.lower()


def extract_features_from_mbox(filepath, limit=500, label=1):
    """Build a feature table from the messages stored in an mbox file.

    Parameters
    ----------
    filepath
        Path to an existing mbox file.
    limit : int or None, default 500
        Iteration stops once the message index reaches this value. Messages
        that are skipped (empty) or fail to parse still count towards it.
        A falsy value (``None`` or ``0``) disables the cap.
    label : default 1
        Value stored in the ``"Phishy"`` column of every row. The default
        marks all messages as phishing; pass ``0`` for a legitimate-mail mbox.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty message that was processed without error,
        with one column per feature finder plus ``"Phishy"``.

    Raises
    ------
    mailbox.NoSuchMailboxError
        If ``filepath`` does not exist.
    """
    # create=False: by default mailbox.mbox creates a missing file, so a
    # mistyped path would leave an empty mbox on disk and return an empty frame.
    mbox = mailbox.mbox(filepath, create=False)
    features = [
        HTMLFormFinder(), IFrameFinder(), FlashFinder(), AttachmentFinder(),
        HTMLContentFinder(), URLsFinder(), ExternalResourcesFinder(),
        JavascriptFinder(), CssFinder(), IPsInURLs(), AtInURLs(), EncodingFinder()
    ]

    data = []

    try:
        for i, message in enumerate(mbox):
            if limit and i >= limit:
                break
            try:
                record = {}
                payload_parts = utils.getpayload_dict(message)
                # Size of all payload parts with whitespace removed.
                totalsize = sum(len(re.sub(r'\s+', '', p["payload"])) for p in payload_parts)

                # Skip messages whose payload is empty or whitespace only.
                if totalsize < 1:
                    continue

                for finder in features:
                    record[finder.getFeatureTitle()] = finder.getFeature(message)

                record["Phishy"] = label
                data.append(record)

            except Exception as e:
                # Best effort: report the unparsable message and keep going.
                print(f"Error parsing email {i}: {e}")
                continue
    finally:
        # Release the underlying file handle even if iteration fails.
        mbox.close()

    df = pd.DataFrame(data)
    return df


In [12]:


# Location of the mbox file to analyse; adjust it to wherever your copy lives.
# A relative path like this one is resolved against the kernel's working directory.
path = "phishing0.mbox"

# Extract features from the mbox (limit=500 is passed as the message cap)
# and preview the first rows of the resulting DataFrame.
df = extract_features_from_mbox(path, limit=500)
df.head()
