# In [None]:  (Jupyter cell marker left over from the notebook export)
# =============================================================
# PDF Static Feature Extraction Script
# =============================================================
# Description:
#   This script extracts 40 static structural and content-based
#   features from PDF files for malware detection research.
# =============================================================

import os, re, math, subprocess, time
import fitz  # PyMuPDF
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
import pandas as pd

# -------------------------------------------------------------
# ========== USER PATHS (EDIT AS NEEDED) ==========
# -------------------------------------------------------------
# Directory that is listed (non-recursively) by the execution loop; only
# entries whose name ends in ".pdf" (case-insensitive) are processed.
input_folder = "Put your input path here"
# CSV file that receives one appended row per processed PDF. If it already
# exists, its "file_path" column is read to skip files from a previous run.
output_csv = "Put your output path here"
# Text file that log_failure() appends to, one "<path> -- <message>" line
# per failure.
error_log = "A pathway for tracking the corrupted files"

# -------------------------------------------------------------
# ========== TEXT EXTRACTION UTILITIES ==========
# -------------------------------------------------------------
def extract_text_pdfminer(filepath):
    """Return the text pdfminer extracts from the PDF at ``filepath``.

    This is a best-effort helper: any exception raised during extraction is
    swallowed and an empty string is returned instead.
    """
    try:
        text = extract_text(filepath)
    except Exception:
        # Deliberately broad: callers treat "" as "no text available".
        return ""
    else:
        return text

def extract_text_pdftotext(filepath):
    """Return the text printed by the external ``pdftotext`` utility.

    The tool is invoked as ``pdftotext <filepath> -`` so the text is written
    to stdout; stderr is discarded. The output is decoded as UTF-8 with
    undecodable bytes dropped. Any failure (tool missing, non-zero exit
    status, ...) yields an empty string.
    """
    command = ["pdftotext", filepath, "-"]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            check=True,  # non-zero exit raises CalledProcessError
        )
        return completed.stdout.decode("utf-8", errors="ignore")
    except Exception:
        # Best-effort helper: callers treat "" as "no text available".
        return ""

def extract_text_ocr(filepath, max_pages=1):
    """OCR fallback for image-based PDFs.

    Pages 1 through ``max_pages`` (only the first page by default) are
    rendered at 200 dpi and passed to Tesseract; the per-page results are
    joined with newlines. Any failure yields an empty string.
    """
    try:
        rendered_pages = convert_from_path(
            filepath, dpi=200, first_page=1, last_page=max_pages
        )
        page_texts = [pytesseract.image_to_string(image) for image in rendered_pages]
        return "\n".join(page_texts)
    except Exception:
        # Best-effort helper: callers treat "" as "no text available".
        return ""

# -------------------------------------------------------------
# ========== FEATURE CALCULATION UTILITIES ==========
# -------------------------------------------------------------
def calculate_entropy(text):
    """Return the Shannon entropy (in bits) of ``text``.

    Characters are bucketed by ``ord(char) % 256``, so code points that are
    equal modulo 256 share a bucket. Empty input returns 0.
    """
    if not text:
        return 0

    total = len(text)
    buckets = {}
    for symbol in text:
        slot = ord(symbol) % 256
        buckets[slot] = buckets.get(slot, 0) + 1

    # Walk the buckets in ascending slot order so the terms are summed in the
    # same order as a 0..255 table scan would produce.
    shares = [buckets[slot] / total for slot in sorted(buckets)]
    return -sum(share * math.log2(share) for share in shares)

def count_name_obfuscations(text):
    """Count PDF names that start with an escaped/encoded character sequence.

    Four escape styles are searched for, each anchored at a ``/`` followed by
    optional ASCII letters:

    * ``#XX``   - PDF name hex escape (two hexadecimal digits)
    * ``%XX``   - URL-style percent escape
    * ``\\xXX``  - backslash-x hex escape
    * ``\\NNN``  - backslash octal escape (1-3 octal digits)

    Args:
        text: Raw PDF content as a string.

    Returns:
        Total number of non-overlapping matches summed over the four
        patterns. A name is counted once per pattern even if it contains
        several escapes, because only the first escape after the ``/`` is
        anchored.
    """
    patterns = [
        # PDF name escapes are '#' + two HEX digits; matching only decimal
        # digits would miss escapes such as '#4A' or '#6a'.
        r'/[a-zA-Z]*#[0-9a-fA-F]{2}',
        r'/[a-zA-Z]*%[0-9a-fA-F]{2}',
        r'/[a-zA-Z]*\\x[0-9a-fA-F]{2}',
        r'/[a-zA-Z]*\\[0-7]{1,3}',
    ]
    return sum(len(re.findall(p, text)) for p in patterns)

def log_failure(filepath, message):
    """Append a ``<filepath> -- <message>`` line to the error log.

    The log location is taken from the module-level ``error_log`` path.

    Args:
        filepath: Path of the file that failed.
        message: Human-readable description of the failure.
    """
    # Pin the encoding: with the platform default (e.g. cp1252) a non-ASCII
    # file name or exception text could raise UnicodeEncodeError here, i.e.
    # inside the caller's error handler.
    with open(error_log, 'a', encoding='utf-8') as logf:
        logf.write(f"{filepath} -- {message}\n")

# -------------------------------------------------------------
# ========== MAIN FEATURE EXTRACTION FUNCTION ==========
# -------------------------------------------------------------
def _scan_with_pypdf2(filepath, features):
    """Fill encryption, page-count and metadata features using PyPDF2."""
    with open(filepath, 'rb') as f:
        reader = PyPDF2.PdfReader(f, strict=False)
        features["encrypted"] = int(reader.is_encrypted)
        features["page_count"] = len(reader.pages)

        meta = reader.metadata
        if meta:
            features["metadata_size"] = len(str(meta))
            # When metadata exists but has no /Title, the file name is used
            # in its place, so title_chars is then the file-name length.
            title = meta.get('/Title') or os.path.basename(filepath)
            features["title_chars"] = len(str(title))


def _scan_raw_bytes(filepath, features):
    """Fill header, stream, keyword and behavioural features from raw bytes."""
    with open(filepath, 'rb') as f:
        # latin-1 maps every byte to exactly one character, so string
        # lengths equal byte lengths and no byte is silently dropped (a
        # lossy UTF-8 decode would skew stream sizes and entropy).
        raw = f.read().decode('latin-1')

    features["valid_pdf_header"] = int(raw.startswith('%PDF'))

    # Stream analysis. 'endstream' contains 'stream', so the opening keyword
    # is counted with a look-behind to avoid counting every stream twice.
    features["endstream_count"] = raw.count('endstream')
    features["stream_count"] = len(re.findall(r'(?<!end)stream', raw))
    bodies = [
        m.group(1)
        for m in re.finditer(r'(?<!end)stream(.*?)endstream', raw, re.DOTALL)
        if m.group(1)
    ]
    if bodies:
        features["average_stream_size"] = sum(len(b) for b in bodies) / len(bodies)
        features["entropy_of_streams"] = (
            sum(calculate_entropy(b) for b in bodies) / len(bodies)
        )
    features["name_obfuscations"] = count_name_obfuscations(raw)

    # Keyword counts by plain substring search. 'trailer' and 'startxref'
    # are bare file-structure keywords (no leading slash), and the filter
    # name is spelled '/JBIG2Decode'.
    keyword_map = {
        'objstm_count': '/ObjStm', 'js_count': '/JS',
        'javascript_count': '/JavaScript', 'uri_count': '/URI',
        'action_count': '/Action', 'aa_count': '/AA',
        'openaction_count': '/OpenAction', 'launch_count': '/Launch',
        'submitform_count': '/SubmitForm', 'acroform_count': '/AcroForm',
        'xfa_count': '/XFA', 'jbig2decode_count': '/JBIG2Decode',
        'colors_count': '/Colors', 'richmedia_count': '/RichMedia',
        'trailer_count': 'trailer', 'startxref_count': 'startxref',
        'total_filters': '/Filter',
    }
    for name, token in keyword_map.items():
        features[name] = raw.count(token)

    # The 'xref' keyword must not be counted inside 'startxref'; the word
    # boundary excludes it. re.ASCII keeps \b and \s byte-oriented.
    features["xref_count"] = len(re.findall(r'\bxref\b', raw, re.ASCII))
    # Filter arrays: whitespace between '/Filter' and '[' is optional.
    features["nested_filter_objects"] = len(
        re.findall(r'/Filter\s*\[', raw, re.ASCII)
    )

    # URL with an explicit 4-5 digit port.
    if re.search(r'http[s]?://[^:\s]+:\d{4,5}', raw):
        features["uses_nonstandard_port"] = 1

    # Behavioural overlap: objects that mention two or more distinct
    # behaviour keywords.
    behaviors = ['/JS', '/Launch', '/URI', '/OpenAction', '/SubmitForm',
                 '/JavaScript', '/AA']
    for block in re.findall(r'obj(.*?)endobj', raw, re.DOTALL):
        if sum(1 for b in behaviors if b in block) >= 2:
            features["has_multiple_behavioral_keywords_in_one_object"] += 1


def _scan_with_pymupdf(filepath, features):
    """Fill image, font, object and text-length features using PyMuPDF."""
    doc = fitz.open(filepath)
    try:
        font_names = set()
        for page in doc:
            features["image_count"] += len(page.get_images(full=True))
            font_names.update(font[3] for font in page.get_fonts() if font[3])
        features["font_object_count"] = len(font_names)

        xref_len = doc.xref_length()
        features["object_count"] = xref_len
        # Valid xref numbers run from 1 to xref_length() - 1; entry 0 is
        # reserved and asking for it is rejected as a bad xref.
        features["xref_entries"] = sum(
            1 for xref in range(1, xref_len)
            if doc.xref_object(xref, compressed=False)
        )
        features["text_length"] = sum(len(page.get_text()) for page in doc)
    finally:
        # Always release the document handle, even when a page fails.
        doc.close()


def extract_pdf_features(filepath):
    """
    Extract 40 static PDF features used for malware analysis.

    Three independent passes populate the result: PyPDF2 (encryption, page
    count, metadata), a raw-byte scan (header, streams, keywords) and
    PyMuPDF (images, fonts, objects, text length). A failure in one of the
    first two passes is logged via log_failure() and printed, and the
    remaining passes still run. If the PyMuPDF pass fails, text length is
    taken from pdfminer, then pdftotext, then OCR (setting ``used_ocr``).

    Args:
        filepath: Path to the PDF file.

    Returns:
        A dictionary (one row per PDF) with ``file_path`` plus the 40
        feature keys. Features that could not be computed keep their
        default of 0. If the file size cannot be read, the defaults are
        returned immediately.

    NOTE: ``embedded_file_count`` and ``average_embedded_file_size`` are
    never assigned by this function and therefore always remain 0.
    """
    features = {
        "file_path": filepath,
        "file_size": 0, "title_chars": 0, "encrypted": 0, "metadata_size": 0,
        "page_count": 0, "valid_pdf_header": 0, "image_count": 0,
        "text_length": 0, "object_count": 0, "font_object_count": 0,
        "embedded_file_count": 0, "average_embedded_file_size": 0,
        "stream_count": 0, "endstream_count": 0, "average_stream_size": 0,
        "entropy_of_streams": 0, "xref_count": 0, "xref_entries": 0,
        "name_obfuscations": 0, "total_filters": 0, "nested_filter_objects": 0,
        "objstm_count": 0, "js_count": 0, "javascript_count": 0,
        "uri_count": 0, "uses_nonstandard_port": 0, "action_count": 0,
        "aa_count": 0, "openaction_count": 0, "launch_count": 0,
        "submitform_count": 0, "acroform_count": 0, "xfa_count": 0,
        "jbig2decode_count": 0, "colors_count": 0, "richmedia_count": 0,
        "trailer_count": 0, "startxref_count": 0,
        "has_multiple_behavioral_keywords_in_one_object": 0,
        "used_ocr": 0
    }

    try:
        features["file_size"] = os.path.getsize(filepath)
    except Exception as e:
        log_failure(filepath, f"Size check failed: {e}")
        return features

    # Run the parser-based and raw-byte passes independently so that a file
    # PyPDF2 cannot parse (e.g. encrypted or malformed) still gets its
    # raw-byte features.
    passes = (
        ("PyPDF2", _scan_with_pypdf2),
        ("raw scan", _scan_raw_bytes),
    )
    for label, scan in passes:
        try:
            scan(filepath, features)
        except Exception as e:
            log_failure(filepath, f"Extraction failed ({label}): {e}")
            print(f"❌ Failed: {filepath} — {e}")

    # Structure and text via PyMuPDF, with text-only fallbacks on failure.
    try:
        _scan_with_pymupdf(filepath, features)
    except Exception:
        text = extract_text_pdfminer(filepath) or extract_text_pdftotext(filepath)
        if not text.strip():
            text = extract_text_ocr(filepath)
            features["used_ocr"] = 1
        features["text_length"] = len(text)

    return features

# -------------------------------------------------------------
# ========== EXECUTION LOOP ==========
# -------------------------------------------------------------
# Resume support: collect the paths already present in the output CSV so a
# re-run skips them instead of appending duplicate rows.
processed = set()

# An existing but empty CSV (e.g. a run interrupted before its first row)
# has no header yet, so it must be treated like a fresh file.
output_has_rows = os.path.exists(output_csv) and os.path.getsize(output_csv) > 0

if output_has_rows:
    try:
        existing_df = pd.read_csv(output_csv, usecols=["file_path"])
        processed = set(existing_df["file_path"].tolist())
        print(f"🔁 Resuming previous run. {len(processed)} files already processed.")
    except Exception as e:
        print(f"⚠️ Could not read existing CSV: {e}")

# The header row is written only with the very first row of the file.
first_write = not output_has_rows

with open(output_csv, 'a', encoding='utf-8', newline='') as f:
    # sorted() gives a deterministic processing order across runs/platforms.
    for fname in sorted(os.listdir(input_folder)):
        if not fname.lower().endswith('.pdf'):
            continue

        path = os.path.join(input_folder, fname)
        if path in processed:
            continue

        print(f"📄 Processing: {fname}")
        start = time.time()
        feats = extract_pdf_features(path)

        pd.DataFrame([feats]).to_csv(f, index=False, header=first_write)
        # Flush per row so finished work survives an interrupted run and the
        # resume logic above can pick it up.
        f.flush()
        first_write = False
        print(f"✅ Done in {round(time.time() - start, 2)} sec")

print("🎉 All files processed successfully.")
