#### Dependencies and libraries for the exercise

In [11]:
import os, zipfile, json, hashlib
import pprint

from PIL import Image
import pytesseract
from oletools.olevba import VBA_Parser
from docx import Document
import pandas as pd


#### Functions: Hash File

In [12]:
def sha256_hash(file_path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of the file at ``file_path``.

    The file is read in fixed-size chunks so that large samples can be hashed
    without loading the whole file into memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the empty bytes object read() returns at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

#### Detect VBA macros in the Office documents

In [3]:
def detect_macros(file_path):
    """Check an Office document for VBA macros using olevba's ``VBA_Parser``.

    Args:
        file_path: Path of the document to inspect.

    Returns:
        A dict with:
          * ``"has_macros"`` - True when the parser reports VBA macros.
          * ``"macro_snippets"`` - the first 300 characters of each extracted
            macro (an empty string when a macro has no code).
          * ``"error"`` - only present when parsing or closing failed; holds
            the exception message.

    This function never raises: every failure is reported via ``"error"``.
    """
    result = {"has_macros": False, "macro_snippets": []}
    vbaparser = None
    try:
        vbaparser = VBA_Parser(file_path)
        if vbaparser.detect_vba_macros():
            result["has_macros"] = True
            for (_, _, _, vba_code) in vbaparser.extract_macros():
                snippet = vba_code[:300] if vba_code else ""
                result["macro_snippets"].append(snippet)
    except Exception as e:
        result["error"] = str(e)
    finally:
        # Release the parser even when detection/extraction failed part-way;
        # previously close() was skipped on any exception.
        if vbaparser is not None:
            try:
                vbaparser.close()
            except Exception as e:
                # Keep the first (more relevant) error if one was already recorded.
                result.setdefault("error", str(e))
    return result

#### Extract the images

In [4]:
def extract_images(file_path, image_dir="extracted_images"):
    """Extract the embedded media files of an OOXML Word document.

    The document is opened as a ZIP archive and every member stored under
    ``word/media/`` is written to ``image_dir`` using only its base name.
    Files that already exist in ``image_dir`` under the same name are
    overwritten.

    Args:
        file_path: Path of the document to read.
        image_dir: Directory the media files are written to; created when
            missing. Defaults to ``"extracted_images"``.

    Returns:
        List of the paths written. Files extracted before a failure are
        still returned; the failure itself (for example, the input is not a
        ZIP archive) is printed rather than raised.
    """
    os.makedirs(image_dir, exist_ok=True)
    extracted = []

    try:
        with zipfile.ZipFile(file_path, 'r') as docx:
            for item in docx.namelist():
                if not item.startswith("word/media/"):
                    continue
                image_name = os.path.basename(item)
                if not image_name:
                    # Directory entry such as "word/media/": nothing to write.
                    # Opening it as a file would raise and abort the whole loop.
                    continue
                out_path = os.path.join(image_dir, image_name)
                with open(out_path, "wb") as f:
                    f.write(docx.read(item))
                extracted.append(out_path)
    except Exception as e:
        print(f"Error extracting images: {e}")
    return extracted

#### Use OCR to extract text from the images

In [5]:
def ocr_images(image_paths):
    """Run Tesseract OCR over each image and collect the recognized text.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        Dict mapping each image's base name to its stripped OCR text, or to
        an ``"OCR Error: ..."`` message when the image could not be opened
        or processed. Paths sharing a base name overwrite one another.
    """
    results = {}
    for img_path in image_paths:
        key = os.path.basename(img_path)
        try:
            # The context manager closes the image's file handle once OCR is
            # done; a bare Image.open() left one handle open per image.
            with Image.open(img_path) as img:
                text = pytesseract.image_to_string(img)
            results[key] = text.strip()
        except Exception as e:
            results[key] = f"OCR Error: {e}"
    return results

#### Analyze the document

In [7]:
def analyze_office_file(file_path):
    """Run the full analysis pipeline on one Office document.

    Hashes the file, checks it for VBA macros, extracts its embedded images
    and runs OCR over them, printing a progress line first.

    Args:
        file_path: Path of the document to analyze.

    Returns:
        Dict with the keys ``file_name``, ``sha256``, ``has_macros``,
        ``macro_snippets``, ``macro_error``, ``images_found`` and
        ``ocr_results``. ``macro_error`` is ``None`` unless macro detection
        reported a failure, in which case ``has_macros`` being False means
        "could not be determined", not "clean".
    """
    print(f"[*] Analyzing: {file_path}")

    sha256 = sha256_hash(file_path)
    macros = detect_macros(file_path)
    images = extract_images(file_path)
    ocr_results = ocr_images(images)

    report = {
        "file_name": os.path.basename(file_path),
        "sha256": sha256,
        "has_macros": macros.get("has_macros", False),
        "macro_snippets": macros.get("macro_snippets", []),
        # Previously dropped: without it a parser failure was indistinguishable
        # from a document that genuinely has no macros.
        "macro_error": macros.get("error"),
        "images_found": len(images),
        "ocr_results": ocr_results
    }

    return report


#### Analyze all files

In [8]:
folder_path = "msoffice"
reports = []

# os.listdir() returns entries in arbitrary order; sorting keeps the report
# (and every table or printout derived from it) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    # Only Word documents are analyzed; the match is case-insensitive.
    if filename.lower().endswith((".doc", ".docx", ".docm")):
        file_path = os.path.join(folder_path, filename)
        report = analyze_office_file(file_path)
        reports.append(report)

# Save full report as JSON
with open("msoffice_analysis_report.json", "w") as f:
    json.dump(reports, f, indent=4)



[*] Analyzing: msoffice/1bdbb2a88fc9b48a4d29ae76aafadf16.docx
[*] Analyzing: msoffice/240c5875a9ba744f6c61ff42a4d7d999.docx
[*] Analyzing: msoffice/46b138cf8645b457b2a8c4ebc79e06f1.docx


#### Results

In [9]:
# Pretty-print the complete per-file reports for inspection.
pprint.pprint(reports)


[{'file_name': '1bdbb2a88fc9b48a4d29ae76aafadf16.docx',
  'has_macros': True,
  'images_found': 2,
  'macro_snippets': ['Attribute VB_Name = "ThisDocument"\r\n'
                     'Attribute VB_Base = "1Normal.ThisDocument"\r\n'
                     'Attribute VB_GlobalNameSpace = False\r\n'
                     'Attribute VB_Creatable = False\r\n'
                     'Attribute VB_PredeclaredId = True\r\n'
                     'Attribute VB_Exposed = True\r\n'
                     'Attribute VB_TemplateDerived = True\r\n'
                     'Attribute VB_Customizable = True\r\n',
                     'Attribute VB_Name = "encountered"\r\n'
                     'Private Const STARTF_USESHOWWINDOW& = &H1\r\n'
                     'Private Const NORMAL_PRIORITY_CLASS = &H20&\r\n'
                     'Private Const INFINITE = -1&\r\n'
                     '\r\n'
                     'Private Type STARTUPINFO\r\n'
                     '    cb As Long\r\n'
                     '    lp

#### Display the results as a summary table

In [10]:
# Maximum number of OCR characters shown per image in the summary table.
OCR_PREVIEW_CHARS = 50


def _preview(text, limit=OCR_PREVIEW_CHARS):
    """Return ``text`` cut to ``limit`` characters, adding "..." only when it was cut."""
    return f"{text[:limit]}..." if len(text) > limit else text


# One row per analyzed document. The OCR column joins a short preview of each
# image's text, one "name: preview" entry per line.
summary_data = [
    {
        "File Name": report["file_name"],
        "SHA256": report["sha256"],
        "Has Macros": report["has_macros"],
        "Number of Images": report["images_found"],
        "OCR Extracted Text": "\n".join(
            f"{name}: {_preview(text)}"
            for name, text in report["ocr_results"].items()
        ),
    }
    for report in reports
]

df_summary = pd.DataFrame(summary_data)

df_summary


Unnamed: 0,File Name,SHA256,Has Macros,Number of Images,OCR Extracted Text
0,1bdbb2a88fc9b48a4d29ae76aafadf16.docx,c30977d86a5f4e84a84435a29580c7bf39f1f5818fcb1d...,True,2,image2.png: This document was created with an ...
1,240c5875a9ba744f6c61ff42a4d7d999.docx,d57acf8eb7c1d9b23299042ccac8a7f16cb18599423282...,True,4,image3.wmf: OCR Error: Unsupported image forma...
2,46b138cf8645b457b2a8c4ebc79e06f1.docx,745b0e0793fc507d9e1ad7155beb7ac48f8a556e6ef06e...,True,1,image1.jpeg: 1) Office 365\nThis document crea...
