In [15]:
# from pptx import Presentation
from bs4 import BeautifulSoup as bs
import markdown
import os
import re

### Read in Obsidian files

In [108]:
class ProfileBuilder():
    def __init__(self, entities_path, assets_path):
        print("init()")
        self.entities_to_parse = []
        self.raw_entities_data = {}
        self.final_entities_data = {}
        self.PERSON_ATTRIBUTES = (
            ("Detail", "details"),
            ("Note", "notes"),
            ("Associate", "associates"),
            ("Email Address", "email_addresses"),
            ("Phone Number", "phone_numbers"),
            ("Residence Name", "residence_names"),
            ("Residence Address", "residence_addresses"),
            ("Residence Coordinates", "residence_coordinates"),
            ("Residence Map", "residence_maps"),
            ("Work Name", "work_names"),
            ("Work Address", "work_addresses"),
            ("Work Coordinates", "work_coordinates"),
            ("Work Map", "work_maps"),
        )
        
        self.entities_path = entities_path
        self.assets_path = assets_path
        # self.create_profiles()

    def create_profiles(self):
        print("create_profiles()")        
        self.scan_dir_for_markdown_files()
        print("entities_to_parse", self.entities_to_parse)
        
        self.turn_markdowns_into_soups()
        print("# of valid entities", len(self.raw_entities_data))
        
        return self.parse_info_from_soups()
    
    def scan_dir_for_markdown_files(self):
        for file in os.listdir(self.entities_path):
            if file.endswith("md"):
                self.entities_to_parse.append(file)
    
    def turn_markdowns_into_soups(self):
        for i in self.entities_to_parse[0:]:
            print("i", i)
            filename = i.replace(".md", "")
            
            html = None
            with open(f"{self.entities_path}/{i}", "r", encoding="utf-8") as f:
                text = f.read()
            html = markdown.markdown(text)
            soup = bs(html)
            
            if not self.is_file_person_of_interest(filename, soup):
                continue
            
            # Maybe later combine these 2 dicts into one and delete the unneeded values before output
            self.raw_entities_data[filename] = {"soup": soup}
            # self.final_entities_data[filename] = {}

    def is_file_person_of_interest(self, filename, soup):
        all_p_tags = soup.find_all("p")        
        if any("person_of_interest" in tag.get_text().lower() for tag in all_p_tags):
            return True
        else:
            return False
    
    def parse_info_from_soups(self):
        for name, data in self.raw_entities_data.items():
            self.parse_person_info(name, data)
        
        return [{name: data["parsed"]} for name, data 
                in self.raw_entities_data.items()]
    
    def parse_person_info(self, name, data):
        data["parsed"] = {}
        
        # Get all paragraph and list item tags
        all_p_and_li_tags = []
        all_p_and_li_tags += data["soup"].find_all("p")
        all_p_and_li_tags += data["soup"].find_all("li")
        # print(all_p_and_li_tags)
        
        # Find pic filename and store pic path
        pic_path = self.get_person_pic_path(all_p_and_li_tags)
        data["parsed"].update({"entity_pic_path": pic_path})
        
        # Parse all the other wanted information from the file
        wanted_info = self.find_tags_matching_attributes(all_p_and_li_tags)
        data["parsed"].update({"entity_info": wanted_info})

    def get_person_pic_path(self, tags):
        person_pic = None

        # Find correct p tag to parse
        try:
            person_pic = list(filter(lambda x: "picture" in x.get_text().lower(), tags))[0]
        except IndexError:
            print("There is no p tag or li tag with a picture in it")
            pass

        if not person_pic:
            return

        # Parse filename from p tag
        bracketed_filename_pattern = "([0-9A-Z]{1,}[\_\s]{0,}[0-9A-Z]{1,}\.{1}[0-9A-Z]{1,3})"
        try:
            person_pic = re.search(bracketed_filename_pattern, str(person_pic), re.IGNORECASE)[0]
        except TypeError:
            print("There is no pic filename")
            person_pic = None
            pass
        
        if not person_pic:
            return
        
        return f"{self.assets_path}/{person_pic.strip()}"
    
    def find_tags_matching_attributes(self, tags):
        info = {}
        
        for attr in self.PERSON_ATTRIBUTES:
#             print("attr: ", attr)
            info[attr[1]] = []
            
            matching_tags = list(filter(lambda x: attr[0] in x.get_text(), tags))
#             print("matching tags: ", matching_tags)
            
            cleaned_texts = self.extract_info_from_matching_tags(matching_tags)
            info[attr[1]] += cleaned_texts
        
        return info
    
    def extract_info_from_matching_tags(self, tags):
#         print(f"There are {len(tags)} matching tags here")
        tags = self.remove_code_tag_substring(tags)
        tags = self.get_text_inside_tags(tags)
        tags = self.extract_bracketed_link_texts(tags)
        return tags
        
    def remove_code_tag_substring(self, tags):
        cleaned_tags = []
        for tag in tags:
            cleaned_tag = None
            try:
                cleaned_tag = re.sub("<code>.*</code>", "", str(tag))
            except:
                # print("substring doesn't have a `Code` tag in it")
                pass
            if cleaned_tag:
                cleaned_tags.append(cleaned_tag)
        return cleaned_tags

    def get_text_inside_tags(self, tags):
        inside_texts = []
        for tag in tags:            
            inside_text = None
            # Check for text inside p tags
            try:
                inside_text = re.match("(<p>)(.*)(</p>)", tag)[2]
            except TypeError:
                # print("No text inside p tags")
                pass
            # Check for text inside li tags
            try:
                inside_text = re.match("(<li>)(.*)(</li>)", tag)[2]
            except TypeError:
                # print("No text inside li tags")
                pass
            if inside_text:
                inside_texts.append(inside_text.strip())
        return inside_texts

    def extract_bracketed_link_texts(self, texts):
        cleaned_texts = []
        for text in texts:
            cleaned_text = None
            try:
                match_obj = re.match("(.*)(\[\[)(.*)(\]\])(.*)", text)        
        #         print("0", match_obj[0]) # match object
        #         print("1", match_obj[1]) # match group 1, etc...
        #         print("2", match_obj[2])
        #         print("3", match_obj[3])
        #         print("4", match_obj[4])
        #         print("5", match_obj[5])
                text_before = match_obj[1]
                text_inside = match_obj[3]
                text_after = match_obj[5]
                rebuilt = f"{text_before.strip()} {text_inside.strip()} {text_after.strip()}"
                cleaned_text = rebuilt.strip()
            except TypeError:
                # print("There was no linked text in double brackets")
                cleaned_text = text.strip()
            if cleaned_text:
                cleaned_texts.append(cleaned_text)
        return cleaned_texts

In [109]:
entities_path = "Investigation/People"
assets_path = "Investigation/Assets"

builder = ProfileBuilder(entities_path, assets_path)
out = builder.create_profiles()
out

# builder.final_entities_data

init()
create_profiles()
entities_to_parse ['Arron Gaines.md', 'Elsie Graves.md', 'Jan Jackson.md', 'Kevin Ellison.md', 'Wilfredo Goodman.md']
i Arron Gaines.md
i Elsie Graves.md
i Jan Jackson.md
i Kevin Ellison.md
i Wilfredo Goodman.md
# of valid entities 2


[{'Arron Gaines': {'entity_pic_path': 'Investigation/Assets/Arron_Gaines.jpg',
   'entity_info': {'details': ['Subject likely lives and works in New York City, New York based on social media including pictures and check-ins. Follower/following relationships on social media platforms indicate a connection with Austin, Texas-based Kevin Ellison . Research on social engineering forums revealed usernames that were traced back to email addresses with a common web domain which has been separately flagged for malicious activity. Additional research on the emails suggests Gaines and Ellison are the probable end users.'],
    'notes': ['No previous criminal convictions',
     'Known phishing victims include: [[Elsie Graves]], [[Wilfredo Goodman]], and Jan Jackson'],
    'associates': ['Kevin Ellison'],
    'email_addresses': ['<em>probably uses</em> mglanman@mycbt(.)me'],
    'phone_numbers': ['+1472-217-7096', '+1(202)-918-2132'],
    'residence_names': ['350 Central Park West Apartments'],
  

- location maps should use similar logic to person pic path to get just the file name (regex)
- try re.sub again all tags to get rid of things like <em></em>
- need to unpack names from double brackets

In [22]:
# # Determine if file is for Person of Interest
# to_find = "Person_Of_Interest"

# is_person_file = any([to_find in tag.get_text() for tag in all_p_tags])
# is_person_file

True

In [87]:
# Same structure as raw_entities_data
test = {
    "John Doe": {
        "soup": "aaaaaaa",
        "parsed": {
            "pic_path": "123file.jpg"
        }
    }
}

type(test)
test

{'John Doe': {'soup': 'aaaaaaa', 'parsed': {'pic_path': '123file.jpg'}}}

In [92]:
# Getting a list of dicts with just the parsed info at the end
[{k:v["parsed"]} for k,v in test.items()]

[{'John Doe': {'pic_path': '123file.jpg'}}]