In [1]:
# from pptx import Presentation
from bs4 import BeautifulSoup as bs
import markdown
import os
import re

### Read in Obsidian files

In [146]:
class ProfileBuilder():
    def __init__(self):
        print("init()")
        
        self.path = None
        self.entities_to_parse = []
        self.raw_entity_data = {}
        self.final_entity_data = {}
        
        self.PERSON_ATTRIBUTES = (
            ("Detail", "details"),
            ("Note", "notes"),
            ("Associate", "associates"),
            ("Email Address", "email_addresses"),
            ("Phone Number", "phone_numbers"),
            ("Residence Name", "residence_names"),
            ("Residence Address", "residence_addresses"),
            ("Residence Coordinates", "residence_coordinates"),
            ("Residence Map", "residence_maps"),
            ("Work Name", "work_names"),
            ("Work Address", "work_addresses"),
            ("Work Coordinates", "work_coordinates"),
            ("Work Map", "work_maps"),
        )
        
    def create_profiles(self, path: str, entity_type: str):
        print("create_profiles()")
        self.path = path
        
        self.scan_dir_for_md_files()
        
        self.read_md_files_into_soup(entity_type)
        print(len(self.raw_entity_data))
        
        self.parse_entity_info_from_soup()
    
    def scan_dir_for_md_files(self):
#         entities_to_parse = []
        for file in os.listdir(self.path):
            if file.endswith("md"):
                self.entities_to_parse.append(file)
#         return entities_to_parse
    
    def read_md_files_into_soup(self, entity_type):
        for i in self.entities_to_parse[0:1]:
            name = i.replace(".md", "")
            self.raw_entity_data[name] = {"entity_type": entity_type}
            self.final_entity_data[name] = {"entity_type": entity_type}

            html = None
            with open(f"{self.path}/{i}", "r", encoding="utf-8") as f:
                text = f.read()

            html = markdown.markdown(text)
            soup = bs(html)
            self.raw_entity_data[name]["soup"] = soup
        
    def parse_entity_info_from_soup(self):
        for name, data in self.raw_entity_data.items():
            
            # Change to switch statement !
            if data["entity_type"] == "person":
                print("this is a person profile")
                self.parse_person_info(name, data["soup"])
            elif data["entity_type"] == "place":
                pass
            else:
                pass
    
    def parse_person_info(self, name, soup):
        # Get all paragraph and list item tags
        all_p_and_li_tags = []
        all_p_and_li_tags += soup.find_all("p")
        all_p_and_li_tags += soup.find_all("li")
#         print(all_p_and_li_tags)
        
        # Find pic filename and store pic path
        pic_path = self.get_person_pic_path(all_p_and_li_tags)
        self.final_entity_data[name]["person_pic_path"] = pic_path        

        self.final_entity_data[name].update(self.find_tags_matching_attributes(all_p_and_li_tags))
#         print(self.final_entity_data)


    def get_person_pic_path(self, tags):
        person_pic = None

        # Find correct p tag to parse
        try:
            person_pic = list(filter(lambda x: "Picture" in x.get_text(), tags))[0]
        except IndexError:
            print("There is no p or li tag with a picture in it")
            pass

        if not person_pic:
            return

        # Parse filename from p tag
        bracketed_filename_pattern = "([0-9A-Z]{1,}[\_\s]{0,}[0-9A-Z]{1,}\.{1}[0-9A-Z]{1,3})"
        try:
            person_pic = re.search(bracketed_filename_pattern, str(person_pic), re.IGNORECASE)[0]
        except TypeError:
            print("There is no pic filename")
            person_pic = None
            pass
        
        if not person_pic:
            return
        
        return f"Assets/{person_pic.strip()}"
    
    def find_tags_matching_attributes(self, tags):
        info = {}
        
        for attr in self.PERSON_ATTRIBUTES:
#             print("attr: ", attr)
            info[attr[1]] = []
            
            matching_tags = list(filter(lambda x: attr[0] in x.get_text(), tags))
#             print("matching tags: ", matching_tags)
            
            cleaned_texts = self.extract_info_from_matching_tags(matching_tags)
            info[attr[1]] += cleaned_texts
        
        return info
    
    def extract_info_from_matching_tags(self, tags):
#         print(f"There are {len(tags)} matching tags here")
        tags = self.remove_code_tag_substring(tags)
        tags = self.get_text_inside_tags(tags)
        tags = self.extract_bracketed_link_texts(tags)
        return tags
        
    def remove_code_tag_substring(self, tags):
        cleaned_tags = []
        for tag in tags:
            cleaned_tag = None
            try:
                cleaned_tag = re.sub("<code>.*</code>", "", str(tag))
            except:
                # print("substring doesn't have a `Code` tag in it")
                pass
            if cleaned_tag:
                cleaned_tags.append(cleaned_tag)
        return cleaned_tags

    def get_text_inside_tags(self, tags):
        inside_texts = []
        for tag in tags:            
            inside_text = None
            # Check for text inside p tags
            try:
                inside_text = re.match("(<p>)(.*)(</p>)", tag)[2]
            except TypeError:
                # print("No text inside p tags")
                pass
            # Check for text inside li tags
            try:
                inside_text = re.match("(<li>)(.*)(</li>)", tag)[2]
            except TypeError:
                # print("No text inside li tags")
                pass
            if inside_text:
                inside_texts.append(inside_text.strip())
        return inside_texts

    def extract_bracketed_link_texts(self, texts):
        cleaned_texts = []
        for text in texts:
            cleaned_text = None
            try:
                match_obj = re.match("(.*)(\[\[)(.*)(\]\])(.*)", text)        
        #         print("0", match_obj[0]) # match object
        #         print("1", match_obj[1]) # match group 1, etc...
        #         print("2", match_obj[2])
        #         print("3", match_obj[3])
        #         print("4", match_obj[4])
        #         print("5", match_obj[5])
                text_before = match_obj[1]
                text_inside = match_obj[3]
                text_after = match_obj[5]
                rebuilt = f"{text_before.strip()} {text_inside.strip()} {text_after.strip()}"
                cleaned_text = rebuilt.strip()
            except TypeError:
                # print("There was no linked text in double brackets")
                cleaned_text = text.strip()
            if cleaned_text:
                cleaned_texts.append(cleaned_text)
        return cleaned_texts

In [147]:
target_dir = "Investigation/People"

builder = ProfileBuilder()
builder.create_profiles(target_dir, "person")

builder.final_entity_data

init()
create_profiles()
1
this is a person profile


{'Arron Gaines': {'entity_type': 'person',
  'person_pic_path': 'Assets/Arron_Gaines.jpg',
  'details': ['Subject likely lives and works in New York City, New York based on social media including pictures and check-ins. Follower/following relationships on social media platforms indicate a connection with Austin, Texas-based Kevin Ellison . Research on social engineering forums revealed usernames that were traced back to email addresses with a common web domain which has been separately flagged for malicious activity. Additional research on the emails suggests Gaines and Ellison are the probable end users.'],
  'notes': ['No previous criminal convictions',
   'Known phishing victims include: [[Elsie Graves]], [[Wilfredo Goodman]], and Jan Jackson'],
  'associates': ['Kevin Ellison'],
  'email_addresses': ['<em>probably uses</em> mglanman@mycbt(.)me'],
  'phone_numbers': ['+1472-217-7096', '+1(202)-918-2132'],
  'residence_names': ['350 Central Park West Apartments'],
  'residence_addres

- location maps should use similar logic to person pic path to get just the file name (regex)
- try re.sub again all tags to get rid of things like <em></em>
- need to unpack names from double brackets

In [22]:
# # Determine if file is for Person of Interest
# to_find = "Person_Of_Interest"

# is_person_file = any([to_find in tag.get_text() for tag in all_p_tags])
# is_person_file

True