# PERICULUM DS INTERNSHIP - TECHNICAL ASSESSMENT

In [1]:
# Create class OwnerInfo
class OwnerInfo:
  def __init__(self, owner_name, owner_address, owner_telephone):
    self.owner_name = owner_name
    self.owner_address = owner_address
    self.owner_telephone = owner_telephone

In [2]:
# Create class Inventory
class Inventory:
  def __init__(self, purchase_date, serial_number, description, source_style_area, value):
    self.purchase_date = purchase_date
    self.serial_number = serial_number
    self.description = description
    self.source_style_area = source_style_area
    self.value = value

In [3]:
# create function get_data_from_pdf that will read and extract raw text
import PyPDF2

def get_data_from_pdf(path):
    """Extracts and returns all text from a PDF file."""
    text = ""
    try:
        with open(path, 'rb') as pdf_file:  
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                text += page.extract_text() or ""  
    except Exception as e:
        print(f"Error reading PDF file: {e}")
    return text





In [4]:
pdf_text = get_data_from_pdf(r'C:\Users\Chinelo\Downloads\home_inventory.pdf')
print(pdf_text)


S/N Area Item Description Source Purchase Date Style Serial No Value
1Living Room Desk Target 07/06/2018 Premium 6DDZ7S36 846.59 $        
2Kitchen LED TV Walmart 31/05/2015 Classic NJEZ3OPO 382.04 $        
3Living Room LED TV Target 03/03/2019 Premium HRIS4LI8 1,603.37$     
4Garage Dining Table Wayfair 26/03/2023 Modern 2HLNMD64 552.74 $        
5Living Room Tool Set Target 06/01/2020 Classic R08QDU0S 1,546.39$     
6Office LED TV Amazon 03/04/2015 Classic GSABG41R 1,319.36$     
7Office Mattress Amazon 28/02/2022 Premium DVTMS64O 573.10 $        
8Bedroom Tool Set Home Depot 07/04/2017 Classic RB93WPTH 1,421.38$     
9Dining Room Mattress Target 20/03/2017 Compact 7RX8YNW9 1,760.11$     
10Living Room Desk Target 30/06/2019 Classic US1B0BQI 941.43 $        
11Garage Dining Table Wayfair 09/02/2016 Compact 1J7088BO 71.12 $           
12Bedroom Desk Home Depot 21/08/2025 Modern GVJFQYT1 841.41 $        
13Office Dining Table Best Buy 23/09/2022 Modern 5EXQQOB1 1,686.96$     
14Kitche

In [5]:
# create function align_content
# Split the raw text line by line, aligns as it appears in the PDF.

def align_content(raw_text):
    lines = raw_text.split('\n')
    aligned = [line.strip() for line in lines if line.strip()]
    return aligned



In [7]:
# create function extract_data

import re
from datetime import datetime

def extract_data(aligned_content):
    # Extract owner information
    owner_name = ""
    owner_address = ""
    owner_telephone = ""
    inventory_items = []
    owner_found = False

    for idx, line in enumerate(aligned_content):
        # Look for owner name
        if "HOME INVENTORY" in line and idx + 1 < len(aligned_content):
            owner_name = aligned_content[idx + 1]
            owner_address = aligned_content[idx + 2]
            # City, Zip and Address combined
            city_zip = aligned_content[idx + 3]
            if city_zip:
                owner_address += ", " + city_zip
            owner_telephone = aligned_content[idx + 4]
            owner_found = True
            break

    # After owner found, go back to parse inventory
    for line in aligned_content:
        # Look for lines that match inventory pattern
        match = re.match(r'^\d+\s+([A-Za-z\s]+)\s+([A-Za-z\s]+)\s+([A-Za-z\s]+)\s+([A-Za-z\s]+)\s+(\d{2}/\d{2}/\d{4})\s+([A-Za-z]+)\s+(\w+)\s+([\d,]+\.\d{2})\$', line)
        if match:
            area = match.group(1).strip()
            item = match.group(2).strip()
            description = match.group(3).strip()
            source = match.group(4).strip()
            purchase_date_raw = match.group(5).strip()
            style = match.group(6).strip()
            serial_number = match.group(7).strip()
            value_raw = match.group(8).strip()

            # Process date into ISO format
            purchase_date_obj = datetime.strptime(purchase_date_raw, "%d/%m/%Y")
            purchase_date_iso = purchase_date_obj.strftime("%Y-%m-%dT%H:%M:%S")

            # Process value into float
            value_clean = float(value_raw.replace(",", ""))

            # Create source_style_area field
            source_style_area = f"{area} - {source} - {style}"

            inventory = Inventory(
                purchase_date=purchase_date_iso,
                serial_number=serial_number,
                description=item,
                source_style_area=source_style_area,
                value=value_clean
            )

            inventory_items.append(inventory)

    # Create OwnerInfo object
    owner = OwnerInfo(
        owner_name=owner_name,
        owner_address=owner_address,
        owner_telephone=owner_telephone
    )

    # Build final response
    response = {
        "owner_name": owner.owner_name,
        "owner_address": owner.owner_address,
        "owner_telephone": owner.owner_telephone,
        "data": []
    }

    for inv in inventory_items:
        response["data"].append({
            "purchase_date": inv.purchase_date,
            "serial_number": inv.serial_number,
            "description": inv.description,
            "source_style_area": inv.source_style_area,
            "value": inv.value
        })

    return response



In [None]:
# Main section
if __name__ == "__main__":
    path = r'C:\Users\Chinelo\Downloads\home_inventory.pdf'  
    raw_text = get_data_from_pdf(path)
    aligned_content = align_content(raw_text)
    final_response = extract_data(aligned_content)
    
    import json
    print(json.dumps(final_response, indent=2))



{
  "owner_name": "Owner Information",
  "owner_address": "John Doe, 123 Maple StreetName",
  "owner_telephone": "Address",
  "data": [
    {
      "purchase_date": "2021-06-07T00:00:00",
      "serial_number": "BYWR0VQ3",
      "description": "Dining",
      "source_style_area": "Kitchen - Wayfair - Premium",
      "value": 1331.28
    },
    {
      "purchase_date": "2017-07-24T00:00:00",
      "serial_number": "Z4WKQAUD",
      "description": "Set",
      "source_style_area": "Kitchen Tool - Depot - Compact",
      "value": 1726.77
    },
    {
      "purchase_date": "2016-08-10T00:00:00",
      "serial_number": "5DTN8WP1",
      "description": "Tool",
      "source_style_area": "Dining Room - Amazon - Modern",
      "value": 1364.42
    },
    {
      "purchase_date": "2015-03-07T00:00:00",
      "serial_number": "S1IZ2ORO",
      "description": "Desk",
      "source_style_area": "Kitchen - Buy - Compact",
      "value": 1474.74
    },
    {
      "purchase_date": "2023-04-29T00:00