### Import Libraries and Define Constants

In [77]:
import pandas as pd
from urllib.parse import urlparse,parse_qs
from collections import defaultdict
# --- File locations ---
# Input: text file of seed URLs, read one URL per line.
SEED_URLS = './data/Urls.txt'
# Output: CSV file the tenant registry is written to.
TARGET_REGISTRY_PATH = './data/avature_tenants.csv'

# --- URL filtering ---
# Domain a URL's host must match for the URL to be kept.
TARGET_DOMAIN = 'avature.net'
# URLs whose path contains either tag (mail redirects / link trackers) are skipped.
REDIRECT_TAG = 'mailRedir'
TRACK_TAG = 'ltrk'

# --- Classification ---
# A base path containing any of these substrings is flagged as a career page.
CAREER_KEYWORDS = [
    'career',
    'job',
    'talent',
    'recruit',
    'apply',
]

### Load the Seed URLs and Verify

In [78]:
# Load de-duplicated seed URLs (one URL per line).
# A separate name is used for the set so `seed_url_list` is only ever a list.
seed_url_set = set()
try:
    # Explicit encoding so the read does not depend on the platform's locale default.
    with open(SEED_URLS, 'r', encoding='utf-8') as seed_file:
        for line in seed_file:
            seed_url = line.strip()
            # Blank / whitespace-only lines are not URLs; adding them would
            # store '' as a seed and inflate the reported total.
            if seed_url:
                seed_url_set.add(seed_url)
except FileNotFoundError:
    # Best-effort: report the problem and continue with an empty seed list.
    print("unable to retrieve file path")

# Verify count and sample
seed_url_list = list(seed_url_set)
print(f"Total Seed Urls: {len(seed_url_list)}")
print(f"first five seed Urls\n: {seed_url_list[:5]}")


Total Seed Urls: 781635
first five seed Urls
: ['https://premium.avature.net/fr_CA/jobs/Register?pipelineId=28640', 'https://nva.avature.net/jobs/JobDetail/Urgent-Care-Associate-Veterinarian-Canada-Ontario-NVAC/28888', 'https://nva.avature.net/jobs/SearchJobs/?3_83_3=4142000000000000000000000000000000000000000000000000000&jobOffset=540', 'https://nva.avature.net/jobs/SearchJobs/?3_83_3=4368000000000000000000000000000&jobOffset=1160', 'https://nva.avature.net/jobs/SearchJobs/?3_83_3=42010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000&jobOffset=10']


### Define base Tenants and corresponding Paths

In [79]:
# Map each Avature tenant (the URL's netloc) to the set of first path
# segments ("base paths") observed for it in the seed URLs.
tenants = defaultdict(set)
domain_suffix = '.' + TARGET_DOMAIN.lower()
for url in seed_url_list:
    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects netlocs with unmatched square brackets; such a
        # line cannot identify a tenant, so skip it instead of aborting the run.
        continue
    current_tenant = parsed_url.netloc
    current_path = parsed_url.path

    # Make sure only avature URLs are accounted.
    # Compare the hostname (lower-cased, without port or userinfo) as an exact
    # match or a sub-domain of TARGET_DOMAIN. A plain substring test would also
    # accept hosts such as "avature.net.example.com" or "notavature.net".
    host = parsed_url.hostname or ''
    if host != TARGET_DOMAIN.lower() and not host.endswith(domain_suffix):
        continue

    # Handle redirects and trackers
    if REDIRECT_TAG in current_path or TRACK_TAG in current_path:
        continue

    # Handle Paths
    # The path is '' or starts with '/', so element 1 of the split is the first
    # segment. The length guard covers the empty path ("https://host"), which
    # previously raised IndexError; "/" and "//" yield '' and are ignored below.
    path_parts = current_path.split('/')
    base_path = path_parts[1] if len(path_parts) > 1 else ''

    if base_path:
        tenants[current_tenant].add(base_path)


### Build the Tenant Registry CSV

In [80]:
# Verify if any career pages related keywords are present in path
def is_career_path(path, keywords=None):
    """Return True when ``path`` contains any career-related keyword.

    The match is a case-insensitive substring test.

    Parameters
    ----------
    path : str or None
        Path segment to inspect. Empty or ``None`` is never a career path.
    keywords : iterable of str, optional
        Keywords to look for. Defaults to the notebook-level
        ``CAREER_KEYWORDS`` when omitted.

    Returns
    -------
    bool
    """
    if not path:
        return False
    if keywords is None:
        keywords = CAREER_KEYWORDS
    # Lower-case the path once rather than once per keyword.
    lowered_path = path.lower()
    return any(kw.lower() in lowered_path for kw in keywords)


In [81]:
# Flatten the tenant -> base-paths mapping into one registry record per
# (tenant, base path) pair; the records become the rows of the CSV.
tenant_registry = [
    {
        'tenant': host,
        'base_url': f"https://{host}",
        'career_path': f"/{segment}" if segment else '/',
        'full_url': f"https://{host}/{segment}",
        'is_career_page': is_career_path(segment),
    }
    for host, segments in tenants.items()
    for segment in segments
]


In [82]:
# Build the registry frame. Columns are named explicitly so that an empty
# registry still yields a frame with the expected schema (and can be sorted).
df_tenants = pd.DataFrame(
    tenant_registry,
    columns=['tenant', 'base_url', 'career_path', 'full_url', 'is_career_page'],
)
# Tenants A-Z, career pages first within each tenant. 'career_path' is the
# tie-breaker: record order comes from set iteration, which varies between
# runs, so without it the row order of the CSV is not reproducible.
df_tenants = df_tenants.sort_values(
    by=['tenant', 'is_career_page', 'career_path'],
    ascending=[True, False, True],
)
df_tenants.to_csv(TARGET_REGISTRY_PATH, index=False)


### Base Statistics

In [83]:
# Re-read the registry from disk so the statistics describe the persisted CSV.
df = pd.read_csv(TARGET_REGISTRY_PATH)

print("=" * 60)
print("AVATURE TENANTS - BREAKDOWN")
print("=" * 60)

# Overall counts
total_rows = len(df)
unique_tenants = df['tenant'].nunique()

print("\nOverall:")
print(f"Total rows: {total_rows:,}")
print(f"Unique tenants: {unique_tenants:,}")

# Breakdown by page type
career_pages = df['is_career_page'].sum()
non_career_pages = len(df[~df['is_career_page']])

print("\nPage Type Breakdown:")
print(f"Career pages: {career_pages:,} ({career_pages/total_rows*100:.1f}%)")
print(f"Non-career pages: {non_career_pages:,} ({non_career_pages/total_rows*100:.1f}%)")
# The total is the sum of the two categories above. The previous version also
# added `root_pages`, a name not defined in this notebook, which raised
# NameError on a fresh kernel.
print(f"Total: {career_pages + non_career_pages:,}")



AVATURE TENANTS - BREAKDOWN

Overall:
Total rows: 2,398
Unique tenants: 536

Page Type Breakdown:
Career pages: 707 (29.5%)
Non-career pages: 1,691 (70.5%)
Total: 2,398
