### Import Libraries and Define Constants

In [16]:
import pandas as pd
import requests
import re
import time
import socket
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse,parse_qs
from collections import defaultdict
from helpers import RateLimiter
from bs4 import BeautifulSoup
    
ORIGINAL_SEED_URLS = './data/Urls.txt'
TARGET_DOMAIN = 'avature.net'
REDIRECT_TAG = 'mailRedir'
TRACK_TAG = 'ltrk'
CAREER_KEYWORDS = ['career', 'job', 'talent', 'recruit', 'apply']
TARGET_REGISTRY_PATH = './data/avature_tenants.csv'
EXPANDED_SEED_URLS= './data/expanded_urls.csv'


### Load the Seed URLS and verify

In [17]:
# Load de-duplicated seed-urls
seed_url_list=set()
try:
    with open(ORIGINAL_SEED_URLS,'r') as seed_file:
        for line in seed_file:
            seed_url_list.add(line.strip())
except FileNotFoundError:
    print("unable to retrieve file path")

# Verify count and sample
seed_url_list=list(seed_url_list)
print(f"Total Seed Urls: {len(seed_url_list)}")
print(f"first five seed Urls\n: {seed_url_list[:5]}")


Total Seed Urls: 781635
first five seed Urls
: ['https://fmlogistic.avature.net/ro_RO/careers/_indeedLogin?jobId=5225', 'https://nva.avature.net/jobs/SearchJobs/?3_83_3=430600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000&jobOffset=1370', 'https://nva.avature.net/jobs/SearchJobs/?3_83_3=2783030000000000&jobOffset=1730', 'https://nva.avature.net/jobs/SearchJobs/?3_83_3=430600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000&jobOffset=300', 'https://premium.avature.net/en_US/jobs/PipelineDetail/Wireless-Sales-Pro/32786']


### Define base Tenants and corresponding Paths

In [18]:
tenants=defaultdict(set)
for url in seed_url_list:
    parsed_url=urlparse(url)
    current_tenant=parsed_url.netloc
    current_path=parsed_url.path

    # Make sure only avature URLs are accounted
    if TARGET_DOMAIN not in current_tenant:
        continue

    # Handle redirects and trackers
    if REDIRECT_TAG in current_path or TRACK_TAG in current_path:
        continue

    # Handle Paths
    if current_path=='/':
        base_path=''
    else:
        base_path=current_path.rstrip('/').split('/')[1] 

    if base_path:
        tenants[current_tenant].add(base_path)


### Build the Tenant Registry CSV

In [19]:
# Verify if any career pages related keywords are present in path
def is_career_path(path):
    if not path:
        return False
    return any(kw in path.lower() for kw in CAREER_KEYWORDS)


In [20]:
# Collect base attributes for each Tenant and store in CSV
tenant_registry=[]
for tenant,paths in tenants.items():
    for path in paths:
        current_data={
            'tenant':tenant,
            'base_url':f"https://{tenant}",
            'career_path':f"/{path}" if path else '/',
            'full_url':f"https://{tenant}/{path}",
            'is_career_page':is_career_path(path)
        }
        tenant_registry.append(current_data)


In [21]:
df_tenants=pd.DataFrame(tenant_registry)
df_tenants=df_tenants.sort_values(by=['tenant','is_career_page'],ascending=[True,False])
df_tenants.to_csv(TARGET_REGISTRY_PATH,index=False)


### Base Statistics

In [22]:
df = pd.read_csv(TARGET_REGISTRY_PATH)

print("=" * 60)
print("AVATURE TENANTS - BREAKDOWN")
print("=" * 60)

# Overall counts
total_rows = len(df)
unique_tenants = df['tenant'].nunique()

print(f"\nOverall:")
print(f"Total rows: {total_rows:,}")
print(f"Unique tenants: {unique_tenants:,}")

# Breakdown by page type
career_pages = df['is_career_page'].sum()
non_career_pages = len(df[~df['is_career_page']])

print(f"\nPage Type Breakdown:")
print(f"Career pages: {career_pages:,} ({career_pages/total_rows*100:.1f}%)")
print(f"Non-career pages: {non_career_pages:,} ({non_career_pages/total_rows*100:.1f}%)")
print(f"Total: {career_pages + non_career_pages:,}")



AVATURE TENANTS - BREAKDOWN

Overall:
Total rows: 2,398
Unique tenants: 536

Page Type Breakdown:
Career pages: 707 (29.5%)
Non-career pages: 1,691 (70.5%)
Total: 2,398


In [23]:
def extract_tenant_from_domain(domain):
    """Extract base tenant from any subdomain format"""
    if not domain or TARGET_DOMAIN not in domain:
        return None
    
    domain = domain.lower().strip()
    
    if domain.count('.') == 2:
        match = re.match(r'^([a-z0-9-]+)\.avature\.net$', domain)
        if match and len(match.group(1)) > 2:
            return domain
    elif domain.count('.') >= 3:
        match = re.search(r'([a-z0-9-]+)\.avature\.net', domain)
        if match and len(match.group(1)) > 2:
            return f"{match.group(1)}.avature.net"
    
    return None

### Additional step for Seed URL expansion: CT Disocvery

In [24]:
existing_tenants=set(df['tenant'].unique())
expanded_tenant_registry=[]

In [25]:
def expand_with_ct(existing_tenants):
    """
    Strategy 1: Enhanced Certificate Transparency parsing
    """
    discovered = set()
    try:
        response = requests.get(
            "https://crt.sh/?q=%.avature.net&output=json",
            timeout=180,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        )
        
        if response.status_code == 200:
            data = response.json()
            print(f"  Certificate records: {len(data):,}")
            
            for entry in data:
                name_value = entry.get('name_value', '')
                
                for name in name_value.split('\n'):
                    name = name.strip().lower()
                    
                    if 'avature.net' in name and '*' not in name:
                        tenant = extract_tenant_from_domain(name)
                        if tenant and tenant not in existing_tenants:
                            discovered.add(tenant)
            
            print(f"New tenants found: {len(discovered)}")
        else:
            print(f"Error: HTTP {response.status_code}")
    
    except Exception as e:
        print(f"  Error: {e}")
    
    return discovered



In [26]:
ct_tenants= expand_with_ct(existing_tenants)
existing_tenants.update(ct_tenants)

for tenant in ct_tenants:
    for path in ['careers', 'jobs', 'talent', 'recruiting', 'talentcommunity']:
        expanded_tenant_registry.append({
            'tenant': tenant,
            'base_url': f"https://{tenant}",
            'career_path': f"/{path}",
            'full_url': f"https://{tenant}/{path}",
            'is_career_page': True,
        })

  Certificate records: 164
New tenants found: 28


In [27]:
def expand_with_company_search(existing_tenants):
    """
    Strategy : Test Fortune 1000 + Global 500 companies
    Direct HTTP HEAD requests (faster than DNS)
    """
    print("\nFortune 1000 + Global 500 brute force...")
    
    # Comprehensive list of major companies
    major_companies = [
        # Technology (100 companies)
        'microsoft', 'apple', 'google', 'alphabet', 'amazon', 'meta', 'facebook',
        'ibm', 'oracle', 'sap', 'salesforce', 'adobe', 'cisco', 'intel', 'amd',
        'nvidia', 'qualcomm', 'broadcom', 'micron', 'texas-instruments', 'ti',
        'dell', 'hp', 'hpe', 'vmware', 'redhat', 'servicenow', 'workday',
        'splunk', 'palo-alto', 'fortinet', 'crowdstrike', 'okta', 'zoom',
        'slack', 'atlassian', 'datadog', 'snowflake', 'mongodb', 'elastic',
        
        # Finance (150 companies)
        'jpmorgan', 'jpmorganchase', 'jpm', 'chase', 'bankofamerica', 'bofa',
        'wellsfargo', 'wf', 'citi', 'citigroup', 'citibank', 'goldmansachs', 'gs',
        'morganstanley', 'ms', 'ubs', 'credit-suisse', 'cs', 'deutschebank', 'db',
        'barclays', 'hsbc', 'bnpparibas', 'bnp', 'societe-generale', 'sg',
        'blackrock', 'vanguard', 'fidelity', 'statestreet', 'invesco',
        'franklin-templeton', 'troweprice', 'capital-group', 'jpmorgan-asset',
        'amex', 'american-express', 'visa', 'mastercard', 'discover',
        'charles-schwab', 'schwab', 'tdameritrade', 'etrade', 'robinhood',
        'prudential', 'metlife', 'aig', 'travelers', 'allstate', 'progressive',
        'nationwide', 'liberty-mutual', 'usaa', 'state-farm',
        
        # Consulting (50 companies)
        'deloitte', 'deloitteus', 'deloitteuk', 'deloittebe', 'deloitteau',
        'pwc', 'pwcus', 'pwcuk', 'kpmg', 'kpmgus', 'kpmguk', 'ey', 'eyus', 'eyuk',
        'accenture', 'accentureus', 'mckinsey', 'bcg', 'bain', 'booz', 'boozallen',
        'oliverwyman', 'atkearney', 'rolandberger', 'lbg', 'capgemini', 'cognizant',
        'infosys', 'wipro', 'tcs', 'hcl', 'atos', 'dxc',
        
        # Pharma/Healthcare (100 companies)
        'pfizer', 'jnj', 'johnson-johnson', 'merck', 'abbvie', 'novartis',
        'roche', 'sanofi', 'gsk', 'glaxosmithkline', 'astrazeneca', 'bms',
        'bristol-myers', 'bristol', 'lilly', 'eli-lilly', 'gilead', 'biogen',
        'amgen', 'regeneron', 'vertex', 'moderna', 'biontech', 'illumina',
        'unitedhealth', 'uhg', 'anthem', 'elevance', 'cigna', 'humana', 'centene',
        'hca', 'tenet', 'universal-health', 'hcahealthcare', 'commonspirit',
        'ascension', 'providence', 'mayo', 'cleveland', 'johnshopkins', 'kaiser',
        
        # Manufacturing/Industrial (80 companies)
        'ge', 'generalelectric', 'honeywell', 'siemens', '3m', 'caterpillar',
        'deere', 'boeing', 'lockheed', 'lockheedmartin', 'raytheon', 'rtx',
        'northrop', 'northropgrumman', 'generaldynamics', 'gd', 'l3harris',
        'emerson', 'rockwell', 'schneider', 'abb', 'eaton', 'parker',
        'ford', 'gm', 'general-motors', 'stellantis', 'fca', 'toyota', 'honda',
        'nissan', 'hyundai', 'kia', 'vw', 'volkswagen', 'bmw', 'mercedes',
        'daimler', 'audi', 'porsche', 'tesla', 'rivian', 'lucid',
        
        # Energy (60 companies)
        'exxon', 'exxonmobil', 'chevron', 'shell', 'bp', 'total', 'totalenergies',
        'conocophillips', 'cop', 'valero', 'marathon', 'phillips66',
        'halliburton', 'slb', 'schlumberger', 'baker-hughes', 'weatherford',
        'duke-energy', 'southern-company', 'nextera', 'dominion', 'exelon',
        
        # Retail/Consumer (100 companies)
        'walmart', 'target', 'costco', 'homedepot', 'lowes', 'kroger',
        'albertsons', 'publix', 'wegmans', 'bestbuy', 'macys', 'nordstrom',
        'gap', 'tjx', 'ross', 'burlington', 'kohls', 'nike', 'adidas',
        'pg', 'proctergamble', 'unilever', 'nestle', 'cocacola', 'pepsico',
        'kraft', 'mondelez', 'mars', 'general-mills', 'kellogg', 'campbells',
        'mcdonalds', 'starbucks', 'yum', 'chipotle', 'dominos', 'subway',
        
        # Telecom/Media (50 companies)
        'att', 'verizon', 'tmobile', 'sprint', 'comcast', 'charter', 'cox',
        'vodafone', 'orange', 'telefonica', 'bt', 'disney', 'warnermedia',
        'paramount', 'cbs', 'nbc', 'universal', 'sony', 'fox', 'netflix',
        
        # Other (50 companies)
        'fedex', 'ups', 'dhl', 'maersk', 'marriott', 'hilton', 'hyatt',
        'aon', 'marsh', 'willis', 'cbre', 'jll', 'cushman', 'colliers',
        'leidos', 'saic', 'caci', 'aecom', 'jacobs', 'fluor', 'bechtel',
    ]
    
    print(f"  Testing {len(major_companies)} major companies...")
    
    discovered_tenants = set()
    
    def test_tenant(company):
        """Test if a company tenant exists via HTTP HEAD"""
        tenant = f"{company}.avature.net"
        
        if tenant in existing_tenants:
            return None
        
        try:
            response = requests.head(
                f"https://{tenant}/careers",
                timeout=5,
                allow_redirects=True,
                headers={'User-Agent': 'Mozilla/5.0'}
            )
            
            if response.status_code in [200, 301, 302, 403]:
                return tenant
        except:
            pass
        
        return None
    
    found_count = 0
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(test_tenant, company): company for company in major_companies}
        
        for future in as_completed(futures):
            result = future.result()
            if result:
                discovered_tenants.add(result)
                found_count += 1
                if found_count % 10 == 0:
                    print(f"    Found {found_count} live tenants...")
    
    print(f"\n  Live tenants found: {len(discovered_tenants)}")
    
    if len(discovered_tenants) > 0:
        print(f"\n  Sample:")
        for tenant in sorted(discovered_tenants)[:30]:
            print(f"    {tenant}")
    
    return discovered_tenants

In [28]:
search_based_tenants= expand_with_company_search(existing_tenants)
existing_tenants.update(search_based_tenants)

# Add to registry
for tenant in search_based_tenants:
    for path in ['careers', 'jobs', 'talent', 'recruiting']:
        expanded_tenant_registry.append({
            'tenant': tenant,
            'base_url': f"https://{tenant}",
            'career_path': f"/{path}",
            'full_url': f"https://{tenant}/{path}",
            'is_career_page': True,
        })



Fortune 1000 + Global 500 brute force...
  Testing 313 major companies...

  Live tenants found: 2

  Sample:
    boozallen.avature.net
    schneider.avature.net


In [29]:
# Load seed-based registry
df_seeds = pd.read_csv(TARGET_REGISTRY_PATH)
print(f"Unique tenants from original seed URLs: {df_seeds['tenant'].nunique()}")

# Create expanded tenants DataFrame
df_expanded = pd.DataFrame(expanded_tenant_registry)
print(f"Unique tenants from expanded seed URLs: {df_expanded['tenant'].nunique()}")


# Combine both datasets
df_combined = pd.concat([df_seeds, df_expanded], ignore_index=True)

# Remove exact duplicates (same tenant + path)
df_combined = df_combined.drop_duplicates(
    subset=['tenant', 'career_path'], 
    keep='first'  # Keep seed version for duplicates
)
# Sort by tenant and career page priority
df_combined = df_combined.sort_values(
    by=['tenant', 'is_career_page'], 
    ascending=[True, False]
)

# Save combined dataset
df_combined.to_csv(TARGET_REGISTRY_PATH, index=False)
print(f'Final rows: {len(df_combined)}')

# Also save just CT discoveries for reference
df_expanded.to_csv(EXPANDED_SEED_URLS, index=False)


Unique tenants from original seed URLs: 536
Unique tenants from expanded seed URLs: 30
Final rows: 2546
