In [47]:
import sys
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re
from itertools import chain
import pickle
import glob
import json
import math

In [None]:
# Pick a random browser User-Agent string once per kernel session.
# `user_agent` is the headers dict handed to requests calls (see rrc_query).
ua = UserAgent()
user_agent = {'User-agent': ua.random}

#functions are in the order called in generate_pdfs()

def rrc_query(api):
    '''
    Search the RRC web app for a single well and return the parsed result page.

    api: the well's API number, submitted as the search argument of the form.

    Returns a BeautifulSoup document built from the HTML text of the POST
    response (parsed with the 'html.parser' backend).
    '''
    search_endpoint = 'http://webapps.rrc.texas.gov' + '/CMPL/publicSearchAction.do'

    # Form fields expected by the public search action.
    form_fields = dict([
        ('searchArgs.apiNoHndlr.inputValue', api),
        ('formData.methodHndlr.inputValue', 'search'),
    ])

    reply = requests.post(search_endpoint, data=form_fields, headers=user_agent)
    with reply:
        return BeautifulSoup(reply.text, 'html.parser')

In [59]:
# Random browser User-Agent header; collectIds sends it with its search request.
ua = UserAgent()
user_agent = {'User-agent': ua.random}

# Root of the document-search site. The functions below append paths that start
# with "/" (e.g. "/api.php?function=..."), so this value has no trailing slash.
url_base = "https://rrcsearch3.neubus.com/esd3-rrc"

def collectIds(page_num=0,page_size=50):
    '''
    Fetch one page of LAS search results and return the IDs found on it.

    Posts a SearchImages query (file_type == "LAS") to url_base + "/api.php".

    page_num:  page index sent as "page" (scrapeManager starts counting at 0).
    page_size: number of results requested per page, sent as "pageSize".

    Returns a tuple (page_results, num_results):
      page_results -- list holding the second-to-last field of every entry in
                      NeusearchResults.images (used by scrapeManager as the
                      well/document ID passed to getLogID).
      num_results  -- NeusearchResults.numImages, the total hit count.

    Raises requests.RequestException on network/HTTP failure, ValueError if the
    body is not JSON, and KeyError if the expected keys are missing. Failures
    are not converted to None because the caller unpacks the returned tuple.
    '''
    api_url = "/api.php?function=SearchImages"

    request_json = {"Neusearch":
                    {"profile":"15",
                     "api_key":"publicuser",
                     "api_sig":"68461babf138014f252e64732ef3b1a0",
                     "Searchitems": {"item":[{"key":"file_type","value":"LAS"}]},
                     "page":page_num,
                     "pageSize":page_size,
                     "strict":"true",
                     "saveSearch":"true"}
                   }

    # The API expects the whole query as a JSON string in a form field named "json".
    request_string = {
        "json" : json.dumps(request_json)
    }

    # timeout: without one, requests can block forever on a stalled connection.
    with requests.post(url_base+api_url, data=request_string, headers=user_agent, timeout=30) as response:
        # Surface HTTP error statuses instead of trying to interpret an error body.
        response.raise_for_status()
        response_dict = json.loads(response.text)

    search_results = response_dict['NeusearchResults']
    num_results = search_results['numImages']
    page_results = [image[-2] for image in search_results['images']]

    # The former `except NameError` branch was unreachable: any failure raised
    # inside the `with` block before the names could be left undefined.
    return page_results , num_results

def getLogID(docID):
    '''
    Look up the well-log document ID attached to a search result.

    Sends a GetDocType request for `docID` and returns the 'id' of the first
    entry under docTypes -> 'Well Log' (this assumes one log per well).

    Returns None, after printing a notice, when the request fails, the body is
    not valid JSON, or the response has no 'Well Log' entry.
    '''
    api_url = "/api.php?function=GetDocType"

    request_json = {"Neusearch":
                    {"profile":"15",
                     "api_key":"publicuser",
                     "api_sig":"68461babf138014f252e64732ef3b1a0",
                     "docID": docID}
                   }

    request_string = {
        "json" : json.dumps(request_json),
        "_" : 1572212833587  # presumably a cache-busting timestamp copied from a browser request -- confirm
    }

    try:
        # timeout: without one, requests can block forever on a stalled connection.
        with requests.get(url_base+api_url,params=request_string,timeout=30) as response:
            response.raise_for_status()
            response_dict = json.loads(response.text)
        return response_dict['docTypes']['Well Log'][0]['id'] # [0] Assumes 1 log / well
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # The original caught NameError, which this code can never raise; these
        # are the failures that actually occur (network/HTTP error, non-JSON
        # body, missing or empty 'Well Log' list).
        print('No response was returned')
        return None

def getURL(logID):
    '''
    Resolve a well-log document ID to the URL of its first attached file.

    Sends a GetMiscDoc request for `logID` and returns
    fileSets[0].files[0].url from the JSON response. The value is returned
    exactly as the API supplies it (the notebook output shows relative URLs of
    the form 'api.php?function=GetAttachment&...').

    Returns None, after printing a notice, when the request fails, the body is
    not valid JSON, or the response contains no file entry.
    '''
    api_url = "/api.php?function=GetMiscDoc"

    request_json = {"Neusearch":
                    {"profile":"15",
                     "api_key":"publicuser",
                     "api_sig":"68461babf138014f252e64732ef3b1a0",
                     "miscDocID": logID}
                   }

    request_string = {
        "json" : json.dumps(request_json),
        "_" : 1572212833587  # presumably a cache-busting timestamp copied from a browser request -- confirm
    }

    try:
        # timeout: without one, requests can block forever on a stalled connection.
        with requests.get(url_base+api_url,params=request_string,timeout=30) as response:
            response.raise_for_status()
            response_dict = json.loads(response.text)
        return response_dict['fileSets'][0]['files'][0]['url']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # The original caught NameError, which this code can never raise; these
        # are the failures that actually occur (network/HTTP error, non-JSON
        # body, missing or empty fileSets/files).
        print('No response was returned')
        return None
    
def scrapeManager(page_size=50,page_limit=1):
    '''
    Run the three scraping stages and return the list of log-file URLs.

    1. collectIds  -- page through the LAS search results, gathering well IDs.
    2. getLogID    -- map every well ID to its well-log document ID.
    3. getURL      -- map every log ID to the URL of its file.

    page_size:  results requested per search page.
    page_limit: maximum number of pages to collect; capped at the number of
                pages the search actually has.

    Returns a list with one getURL() result per collected well ID, in order.
    Progress is printed as each stage runs.
    '''

    print('Gathering well IDs')
    id_list = []
    page_num = 0
    while page_num < page_limit:
        print(f'    Collecting Page {page_num+1}')
        page_results , num_results = collectIds(page_num,page_size)
        # total_pages is a page COUNT and is compared against the 0-based
        # page_num with "<", so it must not have 1 subtracted. The previous
        # "- 1" silently dropped the last available page.
        total_pages = math.ceil(num_results/page_size)
        page_limit = min(page_limit, total_pages)
        id_list += page_results
        page_num += 1
    print('------------------')

    log_id_list = []
    print('Gathering log IDs')
    for well_id in id_list:
        print(f'    Getting log ID for {well_id}')
        log_id_list.append(getLogID(well_id))
    print('------------------')

    URL_list = []
    print('Gathering log file URLs')
    for log_id in log_id_list:
        print(f'    Getting log URL for {log_id}')
        URL_list.append(getURL(log_id))
    print('------------------')

    return URL_list

In [60]:
# Run the scrape with the defaults (page_size=50, page_limit=1): one page of results.
log_urls = scrapeManager()

Gathering well IDs
    Collecting Page 1
------------------
Gathering log IDs
    Getting log ID for hz6dahx7n8I.
    Getting log ID for 1EkJK1j1ILI.
    Getting log ID for WrCiWd2PPpk.
    Getting log ID for tIGLh94CFAA.
    Getting log ID for W-2U-1mVCPY.
    Getting log ID for 0t6PQ9a8YLE.
    Getting log ID for H5clnO_7qZQ.
    Getting log ID for RzW7R76amnM.
    Getting log ID for EgEJ0ukK9s8.
    Getting log ID for u3TcgIJ-I8Y.
    Getting log ID for _fVQhfkvhZE.
    Getting log ID for 5gBcw-Aac78.
    Getting log ID for 3156F-vVGCM.
    Getting log ID for P7iZ-GVnEbg.
    Getting log ID for Q80VSm-fYXM.
    Getting log ID for JeplABYlvqE.
    Getting log ID for o3m-uJT_LDk.
    Getting log ID for XhsxvijQeMU.
    Getting log ID for irQNTV3ums4.
    Getting log ID for -k8TLVGHB48.
    Getting log ID for D3S00r59dpU.
    Getting log ID for DPMquoN8zuE.
    Getting log ID for DT_x8SQdz8s.
    Getting log ID for kEJAfhzaLBY.
    Getting log ID for SVcylxdUYUM.
    Getting log ID for

In [61]:
# Inspect the URLs returned by scrapeManager(); they are relative to the site root.
log_urls

['api.php?function=GetAttachment&profile=15&id=kNAMejO3OhA.&docid=_fmqnOHMHZo.',
 'api.php?function=GetAttachment&profile=15&id=B_luBUBgUfU.&docid=CCm8PzUAKWE.',
 'api.php?function=GetAttachment&profile=15&id=sg62sDmqr1w.&docid=KA7DImcpd4A.',
 'api.php?function=GetAttachment&profile=15&id=NIA7rBuKkWc.&docid=tR9rbC1byIg.',
 'api.php?function=GetAttachment&profile=15&id=3Eni5H5ObHc.&docid=g_ndzMAra0w.',
 'api.php?function=GetAttachment&profile=15&id=qn2lKJVblWs.&docid=lj-ZbuWxM6E.',
 'api.php?function=GetAttachment&profile=15&id=pAn9KHGn9F0.&docid=r20SqiISE2o.',
 'api.php?function=GetAttachment&profile=15&id=vvE_qaNmVCM.&docid=NQtFFj3hmcQ.',
 'api.php?function=GetAttachment&profile=15&id=bgSc9YKITaI.&docid=BuQP95FqiN4.',
 'api.php?function=GetAttachment&profile=15&id=HOQbd3Gf3LU.&docid=iTwxcaE0xsc.',
 'api.php?function=GetAttachment&profile=15&id=KMXlJBcPSVw.&docid=n9kjgCJ2wcQ.',
 'api.php?function=GetAttachment&profile=15&id=AnianGpeupU.&docid=YqqXweusOvs.',
 'api.php?function=GetAttach

In [69]:
# Download the first LAS file found by scrapeManager().
#
# The global `url_base` is deliberately NOT rebound here. collectIds/getLogID/
# getURL append paths starting with "/" to it, so giving it a trailing slash in
# this cell made their request URLs contain "//" whenever they were re-run
# afterwards. Joining on exactly one "/" works for either form.
log_url = log_urls[0]
las_url = url_base.rstrip('/') + '/' + log_url.lstrip('/')

# Browser-like request headers copied from a real session.
# NOTE(review): the Cookie is a hard-coded session value; it will expire and
# should not be committed -- confirm whether the download works without it.
# NOTE(review): 'br' in Accept-Encoding is only decoded by requests when a
# brotli package is installed -- confirm, or drop 'br'.
headers = {
    'Host': 'rrcsearch3.neubus.com',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    'Sec-Fetch-User': '?1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Referer': 'https://rrcsearch3.neubus.com/esd3-rrc/index.php?_module_=esd&_action_=keysearch&profile=15',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cookie': 'NeuAuth=d8b7e52c2ba379c4195b7c4894972b2f; Neups=50; NeuESD=6a0vmil2id5h9io57dbohafd23'
}

# timeout: without one, requests can block forever on a stalled connection.
with requests.get(las_url,headers=headers,timeout=60) as response:
    # Fail loudly on an HTTP error instead of storing an error page as LAS text.
    response.raise_for_status()
    response_las = response.text

In [70]:
# Show the downloaded text to confirm it is a LAS file (it should begin with "~VERSION INFORMATION").
response_las

"~VERSION INFORMATION \r\n VERS. 2.0: CWLS LOG ASCII STANDARD -VERSION 2.0 \r\n WRAP.  No: SINGLE LINE PER DEPTH STEP\r\n~WELL INFORMATION BLOCK \r\n#MNEM.UNIT\t\t  DATA\t\t\t        DESCRIPTION OF MNEMONIC\r\n#------------------------------------------------------------------------------\r\n STRT.ft                   3522.50                      : Start Depth\r\n STOP.ft               \t  17014.00                      : Stop  Depth\r\n STEP.ft                      0.50                      : Step  Depth\r\n NULL.                     -999.25                      : Null value\r\n COMP.                    Penn Virginia Oil & Gas, L.P. : Company Name\r\n WELL.                    L & J Lee Unit No. 1H         : Well Name\r\n PATH.                    Original Path Hole            : Path Name\r\n FLD.                     Eagleville (Eagle Ford-1)     : Field Name\r\n LOC.                     Gonzales County               : Location\r\n PROV.                    Texas                         :

In [71]:
# Save the downloaded LAS text to disk.
# newline='' writes the "\r\n" line endings already present in the response
# unchanged; with the default, Windows would turn them into "\r\r\n".
# An explicit encoding avoids depending on the platform's locale default.
with open('test.las', 'w', encoding='utf-8', newline='') as file:
    file.write(response_las)

In [66]:
# Show which random User-Agent header was chosen for this session.
user_agent

{'User-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}