In [None]:
import logging
import os
import tempfile
import time
from zipfile import ZipFile

import lasio as las
import numpy as np
import pandas as pd
import regex as re

In [None]:
# Curves (channels) that are looked for in every LAS file
required_channels = [
    'TEMP_DNI',
    'VIBX',
    'VIBY',
]

In [None]:
# Counter initialization: number of files processed successfully / with errors
scan_success = scan_failed = 0

# Names of the files behind each counter (two independent lists)
scanned_files, failed_files = [], []

# Name of the text file that records which files have already been scanned
file_name = 'log_file_list.txt'


In [None]:
# Folders to scan; several may be listed in case the data lives in more than one place
directory_list = [
    r'****',
    r'****',
]

# The directory where the logs and outputs are collected
output_directory = r'****'


In [None]:
# reading the las file

def read_las_df(directory:str, run:int, filename:str):
    """Load one LAS file into a DataFrame tagged with its well name and run.

    Parameters
    ----------
    directory : folder that contains the file.
    run : run number stored in the ``RUN`` column of every row.
    filename : name of the LAS file inside ``directory``.

    Returns
    -------
    DataFrame with one column per LAS curve plus ``WELL`` (taken from the
    ``WELL`` item of the file's well section) and ``RUN``.
    """
    parsed = las.read(os.path.join(directory, filename))

    # NOTE(review): the first data row is skipped on purpose here ([1:]);
    # the reason is not visible in this notebook - confirm it is intended.
    curves = pd.DataFrame(parsed.data[1:], columns=parsed.keys())

    return curves.assign(WELL=parsed.well['WELL'].value, RUN=run)

In [None]:
def data_processing(data, channels=None):
    """Summarise one LAS DataFrame into a single row of statistics.

    Parameters
    ----------
    data : DataFrame as produced by ``read_las_df``. It must contain the
        columns ``TIME`` (numeric, interpreted as days since the Unix epoch),
        ``WELL`` and ``RUN``. The frame is modified in place: ``TIME`` is
        converted to datetime and ``TIME_DIFF`` (and ``VIB_LAT`` when both
        ``VIBX`` and ``VIBY`` exist) are added.
    channels : optional list of column names whose absence is reported with a
        warning. Defaults to the notebook-level ``required_channels``.

    Returns
    -------
    list
        ``[well_name, run, time_10g, time_15g, median_vib, max_vib,
        median_vibz, max_vibz, max_temp, energy, n2_rate_av, n2_rate_tot,
        n2_rate_max]``. Every statistic whose source curve is missing is NaN.
        ``time_10g`` / ``time_15g`` are total seconds (float) during which
        ``VIB_LAT`` was at or above 10 / 15.

    Raises
    ------
    IndexError
        If ``data`` has no rows.
    KeyError
        If ``TIME``, ``WELL`` or ``RUN`` is missing.
    """
    # Look the logger up here instead of relying on a global created in a
    # later cell; the name matches the one configured by the main scan cell.
    logger = logging.getLogger('my-logger')

    if channels is None:
        channels = required_channels

    # data processing
    data['TIME'] = pd.to_datetime(data['TIME'], unit='D')  # converting to datetime
    data['TIME_DIFF'] = data['TIME'].diff()  # computing timestep

    # Full duration of every step in seconds. The component accessors
    # (.dt.microseconds, Timedelta.seconds) only return the sub-second /
    # sub-day part and silently drop the rest of the duration.
    step_seconds = data['TIME_DIFF'].dt.total_seconds()

    # Positional access: works for any index and for single-row frames
    well_name = data['WELL'].iloc[0]
    run = data['RUN'].iloc[0]

    # Defaults used whenever the source curve is not in the file
    time_10g = np.nan
    time_15g = np.nan
    median_vib = np.nan
    max_vib = np.nan
    energy = np.nan
    max_temp = np.nan
    n2_rate_av = np.nan
    n2_rate_tot = np.nan
    n2_rate_max = np.nan
    median_vibz = np.nan
    max_vibz = np.nan

    # check point if the curves are within the las file
    for item in channels:
        if item not in data.columns:
            logger.warning(f'The channel {item} has not been found in {well_name} well run {run}!')

    # computing VIB_LAT, max/median(VIB_LAT), energy, time above 10g/15g
    if ('VIBX' in data.columns) and ('VIBY' in data.columns):
        data['VIB_LAT'] = np.sqrt(data['VIBX']**2 + data['VIBY']**2)
        vib_lat = data['VIB_LAT']
        time_10g = step_seconds[vib_lat >= 10].sum()
        time_15g = step_seconds[vib_lat >= 15].sum()
        median_vib = vib_lat.median()
        max_vib = vib_lat.max()
        # Time integral of the lateral vibration (value * step duration)
        energy = (vib_lat * step_seconds).sum()

    # TEMP_DNI validation
    if 'TEMP_DNI' in data.columns:
        max_temp = data['TEMP_DNI'].max()

    # N2 pumping data processing and validation
    if 'N2_RATE' in data.columns:
        n2_rate_max = data['N2_RATE'].max()
        n2_rate_av = data['N2_RATE'].mean()
        # presumably N2_RATE is a per-minute rate, hence the /60 - TODO confirm
        n2_rate_tot = ((data['N2_RATE'] / 60) * step_seconds).sum()

    if 'VIBZ' in data.columns:
        median_vibz = data['VIBZ'].median()
        max_vibz = data['VIBZ'].max()

    return [well_name, run, time_10g, time_15g, median_vib, max_vib, median_vibz, max_vibz, max_temp, energy, n2_rate_av, n2_rate_tot, n2_rate_max]
        
        

In [None]:
def get_run(text:str):
    """Extract the run number and the time-based flag from a file name.

    The name is lower-cased and split on ``-``, ``_``, space, ``#`` and ``.``.
    A token containing ``run`` supplies the run number: either the digits
    inside the token itself (``run12``) or, when the token is exactly ``run``,
    the digits of the token that follows it (``run_12``). If several tokens
    qualify, the last one that actually contains digits wins. A token
    containing ``dml`` marks the file as time based.

    Parameters
    ----------
    text : file name (with or without extension).

    Returns
    -------
    (int, bool)
        Run number (0 when none can be found) and whether the file is
        time based.
    """
    tokens = re.split(r'[-_ #.]', text.lower())

    # 'dml' is an indication of timebased las file
    Is_time = any('dml' in token for token in tokens)

    run_number = 0
    for position, token in enumerate(tokens):
        if 'run' not in token:
            continue

        if token == 'run':
            # The number lives in the next token; 'run' may also be the last
            # token, in which case there is nothing to append.
            has_next = position + 1 < len(tokens)
            candidate = token + (tokens[position + 1] if has_next else '')
        else:
            candidate = token

        digits = re.findall(r'\d+', candidate)
        # Tokens such as 'trunk' contain 'run' but no number: ignore them
        # instead of failing on an empty match list.
        if digits:
            run_number = int(digits[0])

    return run_number, Is_time

In [None]:
def wells_to_file(directory:str, well_list:list):
    """Append the given names to the scan log, one per line.

    The log is the file named by the notebook-level ``file_name`` inside
    ``directory``; it is created when it does not exist yet. Duplicates within
    ``well_list`` are written once, in sorted order so the output is
    deterministic.

    Parameters
    ----------
    directory : folder that holds the scan log.
    well_list : names to record (list of str).

    Returns
    -------
    None
    """
    log_path = os.path.join(directory, file_name)

    # Mode 'a' creates the file when it is missing, so no separate 'w' branch
    # is needed; the with-block closes the handle even if a write fails.
    with open(log_path, 'a') as log_file:
        log_file.writelines(item + '\n' for item in sorted(set(well_list)))
    

In [None]:
def read_log(directory:str):
    """Return the names recorded in the scan log.

    The log is the file named by the notebook-level ``file_name`` inside
    ``directory``; each line holds one name.

    Parameters
    ----------
    directory : folder that holds the scan log.

    Returns
    -------
    list of str
        Logged names in file order, without empty lines. An empty list when
        the log does not exist.
    """
    log_path = os.path.join(directory, file_name)

    if not os.path.exists(log_path):
        return list()

    with open(log_path, 'r') as log_file:
        stripped = (line.rstrip('\n') for line in log_file)
        # Skip empty lines so the trailing newline does not become a '' entry
        return [name for name in stripped if name]

### Main Loop

In [None]:
total_las_files = 0 # Total las file counter for all folders
output_list = list() # the list where we are going to collect results for data processing

# Reset the counters here as well, so that re-running only this cell does not
# add to the numbers left over from a previous run.
scan_success = 0
scan_failed = 0
scanned_files = list()
failed_files = list()

loop_st_time = time.time() #recording the start time of the script

#logging activation
# basicConfig is given level and format in one call: a second call would be
# ignored because the root logger already has a handler after the first one.
log = logging.getLogger('my-logger')
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(message)s')

# The scan log is not written while this cell runs, so read it once instead of
# once per file; a set keeps the membership test cheap.
already_scanned = set(read_log(output_directory))


def _should_scan(name):
    """Return True for a file that is not in the scan log and is time based."""
    if name in already_scanned:
        return False
    try:
        return get_run(name)[1]
    except Exception:
        # A name that cannot be parsed must not abort the whole scan
        log.warning(f'Error in reading file...{name}')
        return False


def _iter_las_candidates(directory):
    """Yield (root, archive, name) for every LAS file to process under directory.

    archive is None for a plain .las file and the zip file name for a member
    of an archive.
    """
    for root, _dirs, files in os.walk(directory):
        for file in files:

            # nonarchived las files
            if file.endswith('.las') and _should_scan(file):
                yield root, None, file

            # archived las files
            if file.endswith('.zip'):
                try:
                    with ZipFile(os.path.join(root, file), 'r') as zip_file:
                        files_in_zip = zip_file.namelist()
                except Exception:
                    log.warning(f'Error in reading file...{file}')
                    continue

                for element in files_in_zip:
                    if 'las' in element.split('.') and _should_scan(element):
                        yield root, file, element


def _load_candidate(root, archive, name):
    """Read one candidate into a DataFrame via read_las_df."""
    if archive is None:
        return read_las_df(root, get_run(name)[0], name)

    # A zip member does not exist on disk: extract it to a temporary folder
    # and read it from there.
    with ZipFile(os.path.join(root, archive), 'r') as zip_file, \
            tempfile.TemporaryDirectory() as tmp_dir:
        extracted = zip_file.extract(name, tmp_dir)
        # NOTE(review): the run number is taken from the archive name while the
        # time-based flag comes from the member name - confirm this is intended.
        return read_las_df(os.path.dirname(extracted), get_run(archive)[0], os.path.basename(extracted))


# scanning the las files in the directories

for directory in directory_list:

    print('Start scanning the folder...', directory)

    # One walk collects the work list; its length is the per-folder file count
    candidates = list(_iter_las_candidates(directory))
    total_las_files += len(candidates)

    print('Total Las files: ', len(candidates))

    #main loop

    for root, archive, name in candidates:

        try:
            print('Reading file... {}'.format(name))
            data = _load_candidate(root, archive, name)
            print('Processing the file... {}'.format(name))
            output_list.append(data_processing(data))
        except Exception:
            # Exception (not a bare except) so that an interrupt still stops the scan
            log.warning(f'Error in reading file...{name}')
            scan_failed +=1
            failed_files.append(name)
        else:
            scan_success +=1
            scanned_files.append(name)

        print('Current Progress: ', scan_success+scan_failed,'/', total_las_files)

    #end of main loop

#calculating the total time

loop_end_time = time.time()

#displaying results of the scanning

print('Total time spent for scanning and processing...', round(loop_end_time-loop_st_time),'sec')
print('Total successfully scanned las files: ', scan_success,'/', total_las_files)
print('Total failed scans: ', scan_failed)
print('Failed files list: ',failed_files)

In [None]:
#reading the previous scan results for updating

results_path = os.path.join(output_directory, 'CTD_output.xlsx')
result_columns = ['Well', 'Run', 'Time_10g', 'Time_15g', 'Median VIB', 'Max VIB', 'Median VIBZ', 'Max VIBZ', 'Max_Temp', 'Energy', 'N2_RATE_av', 'N2_RATE_tot', 'N2_RATE_max']

if os.path.exists(results_path):
    previous_results = pd.read_excel(results_path, index_col=0)
else:
    # First run: there is no workbook yet, start from an empty table
    previous_results = pd.DataFrame(columns=result_columns)

if output_list:

    output = pd.DataFrame(output_list, columns=result_columns)

    output_dupl = output.drop_duplicates()
    print(output.shape[0] - output_dupl.shape[0], 'duplicates were deleted')

    # Leave empty tables out of the concatenation so they cannot influence
    # the column dtypes of the combined result.
    frames = [frame for frame in (previous_results, output_dupl) if not frame.empty]
    pd.concat(frames, ignore_index=True).to_excel(results_path)

    print('Updating the log file...')
    wells_to_file(output_directory, scanned_files+failed_files)

