In [1]:
# Import libraries
import os
from pathlib import Path
import tarfile
import time

In [2]:
# Get current working directory of the running kernel process
cwd = os.getcwd()

# Inspect results (a bare last expression is displayed as the cell output)
cwd

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\scripts'

In [3]:
# Move to the 'alarm-event-logs' folder, which sits next to the current folder
# (i.e. under the parent of the current working directory).
# os.path.join supplies the platform's path separator instead of a hard-coded '\\'.
# NOTE: the target is computed relative to the *current* directory, so this cell
# must be run exactly once per kernel session; re-running it would look for the
# folder one level higher.
os.chdir(os.path.join(os.path.dirname(os.getcwd()), 'alarm-event-logs'))

# Remember the new location; the target-folder cell further down builds its path from cwd
cwd = os.getcwd()

# Check directory location
cwd

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs'

In [4]:
# Inspect files in directory
# (os.listdir() without an argument lists the current working directory)
fileList = os.listdir()
fileList

['alarmLog.7z',
 'alarmLog_expanded',
 'dataCleaned',
 'desktop.ini',
 'dummyLog',
 'dummyLog - Holding',
 'DummyZip',
 'eventLog.7z',
 'eventLog_expanded',
 'Original Sample from 27 Oct 2020 (simplified)',
 'Repair Logs',
 'Sample from 27 Oct 2020 (OG).zip',
 'sample_data_ats',
 'sample_data_ats.zip',
 'sample_data_cms',
 'sample_data_cms - test',
 'sample_data_cms.zip',
 'sample_data_ecs',
 'sample_data_ecs.zip',
 'testLog']

In [5]:
# Location of Alarm and Normal Event Files
# Exactly one targetFolder assignment should be active; the commented lines are
# alternative data sets that can be swapped in.
# The value is appended directly to cwd (cwd + targetFolder) in the next cell,
# which is why it starts with a path separator.
#targetFolder = '\\DummyZip\\testZip_Test - Copy'
#targetFolder = '\\sample_data_ecs\\batch 001 - 20201230 to 20210201\\EventLogs'
#targetFolder = '\\sample_data_cms\\batch 001 - 20201230 to 20210201\\AlarmLogs'
targetFolder = '\\sample_data_test2\\batch 001 - 20201230 to 20210201\\TestLogs\\CMS-0000005'

In [6]:
# Define root file directory folder where the files are being stored
# (cwd is the 'alarm-event-logs' location captured earlier; targetFolder is the
# sub-path selected in the previous cell)
# NOTE(review): the saved output of this cell shows a
# '...\\sample_data_cms\\...\\EventLogs' path, which does not match the
# targetFolder value currently assigned above - the stored output appears to
# come from an earlier run with a different targetFolder; re-run to confirm.
os.chdir(cwd + targetFolder)

# Check directory location
os.getcwd()

'C:\\Users\\cftfda01\\Documents\\SBST Train IAMS Project\\alarm-event-logs\\sample_data_cms\\batch 001 - 20201230 to 20210201\\EventLogs'

In [7]:
# Inspect files in directory
# fileList is reassigned here: it now holds the entries of the current working
# directory at this point, replacing the listing stored by the earlier cell.
# Only the number of entries is displayed, not the names.
fileList = os.listdir()
len(fileList)

798

In [8]:
# Define function to get folder contents
def list_files(dir):
    """Return the path of every file found anywhere beneath ``dir``.

    The tree is traversed with ``os.walk``; each entry in the result is the
    containing folder joined with the file name. Folders themselves are not
    included, so an empty tree yields an empty list.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(dir)
        for filename in filenames
    ]


def list_files2(dir):
    """Return two parallel lists describing every file beneath ``dir``.

    Returns:
        A ``(names, paths)`` tuple. ``names[i]`` is the bare file name and
        ``paths[i]`` is the same file's path (containing folder joined with
        the name), in the order ``os.walk`` produced them.
    """
    names = []
    paths = []
    for folder, _subfolders, filenames in os.walk(dir):
        # Both lists grow in the same order so they stay index-aligned
        names.extend(filenames)
        paths.extend(os.path.join(folder, filename) for filename in filenames)
    return names, paths


In [9]:
# Generate list of files with directory
#fileList_Dir = list_files(os.getcwd())

# Inspect list of files
#fileList_Dir

In [10]:
# Generate list of files with directory
#fileList_Dir2 = list_files2(os.getcwd())

# Inspect list of files
#fileList_Dir2

In [11]:
#indexCounter = 0
#print(fileList_Dir2[0][indexCounter])
#print(fileList_Dir2[1][indexCounter])

# Parsing Strategy

**Scenario**
1. The folder may contain thousands of files: a mix of log files, .tar files and .tar.gz files
2. Each .tar and .tar.gz file may itself contain further .tar and .tar.gz files, nested to varying depths
3. Extracting too many files to a single directory would cause the Windows indexer to overwork, resulting in a severe slowdown in system performance
4. Some log files are duplicates


**Goal**
1. Extract out all the log files from the corresponding multi-level .tar and .tar.gz files
2. Bin the log files into smaller single level subdirectories of 50K files each
3. Delete .tar and .tar.gz files once files have been extracted
4. Remove duplicates

**Plan**
1. Initialise variables
2. Create while loop to check for length of list of files within directory (including subdirectories) that contain .tar.gz or .tar files
    1. Use os.walk to comb through directory and subdirectory for .tar.gz or .tar files
    2. For each .tar.gz or .tar file found
        1. Extract a file to a new unique folder as a subfolder to the main parent directory
        2. Delete original file once extraction has been completed
        3. Update counter
    3. Use os.walk to comb through directory for any empty subfolders (this is caused by the child elements being .tar.gz or .tar files which were extracted out)
    4. If a subfolder is found to be empty, delete the folder
3. Repeat Step 2 till there are no more .tar.gz or .tar files

**Note**
1. Duplicates are only overwritten automatically when the files are written to a common directory
2. Additional duplicate removal needs to be done when all the processed files are merged







In [12]:
# Unpack every (possibly nested) .tar / .tar.gz archive below the current
# working directory into numbered "output - N" folders, delete each archive once
# it has been extracted, then remove same-named duplicate files and empty folders.
# Relies on list_files() and list_files2() defined in an earlier cell.

# Start timer
startTime = time.time()

# Archive suffixes to unpack. ".tar.gz" does not end with ".tar", so both are needed.
docType_targz = ".tar.gz"
docType_tar = ".tar"
archiveSuffixes = (docType_targz, docType_tar)

# Get Current Directory
# Captured once so that every output folder is created beneath the same root.
rootDir = os.getcwd()
print(rootDir)

# Need to update counter with each run
# 899,999,999 possible subfolders can be created
counter = 100000001
runCounter = 1

# Repeat till there are no more .tar.gz or .tar files in directory and subfolders.
# Archives unpacked in one pass may themselves contain archives; those are picked
# up by the next pass.
while any(s.endswith(archiveSuffixes) for s in list_files(rootDir)):

    print("Commence Extraction Run: " + str(runCounter))
    print("Batch started on: " + time.ctime(time.time()))

    # Walk the whole tree and handle each archive found
    for dirpath, _dirnames, files in os.walk(rootDir):
        for file in files:
            # target only specific doctypes
            if not file.endswith(archiveSuffixes):
                continue

            filepath = os.path.join(dirpath, file)

            # Each archive gets its own uniquely numbered folder directly under
            # rootDir (the same folder the archive is then extracted into).
            outputDir = os.path.join(rootDir, "output - " + str(counter))
            os.makedirs(outputDir, exist_ok=True)

            # Update counter
            counter = counter + 1

            # Extract File; the context manager closes the archive even if
            # extraction raises, and the archive is only deleted after success.
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive. If archives may come from an untrusted source, consider
            # tarfile's extraction `filter` argument (available in newer Python
            # versions) - confirm the interpreter version before adding it.
            with tarfile.open(filepath) as tar:
                tar.extractall(outputDir)

            # Delete file
            os.remove(filepath)

    # Print Current Time
    print("Completed Extraction Run: " + str(runCounter))
    print("Batch completed on: " + time.ctime(time.time()))
    print()

    # Update counter
    runCounter = runCounter + 1

print("Commence file and folder clean up")
# Remove duplicate files with the same name spread across multiple subfolders.
# fileList_Dir2 is (names, paths): two index-aligned lists.
fileList_Dir2 = list_files2(rootDir)
# A set is used for the "already seen" check because membership tests on a set
# are significantly faster than on a list for long inputs.
trialFileSet = set()
for file, filepath in zip(fileList_Dir2[0], fileList_Dir2[1]):
    if file in trialFileSet:
        # A file with this name was already kept: delete this copy
        os.remove(filepath)
    else:
        # First occurrence of this name: keep it and remember the name
        trialFileSet.add(file)

# Remove empty folders.
# Walk bottom-up so children are handled before their parents: a folder that
# only contained empty subfolders is itself empty by the time it is reached.
# The live os.listdir() check guarantees os.rmdir() is only called on folders
# that are really empty (no files and no remaining subfolders).
for folderPath, _dirnames, _filenames in os.walk(rootDir, topdown=False):
    if folderPath == rootDir:
        # Never remove the root folder itself
        continue
    if not os.listdir(folderPath):
        # Delete folder
        os.rmdir(folderPath)

print("File and folder clean up complete")
print()

# Stop Timer
endTime = time.time()
executionTime = (endTime - startTime)

# Print Timing
print("Extraction Complete")
print("End Time: " + time.ctime(endTime))
print('Execution time in seconds: ' + str(executionTime))
print('Execution time in minutes: ' + str(executionTime/60))
print('Execution time in hours: ' + str(executionTime/60/60))

C:\Users\cftfda01\Documents\SBST Train IAMS Project\alarm-event-logs\sample_data_cms\batch 001 - 20201230 to 20210201\EventLogs
Commence Extraction Run: 1
Batch started on: Thu Mar 18 13:24:35 2021
Completed Extraction Run: 1
Batch completed on: Thu Mar 18 14:14:59 2021

Commence Extraction Run: 2
Batch started on: Thu Mar 18 14:15:16 2021
Completed Extraction Run: 2
Batch completed on: Thu Mar 18 15:35:35 2021

Commence file and folder clean up
File and folder clean up complete

Extraction Complete
End Time: Thu Mar 18 15:40:17 2021
Execution time in seconds: 8141.939615249634
Execution time in minutes: 135.6989935874939
Execution time in hours: 2.2616498931248983


In [13]:
# Generate list of files with directory
# (list_files walks the current working directory recursively, so this counts
# every file left in the tree at the time the cell is run)
fileList_Dir = list_files(os.getcwd())

# Inspect list of files (only the count is displayed, not the paths)
len(fileList_Dir)

3353925