### Unzip, Save Script to RepoFiles, Delete Unzipped Repo

In [None]:
import os
import shutil
import pandas as pd
import tarfile
import logging
from tqdm import tqdm  # For progress bar

# Configure logging. The only handler attached is a FileHandler, so log
# records go to the log file and are not echoed to the console.
log_file_path = 'E:/Research/parser/Adam/process_log.log'
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[logging.FileHandler(log_file_path)])

# Read CSV, skipping bad lines
df = pd.read_csv('E:/Research/parser/Adam/filtered_trivy_output.csv', on_bad_lines='skip')

# Keep one row per distinct 'Target' value.
# NOTE(review): the processing loop takes the text before ':' in 'Target' as
# the archive name, so distinct 'Target' values may still share one .tgz.
df_unique = df.drop_duplicates(subset=['Target'])

# Create the RepoFiles directory; exist_ok avoids the check-then-create race
# and makes re-running the cell harmless.
repo_files_dir = 'E:/Research/parser/Adam/RepoFiles'
os.makedirs(repo_files_dir, exist_ok=True)

# Function to delete a directory manually without using shutil.rmtree
def delete_directory_manually(directory):
    """Remove ``directory`` and everything beneath it, bottom-up.

    Files are removed first, then the (now empty) sub-directories, and finally
    ``directory`` itself. Symbolic links are unlinked rather than followed, so
    content a link points to is left untouched.

    Args:
        directory: Path of the directory tree to delete.

    Returns:
        None. Success is logged at INFO level; any failure is logged at ERROR
        level and swallowed so that a caller looping over many directories
        keeps going.
    """
    try:
        # topdown=False yields the deepest directories first, so every
        # directory is already empty by the time it is removed.
        for root, dirs, files in os.walk(directory, topdown=False):
            # Remove files
            for name in files:
                os.remove(os.path.join(root, name))
            # Remove directories
            for name in dirs:
                entry_path = os.path.join(root, name)
                if os.path.islink(entry_path):
                    # os.walk lists links to directories under `dirs` without
                    # descending into them; os.rmdir on such a link fails on
                    # POSIX, so the link itself is unlinked instead.
                    os.remove(entry_path)
                else:
                    os.rmdir(entry_path)
        # Finally, remove the root directory itself
        os.rmdir(directory)
        logging.info(f"Successfully deleted directory: {directory}")
    except Exception as e:
        logging.error(f"Error while deleting directory {directory}: {e}")

# Function to extract the tgz file and rename values.yaml
def extract_and_rename(
    tgz_path,
    source_dir="E:/Research/parser/Adam/SecurityContextRepos",
    extraction_root="E:/Research/parser/Adam/extracted",
    output_dir=None,
):
    """Extract a .tgz archive and move its values.yaml into the output directory.

    The archive ``<source_dir>/<tgz_path>`` is unpacked into
    ``<extraction_root>/<directory_name>``, where ``directory_name`` is the
    archive file name with its last ``-``-separated part removed. The
    extracted ``values.yaml`` is then moved to ``output_dir`` under the name
    ``<archive name without .tgz, '.' replaced by '-'>_values.yaml``.

    Args:
        tgz_path: Archive path relative to ``source_dir``.
        source_dir: Directory holding the archives.
        extraction_root: Directory under which archives are unpacked.
        output_dir: Destination for the renamed values.yaml. When ``None`` the
            module-level ``repo_files_dir`` is used.

    Returns:
        The extraction directory (so the caller can delete it later), or
        ``None`` if anything failed; failures are logged, not raised.
    """
    try:
        if output_dir is None:
            output_dir = repo_files_dir

        tgz_name = os.path.basename(tgz_path)
        archive_stem = tgz_name.replace('.tgz', '')

        # Drop the last '-' part (the version). A name without any '-' would
        # give an empty string and make the extraction directory collapse onto
        # the shared root, so fall back to the archive stem in that case.
        directory_name = '-'.join(tgz_name.split('-')[:-1]) or archive_stem

        # Replace periods with hyphens (the .tgz suffix is already stripped)
        output_file_name = archive_stem.replace('.', '-') + "_values.yaml"

        # Full path where the .tgz file is located
        full_tgz_path = f"{source_dir}/{tgz_path}"

        # Directory where the contents will be extracted
        extraction_dir = f"{extraction_root}/{directory_name}"

        logging.info(f"Extracting {full_tgz_path} to {extraction_dir}")
        with tarfile.open(full_tgz_path, "r:gz") as tar:
            # Refuse members that would land outside the extraction directory
            # (absolute names or '..' components).
            destination_root = os.path.realpath(extraction_dir)
            for member in tar.getmembers():
                member_target = os.path.realpath(
                    os.path.join(destination_root, member.name)
                )
                if os.path.commonpath([destination_root, member_target]) != destination_root:
                    raise ValueError(f"Unsafe path in archive: {member.name}")
            if hasattr(tarfile, "data_filter"):
                # Extraction filters exist on this interpreter: also reject
                # unsafe links and special files.
                tar.extractall(path=extraction_dir, filter="data")
            else:
                tar.extractall(path=extraction_dir)

        # Expected location: <extraction_dir>/<directory_name>/values.yaml
        values_yaml_dir = os.path.join(extraction_dir, directory_name)
        values_yaml_path = os.path.join(values_yaml_dir, "values.yaml")

        # The archive's top-level folder does not always equal directory_name
        # (e.g. a version such as "1.0.0-beta.1" contains '-'), so look one
        # level down for any folder that holds a values.yaml.
        if not os.path.exists(values_yaml_path) and os.path.isdir(extraction_dir):
            for entry in sorted(os.listdir(extraction_dir)):
                candidate = os.path.join(extraction_dir, entry, "values.yaml")
                if os.path.isfile(candidate):
                    values_yaml_path = candidate
                    break

        new_values_yaml_path = os.path.join(output_dir, output_file_name)

        if os.path.exists(values_yaml_path):
            # shutil.move also works across drives and when the destination
            # file already exists (os.rename raises on Windows in that case).
            shutil.move(values_yaml_path, new_values_yaml_path)
            logging.info(f"Renamed and moved: {values_yaml_path} -> {new_values_yaml_path}")
        else:
            logging.warning(f"values.yaml not found in {values_yaml_dir}")

        # Return the extraction directory to delete later
        return extraction_dir

    except Exception as e:
        logging.error(f"Error processing {tgz_path}: {e}")
        return None

# Collect every distinct archive exactly once, in first-seen order. Several
# 'Target' values can share the same text before ':', so de-duplicating on the
# full 'Target' string is not enough; missing values are skipped.
tgz_paths = list(dict.fromkeys(
    str(target_value).split(":")[0]  # Get the tgz part before the colon
    for target_value in df_unique['Target'].dropna()
))

# List to store the directories that need to be deleted at the end
directories_to_delete = []

# Add progress bar with tqdm
total_files = len(tgz_paths)
with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
    for tgz_path in tgz_paths:
        # Process the tgz file and collect the directory to delete later
        extraction_dir = extract_and_rename(tgz_path)
        # Different archives can map to the same extraction directory; queue
        # each directory only once so it is not reported as missing later.
        if extraction_dir and extraction_dir not in directories_to_delete:
            directories_to_delete.append(extraction_dir)

        # Update progress bar after each file
        pbar.update(1)

# After processing all files, delete all extracted directories
logging.info("Deleting all extracted directories...")
for directory in directories_to_delete:
    if os.path.exists(directory):
        delete_directory_manually(directory)
    else:
        logging.warning(f"Directory {directory} does not exist or has already been deleted.")


Processing files: 100%|██████████| 18/18 [00:00<00:00, 33.37file/s]


pip install ruamel.yaml
### Grab Target from CSV, align it to file, open file, grab Resolution from CSV, insert into script based on rules. But it needs to have a universal approach, and it's currently crashing after inserting one line.

In [None]:
import os
import pandas as pd
from ruamel.yaml import YAML

# Read the CSV again so this cell can run on its own; bad lines are skipped
df = pd.read_csv('E:/Research/parser/Adam/filtered_trivy_output.csv', on_bad_lines='skip')

# Define the directories where the files are saved
repo_files_dir = 'E:/Research/parser/Adam/RepoFiles'
remediated_repo_files_dir = 'E:/Research/parser/Adam/RemediatedRepoFiles'

# Create the RemediatedRepoFiles directory; exist_ok avoids the
# check-then-create race and makes re-running the cell harmless.
os.makedirs(remediated_repo_files_dir, exist_ok=True)

# Initialize the ruamel.yaml YAML instance (used below for both load and dump)
yaml = YAML()
yaml.preserve_quotes = True  # Preserve any quotes in the YAML

# Function to extract key-value pairs from the resolution string
def parse_resolution(resolution_line):
    """Translate a resolution sentence into a one-entry securityContext mapping.

    The first known setting name found in ``resolution_line`` decides the
    result; settings are checked in this order: allowPrivilegeEscalation,
    runAsNonRoot, runAsUser, runAsGroup, readOnlyRootFilesystem,
    capabilities.drop, capabilities.add.

    Args:
        resolution_line: Resolution text. Anything that is not a string (for
            example the NaN pandas yields for an empty cell) is treated as
            unrecognized.

    Returns:
        A dict with a single key, e.g. ``{"runAsUser": 10000}`` or
        ``{"capabilities": {"drop": ["ALL"]}}``, or ``None`` when no known
        setting is named or no integer can be read for runAsUser/runAsGroup.
    """
    import re  # local import: keeps this block self-contained

    if not isinstance(resolution_line, str):
        return None

    if "allowPrivilegeEscalation" in resolution_line:
        # True unless the text explicitly says "false".
        return {"allowPrivilegeEscalation": "false" not in resolution_line}

    if "runAsNonRoot" in resolution_line:
        return {"runAsNonRoot": "true" in resolution_line}

    for id_key in ("runAsUser", "runAsGroup"):
        if id_key in resolution_line:
            # Read the integer that follows the last '>' and ignore any
            # trailing punctuation ("> 10000." used to make int() raise).
            # NOTE(review): if the text means "strictly greater than N", the
            # returned N itself may not satisfy the rule - confirm.
            number = re.search(r"-?\d+", resolution_line.split('>')[-1])
            return {id_key: int(number.group())} if number else None

    if "readOnlyRootFilesystem" in resolution_line:
        return {"readOnlyRootFilesystem": "true" in resolution_line}

    if "capabilities.drop" in resolution_line:
        return {"capabilities": {"drop": ["ALL"]}}

    if "capabilities.add" in resolution_line:
        return {"capabilities": {"add": ["NET_BIND_SERVICE"]}}

    return None  # If the resolution type is not recognized

# Function to open and modify the YAML file based on the parsed key-value pairs
def modify_yaml_file(original_file_path, resolution_line, remediated_file_path):
    """Apply one resolution to a YAML file and write the result to a new file.

    The file at ``original_file_path`` is loaded, the setting parsed from
    ``resolution_line`` is merged into ``deployment.application.securityContext``
    (created when absent or empty), and the document is dumped to
    ``remediated_file_path``. Nothing is written when the resolution is not
    recognized or the ``deployment.application`` mapping is missing.

    Args:
        original_file_path: YAML file to read.
        resolution_line: Resolution text handed to ``parse_resolution``.
        remediated_file_path: Path the modified document is written to.

    Returns:
        None. Outcomes and errors are reported with ``print``; exceptions are
        caught so a caller looping over many rows keeps going.
    """
    try:
        # Load the YAML content using ruamel.yaml; UTF-8 is given explicitly
        # so the platform default encoding cannot garble non-ASCII text.
        with open(original_file_path, 'r', encoding='utf-8') as file:
            yaml_content = yaml.load(file)

        # Parse the resolution and get the key-value pair to modify
        resolution_data = parse_resolution(resolution_line)
        if not resolution_data:
            print(f"No recognized 'securityContext' setting found in the resolution for {original_file_path}")
            return

        # Walk down to deployment.application, tolerating an empty document or
        # keys whose value is not a mapping.
        deployment = yaml_content.get('deployment') if isinstance(yaml_content, dict) else None
        application = deployment.get('application') if isinstance(deployment, dict) else None
        if not isinstance(application, dict):
            print(f"Error: 'deployment.application' section not found in {original_file_path}")
            return

        # Create securityContext if it doesn't exist or is not a mapping
        # (e.g. a bare "securityContext:" key loads as None).
        security_context = application.get('securityContext')
        if not isinstance(security_context, dict):
            security_context = {}
            application['securityContext'] = security_context

        # Merge instead of dict.update(): update() would replace the whole
        # 'capabilities' mapping, dropping an existing 'drop' list when 'add'
        # is applied (and vice versa).
        for key, value in resolution_data.items():
            existing = security_context.get(key)
            if isinstance(value, dict) and isinstance(existing, dict):
                for sub_key, sub_value in value.items():
                    current = existing.get(sub_key)
                    if isinstance(sub_value, list) and isinstance(current, list):
                        for item in sub_value:
                            if item not in current:
                                current.append(item)
                    else:
                        existing[sub_key] = sub_value
            else:
                security_context[key] = value

        # Save the modified YAML content to the new file in RemediatedRepoFiles
        with open(remediated_file_path, 'w', encoding='utf-8') as file:
            yaml.dump(yaml_content, file)

        print(f"Modified {remediated_file_path}: Added {resolution_data} to securityContext")
    except Exception as e:
        print(f"Error modifying {original_file_path}: {e}")

# Remediated files that have been (re)written during this run. Later rows for
# the same file must build on that output; reloading the untouched original
# every time would overwrite earlier changes and keep only the last one.
written_this_run = set()

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    target = row['Target']
    if not isinstance(target, str):
        # An empty cell comes back as NaN, which has no .split()
        print(f"Skipping row {index}: 'Target' is missing")
        continue

    # Get the tgz file name from the 'Target' column
    tgz_file_name = target.split(":")[0]  # Get the .tgz part before the colon
    # Create the corresponding file name in RepoFiles
    output_file_name = tgz_file_name.replace('.tgz', '').replace('.', '-') + "_values.yaml"

    # Full path to the original file in RepoFiles
    original_file_path = os.path.join(repo_files_dir, output_file_name)

    # Full path to the new file in RemediatedRepoFiles
    remediated_file_path = os.path.join(remediated_repo_files_dir, output_file_name)

    # Check if the file exists
    if not os.path.exists(original_file_path):
        print(f"File {original_file_path} does not exist.")
        continue

    # Extract the resolution content
    resolution = row.get('Resolution', '')

    already_written = remediated_file_path in written_this_run
    source_file_path = remediated_file_path if already_written else original_file_path

    # modify_yaml_file returns nothing, so a write is detected by a changed
    # modification time. This also keeps a stale file left over from an
    # earlier run from being mistaken for output of this run.
    mtime_before = (
        os.stat(remediated_file_path).st_mtime_ns
        if os.path.exists(remediated_file_path)
        else None
    )

    # If the resolution contains a recognized securityContext setting, process it
    modify_yaml_file(source_file_path, resolution, remediated_file_path)

    if (
        not already_written
        and os.path.exists(remediated_file_path)
        and os.stat(remediated_file_path).st_mtime_ns != mtime_before
    ):
        written_this_run.add(remediated_file_path)


Modified E:/Research/parser/Adam/RemediatedRepoFiles\12factor-22-5-11_values.yaml: Added {'allowPrivilegeEscalation': False} to securityContext
Modified E:/Research/parser/Adam/RemediatedRepoFiles\12factor-22-5-11_values.yaml: Added {'capabilities': {'drop': ['ALL']}} to securityContext
No recognized 'securityContext' setting found in the resolution for E:/Research/parser/Adam/RepoFiles\12factor-22-5-11_values.yaml
Modified E:/Research/parser/Adam/RemediatedRepoFiles\12factor-22-5-11_values.yaml: Added {'runAsNonRoot': True} to securityContext
Modified E:/Research/parser/Adam/RemediatedRepoFiles\12factor-22-5-11_values.yaml: Added {'readOnlyRootFilesystem': True} to securityContext
No recognized 'securityContext' setting found in the resolution for E:/Research/parser/Adam/RepoFiles\12factor-22-5-11_values.yaml
No recognized 'securityContext' setting found in the resolution for E:/Research/parser/Adam/RepoFiles\12factor-22-5-11_values.yaml
No recognized 'securityContext' setting found i