In [1241]:
import pandas as pd
import numpy as np
import json
import subprocess
import re
import os
import shutil
from itertools import chain


In [397]:
class ArqManipulation:
    """
    A utility class for file operations and data manipulation.
    """

    @staticmethod 
    def read_parquet_file(parquet_file_name: str) -> pd.DataFrame:
        """
        Reads a Parquet file and returns a DataFrame.

        :param parquet_file_name: Path to the Parquet file.
        :return: DataFrame with file contents.
        """
        try:
            if not os.path.exists(parquet_file_name):
                print(f"File '{parquet_file_name}' does not exist.")
                return pd.DataFrame()
            
            return pd.read_parquet(parquet_file_name)
        except Exception as e:
            raise RuntimeError(f"Error reading Parquet file '{parquet_file_name}': {e}")

    @staticmethod
    def save_df_to_parquet(df: pd.DataFrame, parquet_file_name: str):
        """
        Saves a DataFrame to a Parquet file.

        :param df: Dataframe to save.
        :param parquet_file_name: Parqueet saving path.
        """
        try:
            os.makedirs(os.path.dirname(parquet_file_name), exist_ok=True)
            df.to_parquet(parquet_file_name)
            print(f"DataFrame successfully saved to {parquet_file_name}")
        except Exception as e:
            raise RuntimeError(f"Error saving DataFrame to Parquet file '{parquet_file_name}': {e}")

    @staticmethod
    def clean_ansi_escape(base_str: str) -> str:
        """
        Removes ANSI escape values from a string.

        :param base_str: Unformmated string.
        :return: Cleaned string.
        """
        return re.sub(r'\x1B\[[0-9;]*[A-Za-z]', '', base_str)

    @staticmethod
    def parse_stdout_json(base_str: str) -> dict:
        """
        Parses JSON output from GitHub CLI after cleaning ANSI escape sequences.

        :param base_str: The raw output string from the GitHub CLI.
        :return: Parsed JSON dictionary.
        """
        try:
            cleaned = ArqManipulation.clean_ansi_escape(base_str)
            str_output = ''.join(cleaned.splitlines())
            return json.loads(str_output)
        except json.JSONDecodeError as e:
            raise e

    @staticmethod
    def json_to_df(parsed_json: dict) -> pd.DataFrame:
        """
        Converts a JSON dictionary to a sorted DataFrame with specific columns.

        :param parsed_json: Parsed JSON data.
        :return: Pandas DataFrame sorted by the 'createdAt' column.
        """
        try:
            df_json = pd.DataFrame(parsed_json)
            required_columns = ['name', 'createdAt', 'conclusion', 'status', 'databaseId', 'workflowDatabaseId']
            
            if not all(col in df_json.columns for col in required_columns):
                raise KeyError(f"Missing required columns in JSON data: {set(required_columns) - set(df_json.columns)}")

            df_json['createdAt'] = pd.to_datetime(df_json['createdAt'])
            return df_json[required_columns].sort_values(by="createdAt")
        except KeyError as e:
            raise ValueError(f"Error processing JSON to DataFrame: {e}")
        except Exception as e:
            raise RuntimeError(f"Unexpected error in json_to_df: {e}")

In [398]:
class ActionsWorkflow:
    """
    A class to extract GitHub Actions workflows using the GitHub CLI, generating a dataframe with returned data
    """

    def __init__(self, repository, query_size):
        """
        Initializes the ActionsWorkflow class.

        :param repository: GitHub repository in the format "owner/repo".
        :param query_size: Number of workflows to retrieve.
        """
        self.repository = repository
        self.json_attributes = '--json name,status,conclusion,createdAt,databaseId,workflowDatabaseId'
        self.query_size = query_size
        self.df = self.__gh_list_query__()

    def __gh_list_query__(self):
        """
        Calls the GitHub API via the GitHub CLI (`gh run list`) and retrieves
        a specified number of workflows.

        :return: A DataFrame containing the parsed workflow data.
        """
        try:
            list_command = f'gh run --repo {self.repository} list {self.json_attributes} -L {self.query_size}'
            
            output_json = subprocess.run(
                list_command, shell=True, text=True, check=True, capture_output=True
            ).stdout

            parsed_json = ArqManipulation.parse_stdout_json(output_json)
            df = ArqManipulation.json_to_df(parsed_json)

            ArqManipulation.save_df_to_parquet(df = df, parquet_file_name="./bin/actionsWorflow.parquet")

            return df.set_index('name')

        except subprocess.CalledProcessError as e:
            print(f"Error executing GitHub CLI command: {e}")
            return pd.DataFrame()  # Return an empty DataFrame on error



In [None]:
class ActionsJobs:
    """
    A class to interact with GitHub Actions jobs using the GitHub CLI.
    """

    def __init__(self, repository, workflow):
        """
        Initializes the ActionsJobs class.

        :param repository: GitHub repository in the format "owner/repo".
        :param workflow: Workflow associated with the jobs.
        """
        self.repository = repository
        self.workflow = workflow  

    def __retrieve_jobs__(self, database_id: int):
        command = f'gh run --repo {self.repository} view {database_id}'
        jobs_data = subprocess.run(command, shell=True, text=True, check=True, capture_output=True).stdout

        return jobs_data

    def get_jobs(self, database_id: int) -> pd.DataFrame:
            """
            Retrieves job data from the GitHub CLI and processes it.

            :param database_id: The ID of the workflow run.
            :return: A Pandas DataFrame containing job details.
            """
            try:
                jobs_df = ArqManipulation.read_parquet_file(parquet_file_name="./bin/actionsJobs.parquet")

                if jobs_df.empty:
                    data = self.__retrieve_jobs__(database_id=database_id)
                    jobs_df = self.__clean_job_text__(data)

                    jobs_df["databaseId"] = int(database_id)

                    ArqManipulation.save_df_to_parquet(jobs_df, parquet_file_name="./bin/actionsJobs.parquet")

                elif not database_id in jobs_df['databaseId'].values:
                    data = self.__retrieve_jobs__(database_id=database_id)
                    data_df = self.__clean_job_text__(data)
                    data_df["databaseId"] = int(database_id)

                    jobs_df = pd.concat([jobs_df, data_df], ignore_index=True)

                    ArqManipulation.save_df_to_parquet(jobs_df, parquet_file_name="./bin/actionsJobs.parquet")

                return jobs_df

            except subprocess.CalledProcessError as e:
                print(f"Error executing GitHub CLI command: {e}")
                return pd.DataFrame()

            except Exception as e:
                print(f"Unexpected error: {e}")
                return pd.DataFrame()
        
    def __split_string__(self, job_list):
        """
        Splits a job string into structured components.

        :param job: The job string to split.
        :return: A list of cleaned job attributes.
        """
        jobs = []

        for job in job_list:
            delimiters = r" \| | / build in | \(ID |\| in| / cleanup in | /| in " 
            splitted_job = re.split(delimiters, job)
            splitted_job = [s.strip() for s in splitted_job if s.strip()]
            jobs.append(splitted_job)
        
        jobs.pop(0)

        return jobs

    def __build_cleaned_df__(self, data):
        # Define columns
        columns = ["conclusion", "test", "buildTime (sec)", "jobId"]
        jobs_df = pd.DataFrame(columns=columns)
        jobs_df["failedAt"] = None

        for job in data:
            if any("ID" in item and ("PASSED" in item or "FAILED" in item) for item in job):
                temp_df = pd.DataFrame(self.__split_string__(job), columns=columns)

                temp_df['buildTime (sec)'] = temp_df['buildTime (sec)'].apply(str_time_to_int)
                jobs_df = pd.concat([jobs_df, temp_df], ignore_index=True)
            
            elif any("FAILED" in item for item in job):
                failed = next(item for item in job if "FAILED" in item).split("FAILED | ")
                if not jobs_df.empty:
                    jobs_df.at[jobs_df.index[-1], "failedAt"] = failed[1]  

        jobs_df["jobId"] = jobs_df["jobId"].str.rstrip(")").astype('int')
        return jobs_df


    def __find_jobs__(self, base_str: str) -> list[str]:
        lines = base_str.splitlines()
        arr = []  # Stores grouped sections
        current_group = []  # Temporary storage for the current section

        for line in lines:
            if line.isupper() or not line.strip():  # New section (uppercase or empty line)
                if current_group:  # Avoid adding empty groups
                    arr.append(current_group)
                current_group = [line]  # Start a new group
            else:
                current_group.append(line)

        if current_group:  # Append the last group
            arr.append(current_group)

        # Filter out groups that do not start with an uppercase title
        filtered_arr = [group for group in arr if group and group[0].isupper()]
        return filtered_arr

    def __clean_job_text__(self, base_str: str) -> pd.DataFrame:
        """
        Cleans and structures GitHub job data from CLI output.

        :param base_str: Raw job text output from the GitHub CLI.
        :return: A Pandas DataFrame with structured job data.
        """
        try:
            # Remove ANSI escape sequences and unwanted characters
            ansi_cleaned = ArqManipulation.clean_ansi_escape(base_str)
            cleaned = ansi_cleaned.replace("✓", "PASSED |").replace("X", "FAILED |")

            stripped_list = self.__find_jobs__(cleaned)

            if not (x.find('JOBS') or x.find("ANNOTATIONS") for x in stripped_list):
                return pd.DataFrame()

            jobs_df = self.__build_cleaned_df__(stripped_list)

            return jobs_df

        except Exception as e:
            print(f"Error processing job text: {e}")
            return pd.DataFrame()

    

def str_time_to_int(time_str: str) -> int:
    """
    Converts a time string to seconds.
    returns: int
    """
    names = ['d', 'h', 'm', 's']
    seconds = [86400, 3600, 60, 1]

    total_time = 0

    for m, t in zip(names,seconds):
        if m in time_str:
            time_list = time_str.split(m)
            total_time +=  int(time_list[0]) * t
            time_str = time_list[1]

    return total_time


In [400]:
class ActionsArtifacts:
    """
    A class to handle downloading, retrieving, and deleting GitHub Actions artifacts.
    """

    def __init__(self, repository: str):
        """
        Initializes the ActionsArtifacts object.

        :param repository: The GitHub repository in the format "owner/repo".
        """
        self.repository = repository
        self.folder = './artifacts/'  # Default storage dir
        self.paths = self.retrieve_downloaded_artifacts() 

    def download_artifact(self, database_id: str):
        """
        Downloads an artifact from GitHub Actions using the GitHub CLI.

        :param database_id: The database ID of the artifact to download.
        """
        try:
            # Ensure the folder exists before downloading
            os.makedirs(self.folder, exist_ok=True)

            # Construct the command to download the artifact
            command = f'gh run --repo {self.repository} download {database_id} --dir {os.path.join(self.folder, str(database_id))}'

            # Execute the command
            subprocess.run(command, shell=True, text=True, check=True)
            print("Download Successful")
        except subprocess.CalledProcessError as e:
            print(f"Error during artifact download: {e}")
        except Exception as e:
            print(f"Unexpected error: {e}")

    def retrieve_downloaded_artifacts(self) -> list[str]:
        """
        Retrieves all downloaded artifacts file paths.

        :return: returns Paths of the downloaded artifacts
        """
        paths = []

        # Walk through the artifacts folder and collect all file paths
        for path, _, files in os.walk(self.folder):
            for file in files:
                paths.append(os.path.join(path, file))

        return paths

    def delete_downloaded_artifacts(self):
        """
        Deletes all downloaded artifacts recursively
        """
        try:
            shutil.rmtree(self.folder)
            if os.path.exists(self.folder):
                print("Error: Failed to delete artifacts directory.")
            else:
                print("Artifacts directory deleted successfully.")
        except FileNotFoundError:
            print("Artifacts directory not found, nothing to delete.")
        except Exception as e:
            print(f"Error while deleting artifacts: {e}")


In [None]:
class PytestArtifactLogExtractor:
    """
    A class to extract and process test status and timing information from a pytest artifact log.
    """

    def __init__(self, path: str):
        """
        Initializes the PytestArtifactLogExtractor object.

        :param path: Path to the pytest artifact log file.
        """
        self.path = path
        self.data = self.__read_file__()

    def __read_file__(self):
        """
        Reads the contents of the log file and returns it as a string.

        :return: String containing the file content.
        """
        with open(self.path, "r") as file: 
            data = file.read()

        return ArqManipulation.clean_ansi_escape(data)

    def log_to_df(self):
        """
        Parses the log file to extract test results and performance metrics.

        :return: A DataFrame combining test statuses with time metrics.
        """

        df_parquet = ArqManipulation.read_parquet_file(parquet_file_name='pytest.log.parquet')   

        databaseId = self.__extract_self_path_info__().get('databaseId').get(0)
        databaseId = int(databaseId) if databaseId else None 
    
        if not df_parquet.empty and (databaseId in df_parquet['databaseId']):
            return df_parquet

        tests, categories, failures = self.__extract_all_categories__()
        
        # Creating dataframes test status and categories
        status_df = pd.DataFrame(tests, columns=["status", "name", "category", "arguments"]).set_index('name')
        categories_df = self.__create_df__(categories)
        failures_df = pd.DataFrame(failures,columns=['name', 'category', 'arguments', 'error', 'error_details']).set_index('name')

        # Labeling the dfs
        status_df.index.name = 'pytest_tests_status'
        categories_df.index.name = 'pytest_run_times'
        failures_df.index.name = 'pytest_failures_errors'

        # Applying individual id for each table
        status_df['databaseId'] = databaseId
        categories_df['databaseId'] = databaseId
        failures_df['databaseId'] = databaseId


        return status_df, categories_df, failures_df

    def __extract_all_categories__(self):
        """
        Converts extracted timing data into DataFrames.

        :param values: A list of lists with extracted time metrics.
        :type values: list[list]
        :return: A list of DataFrames with execution time statistics.
        :rtype: list[pandas.DataFrame]
        """
        header = []
        # Filtering out irrelevant categories
        keywords = ('deselected', 'passed in', 'grand total', 'live log')

        values = self.data.splitlines()
        for value in values:
            if any(k in value for k in keywords):
                continue   
            elif re.match(r'=+|-+', value): # Divide by headers demarked by '=' or '-' (logging)
                value = value.replace("=", "")  
                value = value.replace("-", "")  
                header.append([value]) 
            else:
                # Populate each category and break in the case of the pytest-durations tables while ignoring empty values
                value = re.split(r"\s+", value) 
                if list(filter(None, value)):
                    header[-1].append(list(filter(None, value)))

        headers = [['live_log','live_log','live_log']]
        if not 'live log' in self.data:
            headers = self.__extract_test_status_names__(self.__get_list_by_name__(header, 'session')[0])
            
        categories = self.__get_list_by_name__(header, 'duration top')
        failures = self.__extract_failures_errors__(self.__get_list_by_name__(header, 'summary')[0])

        return headers, categories, failures

    def __get_list_by_name__(self, data: list, name: str):
        """
        Find the sublist containing the specified name in the first element.

        :param data: A list of sublists to search through.
        :type data: list[list]
        :param name: The name to search for in the first element of each sublist.
        :type name: str
        :return: A list of sublists where the first element matches the name.
        :rtype: list[list]
        """
        matching_sublists = []
        
        for sublist in data:
            if re.search(name, sublist[0]):  # Converte os itens para string
                matching_sublists.append(sublist)
        
        return matching_sublists

    def __extract_test_status_names__(self, data):
        """
        Extracts the status and the tests names out of the pytest log, breaking them down to a list of lists.

        :param data: A list of lines containing test results.
        :type data: list[str]
        :return: list[list[str]]: A list of lists with test names, statuses (PASSED, FAILED, ERROR), and additional details.
        """

        tests = []
        keywords = ('PASSED', 'FAILED', 'ERROR')

        for line in data:
            line = ''.join(line).strip()
            
            if any(k in line for k in keywords):
                line = re.sub(r'\[.*?\d%\]', '', line)
                parts = line.split('::', maxsplit=1)

                match = re.search(r'(PASSED|FAILED|ERROR)', parts[0])
                if match:
                    test_name = parts[0][:match.start()].strip()
                    status = match.group(0)
                else:
                    test_name, status = parts[0], None

                tmp = [test_name, status]

                if len(parts) > 1:
                    values = list(filter(None, re.split(r'\[(.*?)\]', parts[1])))
                    tmp += values
                
                while len(tmp) < 4:
                    tmp.append(None)

                tests.append(tmp)
        
        return tests

    def __extract_failures_errors__(self, data):

        """
        Extracts from the pytest log the details of tests with failures or errors cleaning the data to make it ready to a dataframe.

        :param data: A list of strings containing test results.
        :type data: list[str]
        :return: list[list]: A list of lists containing details of tests with failures and/or errors.
        """

        # Regex asks for a string, cleaning it and concatening the list
        data_str = ''.join(list(''.join(d) for d in data[1:]))
        data_str = list(filter(None, re.split(r'(FAILED|ERROR)', data_str)))

        splitted_data = []

        # Splitting test from error
        for d in data_str:
            if d and ('FAILED' or 'ERROR') not in d:
                splitted_data.append(list(filter(None, re.split(r'\[(.*?)\]-|::|(\w+):([\w=*]+)', d))))   

        return splitted_data

    def __create_df__(self, values):
        """
        Converts extracted timing information into DataFrames.

        :param values: A list of lists containing extracted time metrics.
        :return: A list of DataFrames with execution time statistics.
        """
        dfs = pd.DataFrame()
        
        for h in values:
            time_df = pd.DataFrame(h[2:], columns=h[1])

            # Converting time-related columns to datetime.time format
            time_columns = ['avg', 'min', 'total']
            for col in time_columns:
                if col in time_df.columns:
                    time_df[col] = pd.to_datetime(time_df[col], format="%H:%M:%S.%f", errors='coerce').dt.time  

            # Assigning a 'durationType' column for metric categorization
            time_df['durationType'] = h[0].replace('top', '').replace('test', '')

            dfs = pd.concat([time_df, dfs], ignore_index=True)

        if 'name' in dfs.columns:
            dfs = dfs.set_index('name') 

        return dfs

    def __extract_self_path_info__(self):
        """
        Extracts test and database ID information from the log file path.

        :return: A DataFrame containing 'test' and 'databaseId' information.
        """
        # Extract filename without extension
        stripped = self.path.split('/')[-1].split('.')
        stripped.pop()  # Remove the file extension

        # Ensure there are exactly three elements (fill missing ones with None)
        while len(stripped) < 3:
            stripped.append(None)  # Fill missing values with NaN

        # Create DataFrame
        df = pd.DataFrame([stripped], columns=['test', 'region', 'databaseId'])

        return df

    def __merge_artifact_dfs__(self, times_df, status_df):
        """
        Merges test execution time data with test status information.

        :param times_df: A list of DataFrames containing time-related data.
        :param status_df: A DataFrame containing test statuses.
        :return: A combined DataFrame containing execution metrics and test results.
        """
        databaseId_df = self.__extract_self_path_info__()  
        order = ['category', 'durationType', 'databaseId', 'status', 'num', 'avg', 'min', 'total']
        dfs = []

        for h in times_df:
            joined_df = h.join(status_df)  # Merging time metrics with test statuses

            # Adding database ID to each row
            for col in databaseId_df.columns.values:
                joined_df[col] = databaseId_df[col].values[0]  

            # Reordering columns
            joined_df = joined_df[order]  
            dfs.append(joined_df)

        return pd.concat(dfs)  


In [None]:
jobs = ActionsJobs(repo_path, workflow)
jobs.get_jobs(13269014124)


Unnamed: 0,conclusion,test,buildTime (sec),jobId,failedAt,databaseId
0,PASSED,"run_tests (cold_storage, cold_storage, ../para...",85,37044161130,,13269149127
1,PASSED,"run_tests (basic, basic, ../params.example.yam...",96,37044161606,,13269149127
2,PASSED,"run_tests (presign, presign, ../params.example...",69,37044161918,,13269149127
3,PASSED,tests-success,0,37044262983,,13269149127
4,PASSED,cleanup-tests,102,37044273275,,13269149127
5,PASSED,"extra_tests_debug (locking, ../params/br-ne1.y...",222,37031740446,,13265481700
6,PASSED,"extra_tests_debug (locking, ../params/br-se1.y...",10945,37031741305,,13265481700
7,PASSED,"extra_tests_dist (bucket_versioning, ../params...",101,37031741997,,13265481700
8,PASSED,"extra_tests_dist (acl, ../params/br-ne1.yaml, ...",323,37031742597,,13265481700
9,PASSED,"extra_tests_dist (policy, ../params/br-ne1.yam...",156,37031743174,,13265481700


### "Main"

In [402]:
repo_path = 'MagaluCloud/s3-specs'
query_size = 10

workflow = ActionsWorkflow(repository=repo_path, query_size=query_size)
workflow.df

DataFrame successfully saved to ./bin/actionsWorflow.parquet


Unnamed: 0_level_0,createdAt,conclusion,status,databaseId,workflowDatabaseId
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Pull Request Essential Tests,2025-02-11 14:42:01+00:00,success,completed,13265481705,132962917
Pull Request Extra Tests,2025-02-11 14:42:01+00:00,success,completed,13265481700,142271933
Pull Request Extra Tests,2025-02-11 17:37:25+00:00,success,completed,13269014124,142271933
Pull Request Essential Tests,2025-02-11 17:37:25+00:00,success,completed,13269014120,132962917
Pull Request Essential Tests,2025-02-11 17:39:58+00:00,success,completed,13269057739,132962917
Pull Request Extra Tests,2025-02-11 17:39:58+00:00,success,completed,13269057738,142271933
Pull Request Extra Tests,2025-02-11 17:45:26+00:00,success,completed,13269149128,142271933
Pull Request Essential Tests,2025-02-11 17:45:26+00:00,success,completed,13269149127,132962917
Pull Request Essential Tests,2025-02-11 17:52:13+00:00,success,completed,13269265728,132962917
Pull Request Extra Tests,2025-02-11 17:52:13+00:00,success,completed,13269265723,142271933


In [403]:
jobs = ActionsJobs(repo_path, workflow)
jobs.get_jobs(13269014124)


Unnamed: 0,conclusion,test,buildTime (sec),jobId,failedAt,databaseId
0,PASSED,"run_tests (cold_storage, cold_storage, ../para...",85,37044161130,,13269149127
1,PASSED,"run_tests (basic, basic, ../params.example.yam...",96,37044161606,,13269149127
2,PASSED,"run_tests (presign, presign, ../params.example...",69,37044161918,,13269149127
3,PASSED,tests-success,0,37044262983,,13269149127
4,PASSED,cleanup-tests,102,37044273275,,13269149127
5,PASSED,"extra_tests_debug (locking, ../params/br-ne1.y...",222,37031740446,,13265481700
6,PASSED,"extra_tests_debug (locking, ../params/br-se1.y...",10945,37031741305,,13265481700
7,PASSED,"extra_tests_dist (bucket_versioning, ../params...",101,37031741997,,13265481700
8,PASSED,"extra_tests_dist (acl, ../params/br-ne1.yaml, ...",323,37031742597,,13265481700
9,PASSED,"extra_tests_dist (policy, ../params/br-ne1.yam...",156,37031743174,,13265481700


In [1263]:
artifacts = ActionsArtifacts(repository=repo_path)
#a = artifacts.download_artifact(13269014124)
a = artifacts.retrieve_downloaded_artifacts()
a

['./artifacts/local_artifact.br_se1.123456.log',
 './artifacts/13160019050/output_artifact_not_cli_and_locking_se1.13160019050/pytest_output_not_cli_and_locking_se1.13160019050.log',
 './artifacts/13160019050/output_artifact_not_cli_and_locking_ne1.13160019050/pytest_output_not_cli_and_locking_ne1.13160019050.log',
 './artifacts/13269014124/output_artifact_policy_br.ne1.13269014124.13269014124/pytest_output_policy_br_ne1.13269014124.log',
 './artifacts/13269014124/output_artifact_locking_br.ne1.13269014124.13269014124/pytest_output_locking_br.ne1.13269014124.log',
 './artifacts/13269014124/output_artifact_bucket_versioning_br.ne1.13269014124.13269014124/pytest_output_bucket_versioning.br_ne1.13269014124.log',
 './artifacts/13269014124/output_artifact_locking_br.se1.13269014124.13269014124/pytest_output_locking_br_se1.13269014124.log',
 './artifacts/13269014124/output_artifact_acl_br.ne1.13269014124.13269014124/pytest_output_acl.br_ne1.13269014124.log']

In [1701]:
artifact = PytestArtifactLogExtractor(path = a[0])
cat, categories, failures = artifact.log_to_df()

display(cat)
display(categories)
display(failures)

File 'pytest.log.parquet' does not exist.


Unnamed: 0_level_0,status,category,arguments,databaseId
pytest_tests_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FAILED,,test_upload_multiple_objects,num=100-5-10,123456
PASSED,,test_upload_multiple_objects,num=100-3-10,123456
FAILED,,test_upload_multiple_objects,num=100-9-10,123456
PASSED,,test_upload_multiple_objects,num=100-1-10,123456
PASSED,,test_upload_multiple_objects,num=100-7-10,123456
PASSED,,test_upload_multiple_objects,num=100-6-10,123456
PASSED,,test_upload_multiple_objects,num=100-4-10,123456
PASSED,,test_download_multiple_objects,num=100-1-10,123456
PASSED,,test_upload_multiple_objects,num=100-10-10,123456
PASSED,,test_download_multiple_objects,num=100-5-10,123456


Unnamed: 0_level_0,total,num,avg,min,durationType,databaseId
pytest_run_times,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
test_download_multiple_objects,00:00:40.494736,10,00:00:03.311439,00:00:02.036198,teardown duration,123456
test_upload_multiple_objects,00:00:28.457474,10,00:00:02.639308,00:00:01.722496,teardown duration,123456
test_download_multiple_objects,00:01:06.248248,10,00:00:06.635785,00:00:03.790442,setup duration,123456
test_upload_multiple_objects,00:00:08.536476,10,00:00:00.840663,00:00:00.541395,setup duration,123456
test_download_multiple_objects,00:01:05.466381,10,00:00:06.557192,00:00:05.550107,call duration,123456
test_upload_multiple_objects,00:00:43.853014,10,00:00:04.014908,00:00:01.980053,call duration,123456
fixture_upload_multiple_objects,00:00:57.071949,10,00:00:05.958852,00:00:03.087084,fixture duration,123456
fixture_bucket_with_name,00:00:15.654320,20,00:00:00.843605,00:00:00.478260,fixture duration,123456
s3_client,00:00:02.010599,20,00:00:00.086407,00:00:00.056561,fixture duration,123456
test_params,00:00:00.029676,20,00:00:00.001610,00:00:00.000740,fixture duration,123456


Unnamed: 0_level_0,category,arguments,error,error_details,databaseId
pytest_failures_errors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
docs/multiple_objects_test.py,test_upload_multiple_objects,num=100-5-10,AssertionError,Expectsuploads100tobeequaltoobjectsinthebucket...,123456
docs/multiple_objects_test.py,test_upload_multiple_objects,num=100-9-10,AssertionError,Expectsuploads100tobeequaltoobjectsinthebucket...,123456
docs/multiple_objects_test.py,test_download_multiple_objects,num=100-3-10,AssertionError,Expectsdownloads99tobeequaltouploads100assert9...,123456
docs/multiple_objects_test.py,test_download_multiple_objects,num=100-4-10,AssertionError,Expectsdownloads99tobeequaltouploads100assert9...,123456


# Plotting

## Workflow Df

In [406]:
Faz o teste -> gera graficos com tempo e taxa de falhas por tipo de teste

Workflow -> Job -> Passos -> Resultados pytest 



SyntaxError: invalid syntax (2924319023.py, line 1)

In [None]:
import matplotlib.pyplot as plt

# Define color mapping
colors = {
    'failure': 'firebrick',
    'cancelled': 'darkgray',
    'startup_failure': 'darkorange',
    'success':  'darkgreen'
}

# Filter the DataFrame
a = workflow.df[workflow.df['status'] == 'completed']

# Get value counts of the 'conclusion' columnimport matplotlib.pyplot as plt

# Define color mapping
colors = {
    'failure': 'firebrick',
    'cancelled': 'darkgray',
    'startup_failure':'darkorange',
    'success':  'darkgreen'

}

# Filter the DataFrame
a = workflow.df[workflow.df['status'] == 'completed']

# Get value counts of the 'conclusion' column
value_counts = a['conclusion'].value_counts()

# Map colors to the categories in value_counts
bar_colors = [colors[cat] for cat in value_counts.index]

# Create a figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))  # 1 row, 2 columns

# Plot the bar chart on the first subplot
value_counts.plot.bar(color=bar_colors, ax=ax1)
ax1.set_xlabel('Conclusion')
ax1.set_ylabel('Count')
ax1.set_title('Bar Chart: Conclusion Counts')

# Plot the pie chart on the second subplot
value_counts.plot.pie(colors=bar_colors, autopct='%1.1f%%', ax=ax2)
ax2.set_ylabel('')  # Remove the y-label for the pie chart
ax2.set_title('Pie Chart: Conclusion Distribution')

# Adjust layout for better spacing
plt.tight_layout()

# Show the plots
plt.show()
value_counts = a['conclusion'].value_counts()

# Map colors to the categories in value_counts
bar_colors = [colors[cat] for cat in value_counts.index]

# Create a figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))  # 1 row, 2 columns

# Plot the bar chart on the first subplot
value_counts.plot.bar(color=bar_colors, ax=ax1)
ax1.set_xlabel('Conclusion')
ax1.set_ylabel('Count')
ax1.set_title('Bar Chart: Conclusion Counts')

# Plot the pie chart on the second subplot
value_counts.plot.pie(colors=bar_colors, autopct='%1.1f%%', ax=ax2)
ax2.set_ylabel('')  # Remove the y-label for the pie chart
ax2.set_title('Pie Chart: Conclusion Distribution')

# Adjust layout for better spacing
plt.tight_layout()

# Show the plots
plt.show()

## Jobs Df

In [None]:
jobs = ActionsJobs(repository=repo_path, workflow=workflow)
ids = workflow.df['databaseId'].unique()
all_job_dfs = [jobs.get_jobs(id)for id in ids]
jobs_df = pd.concat(all_job_dfs)
jobs_df

In [None]:
import matplotlib.pyplot as plt

def plot_failed_passed_jobs_bars(df):
    unique_names = df.groupby(['Test', 'Conclusion']).size().unstack(fill_value=0)
    test_to_number = {test: i + 1 for i, test in enumerate(df['Test'].unique())}

    # Define colors for 'FAILED' and 'PASSED'
    colors = {
        'FAILED': 'firebrick',
        'PASSED': 'darkgreen'
    }

    ax = unique_names.plot.bar(color=[colors['FAILED'], colors['PASSED']], figsize=(8, 4))

    # Add labels and title
    ax.set_xlabel('Test')
    ax.set_ylabel('Count')
    ax.set_title('FAILED vs PASSED by Test')

    # Change the x-tick labels to their respective numbers
    ax.set_xticklabels([test_to_number[test] for test in unique_names.index], rotation=0)

    # Create a legend for the test numbers and names
    test_legend = [f"{num}. {test}" for test, num in test_to_number.items()]
    plt.figtext(1.05, 0.5, "\n".join(test_legend), va='center', fontsize=10, wrap=True)

    # Show the plot
    plt.tight_layout()
    plt.show()

# Example usage
#plot_failed_passed_jobs_bars(jobs_df[jobs_df['Conclusion'] == 'FAILED'])
plot_failed_passed_jobs_bars(jobs_df)