In [12]:
import os

# Set base directory path
base_dir = r'C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset'
# List all projects in the base directory
projects = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
print("Projects in base directory:")
print(projects)


Projects in base directory:
['.git', '.venv', '19wu@19wu', 'AlchemyCMS@alchemy_cms', 'AsyncHttpClient@async-http-client', 'Atmosphere@atmosphere', 'BBC-News@wraith', 'BertrandBordage@django-cachalot', 'BuildCraft@BuildCraft', 'CartoDB@cartodb', 'CloudifySource@cloudify', 'CloudSlang@cloud-slang', 'DataDog@dd-agent', 'DiUS@java-faker', 'DroidPlanner@Tower', 'DSpace@DSpace', 'FasterXML@jackson-databind', 'GeoNode@geonode', 'GoClipse@goclipse', 'Graylog2@graylog2-server', 'Guake@guake', 'HubSpot@Singularity']


In [18]:
# Define the filenames to check
required_files = [
    'buildlog-data-travis.csv',
    'buildlog-data-travis.json',
    'repo-data-travis.csv',
    'repo-data-travis.json'
]

# Check for the presence of required files in each project folder
missing_files = {}
for project in projects:
    project_dir = os.path.join(base_dir, project)
    if os.path.isdir(project_dir):
        for file in required_files:
            file_path = os.path.join(project_dir, file)
            if not os.path.exists(file_path):
                if project not in missing_files:
                    missing_files[project] = []
                missing_files[project].append(file)

# Print the results
if missing_files:
    print("Missing files in the following projects:")
    for project, files in missing_files.items():
        print(f"Project: {project}")
        print("Missing files:")
        for file in files:
            print(f"  - {file}")
# Exclude '.git' and '.venv' folders from the search
excluded_projects = ['.git', '.venv']
missing_projects = [project for project in missing_files if project not in excluded_projects]

# Store the missing project names in a variable
missing_project_names = list(missing_projects)
if( missing_project_names ):
    print("\nMissing project names:")
    print(missing_project_names)
else:
    print("All required files are present in every project folder.")

Missing files in the following projects:
Project: .git
Missing files:
  - buildlog-data-travis.csv
  - buildlog-data-travis.json
  - repo-data-travis.csv
  - repo-data-travis.json
Project: .venv
Missing files:
  - buildlog-data-travis.csv
  - buildlog-data-travis.json
  - repo-data-travis.csv
  - repo-data-travis.json
Project: DataDog@dd-agent
Missing files:
  - buildlog-data-travis.csv
  - buildlog-data-travis.json
  - repo-data-travis.csv
  - repo-data-travis.json
Project: HubSpot@Singularity
Missing files:
  - buildlog-data-travis.json

Missing project names:
['DataDog@dd-agent', 'HubSpot@Singularity']


In [20]:
import pandas as pd
csv_path = r"C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\AlchemyCMS@alchemy_cms\buildlog-data-travis.csv"
file_path = r"C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\AlchemyCMS@alchemy_cms\buildlog-data-travis.json"
# Load the .csv file into a DataFrame
csv_df = pd.read_csv(csv_path)

# Load the .json file into a DataFrame
json_df = pd.read_json(file_path)

# Compare the two DataFrames
if csv_df.equals(json_df):
    print("The contents of the .csv and .json files are identical.")
else:
    print("The contents of the .csv and .json files differ.")

The contents of the .csv and .json files differ.


# Refined Data Preprocessing

## 🎯 **Phase 0: Setup & Discovery (30 mins)**  
### **Environment Setup**  
- [ ] Install dependencies (`pandas`, `numpy`, `matplotlib`, `seaborn`)  
- [ ] Create project structure: `data/`, `plots/`, `processed/`  
- [ ] Initialize logging for data quality tracking  

### **Data Inventory**  
- [ ] **Project Discovery**: List all 20 project folders  
- [ ] **File Validation**: Verify both CSVs exist per project  
- [ ] **Size Assessment**: Check file sizes and estimated record counts  
- [ ] **Sample Preview**: Load 2-3 projects to understand schema variations  



In [26]:
print("Current working directory:")
print(os.getcwd())

Current working directory:
c:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset


In [25]:
# This cell creates a project structure with directories: 'data', 'plots', and 'processed'.
# It also initializes logging for data quality tracking, storing logs in 'data_quality.log'.

import os
import logging

# Create project structure
directories = ['data', 'plots', 'processed']
for directory in directories:
    os.makedirs(directory, exist_ok=True)
print("Project structure created.")

# Initialize logging
logging.basicConfig(
    filename='data_quality.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.info("Logging initialized for data quality tracking.")
print("Logging initialized.")

Project structure created.
Logging initialized.


In [42]:
# Print the names in columns-wise format
print("\nFiltered Projects:")
for project in filtered_projects:
    print(project)

# Filter projects to exclude '.git' and '.venv'
filtered_projects = [project for project in projects if project not in ['.git', '.venv']]

# Store project names that don't have the required files
projects_missing_required_files = []

for project in filtered_projects:
    project_dir = os.path.join(base_dir, project)
    if os.path.isdir(project_dir):
        missing = False
        for file in required_files:
            file_path = os.path.join(project_dir, file)
            if not os.path.exists(file_path):
                missing = True
                break
        if missing:
            projects_missing_required_files.append(project)

print("\nProjects missing required files:")
print(projects_missing_required_files)





Filtered Projects:
19wu@19wu
AlchemyCMS@alchemy_cms
AsyncHttpClient@async-http-client
Atmosphere@atmosphere
BBC-News@wraith
BertrandBordage@django-cachalot
BuildCraft@BuildCraft
CartoDB@cartodb
CloudifySource@cloudify
CloudSlang@cloud-slang
DataDog@dd-agent
DiUS@java-faker
DroidPlanner@Tower
DSpace@DSpace
FasterXML@jackson-databind
GeoNode@geonode
GoClipse@goclipse
Graylog2@graylog2-server
Guake@guake
HubSpot@Singularity

Projects missing required files:
['DataDog@dd-agent', 'HubSpot@Singularity']


In [43]:
# Assess file sizes and record counts for filtered projects
file_sizes = {}
record_counts = {}

for project in filtered_projects:
    project_dir = os.path.join(base_dir, project)
    for file in required_files:
        file_path = os.path.join(project_dir, file)
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
            file_sizes[file_path] = file_size
            if file.endswith('.csv'):
                record_counts[file_path] = sum(1 for _ in open(file_path)) - 1  # Exclude header row
            elif file.endswith('.json'):
                with open(file_path, 'r') as f:
                    record_counts[file_path] = len(pd.read_json(f))

print("File sizes (in bytes):")
print(file_sizes)
print("\nEstimated record counts:")
for file_path, size in file_sizes.items():
    print(f"File: {file_path}, Size: {size} bytes")
for file_path, count in record_counts.items():
    print(f"File: {file_path}, Record Count: {count}")


File sizes (in bytes):
{'C:\\Users\\Chethan\\OneDrive\\Desktop\\Capstone-CICD\\dataset\\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\\19wu@19wu\\buildlog-data-travis.csv': 593598, 'C:\\Users\\Chethan\\OneDrive\\Desktop\\Capstone-CICD\\dataset\\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\\19wu@19wu\\buildlog-data-travis.json': 3930655, 'C:\\Users\\Chethan\\OneDrive\\Desktop\\Capstone-CICD\\dataset\\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\\19wu@19wu\\repo-data-travis.csv': 163108, 'C:\\Users\\Chethan\\OneDrive\\Desktop\\Capstone-CICD\\dataset\\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\\19wu@19wu\\repo-data-travis.json': 265581, 'C:\\Users\\Chethan\\OneDrive\\Desktop\\Capstone-CICD\\dataset\\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\\AlchemyCMS@alchemy_cms\\buildlog-data-travis.csv': 2369161, 'C:\\Users\\Chethan\\OneDrive\\Desktop\\Capstone-CICD\\dataset\\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\\AlchemyCMS@alchemy_cms\\buildlog-da

**Sample Preview**: Load 2-3 projects to understand schema variations

In [45]:
# Select 2-3 projects to preview
preview_projects = ['19wu@19wu', 'AlchemyCMS@alchemy_cms', 'AsyncHttpClient@async-http-client']

# Iterate through the selected projects and load their files
for project in preview_projects:
    project_dir = os.path.join(base_dir, project)
    print(f"\nPreviewing schemas for project: {project}")
    
    for file in required_files:
        file_path = os.path.join(project_dir, file)
        if os.path.exists(file_path):
            print(f"\nFile: {file_path}")
            if file.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif file.endswith('.json'):
                df = pd.read_json(file_path)
            
            # Display the first few rows and schema
            print("First few rows:")
            print(df.head())
            print("\nSchema:")
            print(df.info())
        else:
            print(f"File {file} does not exist in project {project}.")


Previewing schemas for project: 19wu@19wu

File: C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\19wu@19wu\buildlog-data-travis.csv
First few rows:
   tr_build_id  tr_job_id  tr_build_number  \
0     10739394   10739395             1000   
1     10739394   10739396             1000   
2     10739449   10739450             1001   
3     10739449   10739451             1001   
4     10741050   10741051             1002   

                         tr_original_commit tr_log_lan tr_log_status  \
0  e49acc12e2c99f55e03314cea930a4e812a186f9       ruby        broken   
1  e49acc12e2c99f55e03314cea930a4e812a186f9       ruby            ok   
2  d0f5f61aafb98092179b024e336784937168a3ba       ruby            ok   
3  d0f5f61aafb98092179b024e336784937168a3ba       ruby            ok   
4  119c0850a02fe992b943b64bb9e5c0bbb25688c1       ruby       unknown   

   tr_log_setup_time tr_log_analyzer tr_log_frameworks  tr_log_bool_tests_ran  \


# Let's analysise one of the repo-data-travis

In [50]:
# Load the CSV file into a DataFrame
repo_data_df = pd.read_csv(r"C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\AlchemyCMS@alchemy_cms\repo-data-travis.csv")

# List the columns
print("Columns in the CSV file:")
print(repo_data_df.columns.tolist())

missing_values = csv_df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Columns in the CSV file:
['build_id', 'commit', 'pull_req', 'branch', 'status', 'duration', 'started_at', 'jobs', 'event_type']
Missing values in each column:
tr_build_id                          0
tr_job_id                            0
tr_build_number                      0
tr_original_commit                   0
tr_log_lan                           0
tr_log_status                        0
tr_log_setup_time                10678
tr_log_analyzer                      0
tr_log_frameworks                 1364
tr_log_bool_tests_ran                0
tr_log_bool_tests_failed           857
tr_log_num_tests_ok               1364
tr_log_num_tests_failed            288
tr_log_num_tests_run              1364
tr_log_num_tests_skipped          1364
tr_log_num_test_suites_run       18619
tr_log_num_test_suites_ok        18619
tr_log_num_test_suites_failed    18619
tr_log_tests_failed              18619
tr_log_testduration               1364
tr_log_buildduration             18619
dtype: int64


In [51]:
# Calculate the percentage of missing values for each column
missing_percentage = (csv_df.isnull().sum() / len(csv_df)) * 100

# Display the results
print("Percentage of missing values in each column:")
print(missing_percentage)

Percentage of missing values in each column:
tr_build_id                        0.000000
tr_job_id                          0.000000
tr_build_number                    0.000000
tr_original_commit                 0.000000
tr_log_lan                         0.000000
tr_log_status                      0.000000
tr_log_setup_time                 57.350019
tr_log_analyzer                    0.000000
tr_log_frameworks                  7.325850
tr_log_bool_tests_ran              0.000000
tr_log_bool_tests_failed           4.602825
tr_log_num_tests_ok                7.325850
tr_log_num_tests_failed            1.546807
tr_log_num_tests_run               7.325850
tr_log_num_tests_skipped           7.325850
tr_log_num_test_suites_run       100.000000
tr_log_num_test_suites_ok        100.000000
tr_log_num_test_suites_failed    100.000000
tr_log_tests_failed              100.000000
tr_log_testduration                7.325850
tr_log_buildduration             100.000000
dtype: float64


In [52]:
# Column cleanup and feature engineering based on the decision table

# Drop columns with 100% missing values
columns_to_drop = [
    'tr_log_num_test_suites_run',
    'tr_log_num_test_suites_ok',
    'tr_log_num_test_suites_failed',
    'tr_log_tests_failed',
    'tr_log_buildduration'
]
csv_df = csv_df.drop(columns=columns_to_drop)

# Create flags for missing values
csv_df['has_setup_time'] = csv_df['tr_log_setup_time'].notnull().astype(int)
csv_df['framework_unknown'] = csv_df['tr_log_frameworks'].isnull().astype(int)
csv_df['tests_failed_unknown'] = csv_df['tr_log_bool_tests_failed'].isnull().astype(int)
csv_df['test_info_available'] = csv_df['tr_log_num_tests_ok'].notnull().astype(int)
csv_df['has_test_duration'] = csv_df['tr_log_testduration'].notnull().astype(int)

# Impute missing values
csv_df['tr_log_frameworks'] = csv_df['tr_log_frameworks'].fillna('unknown')
csv_df['tr_log_bool_tests_failed'] = csv_df['tr_log_bool_tests_failed'].fillna(0)
csv_df['tr_log_num_tests_ok'] = csv_df['tr_log_num_tests_ok'].fillna(0)
csv_df['tr_log_num_tests_failed'] = csv_df['tr_log_num_tests_failed'].fillna(0)
csv_df['tr_log_num_tests_run'] = csv_df['tr_log_num_tests_run'].fillna(0)
csv_df['tr_log_num_tests_skipped'] = csv_df['tr_log_num_tests_skipped'].fillna(0)
csv_df['tr_log_testduration'] = csv_df['tr_log_testduration'].fillna(0)

# Drop optional columns if not needed
optional_columns_to_drop = ['tr_build_number', 'tr_log_analyzer', 'tr_log_num_tests_skipped']
csv_df = csv_df.drop(columns=optional_columns_to_drop)

# Display the cleaned DataFrame
print("Cleaned DataFrame:")
print(csv_df.head())

Cleaned DataFrame:
   tr_build_id  tr_job_id                        tr_original_commit  \
0      6751848    6751849  463d7e8efa420e28a9cda860865b0a62b5b9b549   
1      6751848    6751850  463d7e8efa420e28a9cda860865b0a62b5b9b549   
2      6751848    6751851  463d7e8efa420e28a9cda860865b0a62b5b9b549   
3      6751848    6751852  463d7e8efa420e28a9cda860865b0a62b5b9b549   
4      6761369    6761370  08e86c73e51735f62b2adbebfef32e9ac2c1417a   

  tr_log_lan tr_log_status  tr_log_setup_time tr_log_frameworks  \
0       ruby            ok                NaN             rspec   
1       ruby            ok                NaN             rspec   
2       ruby            ok                NaN             rspec   
3       ruby            ok                NaN             rspec   
4       ruby            ok                NaN             rspec   

   tr_log_bool_tests_ran tr_log_bool_tests_failed  tr_log_num_tests_ok  \
0                   True                    False                692.0   
1  

In [53]:
missing_percentage = (csv_df.isnull().sum() / len(csv_df)) * 100

# Display the results
print("Percentage of missing values in each column:")
print(missing_percentage)

Percentage of missing values in each column:
tr_build_id                  0.000000
tr_job_id                    0.000000
tr_original_commit           0.000000
tr_log_lan                   0.000000
tr_log_status                0.000000
tr_log_setup_time           57.350019
tr_log_frameworks            0.000000
tr_log_bool_tests_ran        0.000000
tr_log_bool_tests_failed     0.000000
tr_log_num_tests_ok          0.000000
tr_log_num_tests_failed      0.000000
tr_log_num_tests_run         0.000000
tr_log_testduration          0.000000
has_setup_time               0.000000
framework_unknown            0.000000
tests_failed_unknown         0.000000
test_info_available          0.000000
has_test_duration            0.000000
dtype: float64


In [54]:
# Define the columns for the metadata CSV
metadata_columns = [
    'tr_build_id',
    'tr_log_lan',
    'tr_log_status',
    'tr_log_frameworks',
    'tr_log_num_tests_ok',
    'tr_log_num_tests_failed',
    'tr_log_num_tests_run',
    'tr_log_testduration',
    'has_setup_time',
    'framework_unknown',
    'tests_failed_unknown',
    'test_info_available',
    'has_test_duration'
]

# Filter the DataFrame to include only the suggested columns
metadata_df = csv_df[metadata_columns]

# Save the filtered DataFrame to a new CSV file
metadata_csv_path = os.path.join(base_dir, 'repo-data-travis-metadata.csv')
metadata_df.to_csv(metadata_csv_path, index=False)

print(f"Metadata CSV file created at: {metadata_csv_path}")

Metadata CSV file created at: C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\repo-data-travis-metadata.csv


now lets create a pipeline for the entire process we did till now  
from the base dir iterate through each files {excluding .venv and .git }
inside each folder look for repo-data-travis.csv 
create a df fram for it and create a new repo-data-travis-metadate.csv with columns preprocessed and ready for ml model training later like we did for one particular csv file now

## 🧼 **Phase 2: Data Cleaning & Standardization (1.5 hours)**  
### **Critical Field Validation**  
- [ ] **Essential Columns**: Ensure `status`, `duration`, `build_id` completeness  
- [ ] **Data Type Fixes**: Standardize timestamps, numeric fields, booleans  
- [ ] **Status Normalization**: Map status variations to standard categories  

### **Missing Data Strategy**  
- [ ] **Log Fields**: Handle missing test/build log data  
    - Impute with `-1` for counts, `0` for durations  
    - Flag builds with incomplete log parsing  
- [ ] **Metadata Fields**: Address missing commit/branch information  
- [ ] **Threshold Decisions**: Drop rows with >50% missing critical features  

### **Outlier Detection**  
- [ ] **Duration Outliers**: Identify extremely long/short builds  
- [ ] **Test Count Anomalies**: Flag unrealistic test numbers  
- [ ] **Cross-Project Consistency**: Check for project-specific data quality issues 

In [58]:
import os
import pandas as pd
from tqdm import tqdm

def process_repo_data(csv_path):
    """
    Processes a repo-data-travis.csv file to create standardized metadata
    Includes status normalization, outlier detection, and missing data handling
    """
    try:
        # Load the CSV file
        csv_df = pd.read_csv(csv_path)
        
        # --- Column Cleanup ---
        # Drop columns with 100% missing values (if they exist)
        columns_to_drop = [
            'tr_log_num_test_suites_run',
            'tr_log_num_test_suites_ok',
            'tr_log_num_test_suites_failed',
            'tr_log_tests_failed',
            'tr_log_buildduration'
        ]
        columns_to_drop = [col for col in columns_to_drop if col in csv_df.columns]
        csv_df = csv_df.drop(columns=columns_to_drop)
        
        # --- Status Normalization ---
        # Map all status variations to standardized values
        status_mapping = {
            'passed': 'success',
            'failed': 'failed',
            'errored': 'failed',
            'error': 'failed',
            'canceled': 'canceled',
            'timeout': 'failed'
        }
        if 'tr_log_status' in csv_df.columns:
            csv_df['tr_log_status'] = csv_df['tr_log_status'].str.lower().map(status_mapping).fillna('unknown')
        else:
            csv_df['tr_log_status'] = 'unknown'
        
        # Convert status to categorical codes for ML
        status_codes = {
            'success': 1,
            'failed': 0,
            'canceled': -1,
            'unknown': -1
        }
        csv_df['build_status_code'] = csv_df['tr_log_status'].map(status_codes)
        
        # --- Missing Data Handling ---
        # Create flags for missing values (only for columns that exist)
        if 'tr_log_setup_time' in csv_df.columns:
            csv_df['has_setup_time'] = csv_df['tr_log_setup_time'].notnull().astype(int)
            csv_df['tr_log_setup_time'] = csv_df['tr_log_setup_time'].fillna(0)
        else:
            csv_df['has_setup_time'] = 0
            csv_df['tr_log_setup_time'] = 0
            
        if 'tr_log_frameworks' in csv_df.columns:
            csv_df['framework_unknown'] = csv_df['tr_log_frameworks'].isnull().astype(int)
            csv_df['tr_log_frameworks'] = csv_df['tr_log_frameworks'].fillna('unknown')
        else:
            csv_df['framework_unknown'] = 1
            csv_df['tr_log_frameworks'] = 'unknown'
            
        if 'tr_log_bool_tests_failed' in csv_df.columns:
            csv_df['tests_failed_unknown'] = csv_df['tr_log_bool_tests_failed'].isnull().astype(int)
            csv_df['tr_log_bool_tests_failed'] = csv_df['tr_log_bool_tests_failed'].fillna(0)
        else:
            csv_df['tests_failed_unknown'] = 1
            csv_df['tr_log_bool_tests_failed'] = 0
            
        # --- Test Data Imputation ---
        test_columns = [
            'tr_log_num_tests_ok',
            'tr_log_num_tests_failed',
            'tr_log_num_tests_run',
            'tr_log_num_tests_skipped',
            'tr_log_testduration'
        ]
        
        for col in test_columns:
            if col in csv_df.columns:
                csv_df[col] = csv_df[col].fillna(0)
                csv_df[f'{col}_missing'] = csv_df[col].isnull().astype(int)
            else:
                csv_df[col] = 0
                csv_df[f'{col}_missing'] = 1
        
        # --- Outlier Detection ---
        numeric_cols = ['tr_log_testduration', 'tr_log_setup_time', 'tr_log_num_tests_run']
        
        for col in numeric_cols:
            if col in csv_df.columns:
                # Calculate IQR
                q1 = csv_df[col].quantile(0.25)
                q3 = csv_df[col].quantile(0.75)
                iqr = q3 - q1
                
                # Define bounds
                lower_bound = q1 - (1.5 * iqr)
                upper_bound = q3 + (1.5 * iqr)
                
                # Flag outliers
                csv_df[f'{col}_outlier'] = ((csv_df[col] < lower_bound) | 
                                          (csv_df[col] > upper_bound)).astype(int)
        
        # --- Final Metadata Columns ---
        metadata_columns = [
            # Core identifiers
            'tr_build_id',
            'gh_project_name',
            
            # Status information
            'tr_log_status',
            'build_status_code',
            
            # Test results
            'tr_log_num_tests_ok',
            'tr_log_num_tests_failed',
            'tr_log_num_tests_run',
            'tr_log_num_tests_skipped',
            'tr_log_bool_tests_failed',
            
            # Timing information
            'tr_log_testduration',
            'tr_log_setup_time',
            
            # Data quality flags
            'has_setup_time',
            'framework_unknown',
            'tests_failed_unknown',
            'tr_log_num_tests_ok_missing',
            'tr_log_num_tests_failed_missing',
            'tr_log_num_tests_run_missing',
            'tr_log_testduration_missing',
            
            # Outlier flags
            'tr_log_testduration_outlier',
            'tr_log_setup_time_outlier',
            'tr_log_num_tests_run_outlier'
        ]
        
        # Ensure all columns exist (add with defaults if missing)
        for col in metadata_columns:
            if col not in csv_df.columns:
                if 'missing' in col:
                    csv_df[col] = 1
                elif 'outlier' in col:
                    csv_df[col] = 0
                elif col == 'gh_project_name':
                    csv_df[col] = os.path.basename(os.path.dirname(csv_path))
                else:
                    csv_df[col] = 0 if col.isnumeric() else 'unknown'
        
        # Select only our target columns
        metadata_df = csv_df[metadata_columns]
        
        # --- Save Metadata ---
        output_path = os.path.join(os.path.dirname(csv_path), 'repo-data-metadata.csv')
        metadata_df.to_csv(output_path, index=False)
        
        return True
        
    except Exception as e:
        print(f"Error processing {csv_path}: {str(e)}")
        return False

def process_all_repositories(base_dir):
    """
    Processes all repository data files in the base directory
    """
    # Exclude these directories
    exclude_dirs = {'.venv', '.git', '__pycache__'}
    
    # Count total files to process
    total_files = 0
    for root, dirs, files in os.walk(base_dir):
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        if 'repo-data-travis.csv' in files:
            total_files += 1
    
    # Process files with progress tracking
    processed_count = 0
    with tqdm(total=total_files, desc="Processing repositories") as pbar:
        for root, dirs, files in os.walk(base_dir):
            dirs[:] = [d for d in dirs if d not in exclude_dirs]
            
            if 'repo-data-travis.csv' in files:
                csv_path = os.path.join(root, 'repo-data-travis.csv')
                success = process_repo_data(csv_path)
                if success:
                    processed_count += 1
                pbar.update(1)
    
    print(f"\nProcessing complete. Successfully processed {processed_count} of {total_files} repository data files.")

if __name__ == "__main__":
    # Configuration
    base_directory = r"C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset"
    
    # Run processing
    process_all_repositories(base_directory)

Processing repositories: 100%|██████████| 19/19 [00:00<00:00, 22.50it/s]


Processing complete. Successfully processed 19 of 19 repository data files.





#### Sample data preprocessing for some random single buildlog-data-travis.csv to gain insights and later scale up for every other csv 

In [59]:
import pandas as pd

# Define the path to the new CSV file
new_csv_path = r"C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset\BBC-News@wraith\buildlog-data-travis.csv"

# Load the CSV file into a DataFrame
new_csv_df = pd.read_csv(new_csv_path)

# Display basic information about the DataFrame
print("Basic Information:")
print(new_csv_df.info())

# Display the first few rows of the DataFrame
print("\nFirst few rows:")
print(new_csv_df.head())

# Check for missing values in each column
missing_values = new_csv_df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)

# Calculate the percentage of missing values for each column
missing_percentage = (missing_values / len(new_csv_df)) * 100
print("\nPercentage of missing values in each column:")
print(missing_percentage)

# Display summary statistics for numeric columns
print("\nSummary statistics for numeric columns:")
print(new_csv_df.describe())

Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1804 entries, 0 to 1803
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   tr_build_id                    1804 non-null   int64  
 1   tr_job_id                      1804 non-null   int64  
 2   tr_build_number                1804 non-null   int64  
 3   tr_original_commit             1804 non-null   object 
 4   tr_log_lan                     1804 non-null   object 
 5   tr_log_status                  1804 non-null   object 
 6   tr_log_setup_time              939 non-null    float64
 7   tr_log_analyzer                1804 non-null   object 
 8   tr_log_frameworks              844 non-null    object 
 9   tr_log_bool_tests_ran          1804 non-null   bool   
 10  tr_log_bool_tests_failed       852 non-null    object 
 11  tr_log_num_tests_ok            844 non-null    float64
 12  tr_log_num_tests_failed      

For buildlog-data-travis.csv

In [65]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# To keep track of failures
failed_files = []

def process_single_buildlog_csv(csv_path):
    """
    Process a single buildlog-data-travis.csv file to extract metadata features.
    """
    try:
        df = pd.read_csv(csv_path)

        # --- Critical Field Validation ---
        essential_columns = ['tr_build_id', 'tr_log_status']
        for col in essential_columns:
            if col not in df.columns or df[col].isnull().all():
                print(f"⚠️ Skipping {csv_path} due to missing essential column: {col}")
                return False

        # Ensure `tr_log_testduration` exists or create with 0
        if 'tr_log_testduration' not in df.columns:
            print(f"⚠️ {csv_path} missing 'tr_log_testduration'. Filling with 0.")
            df['tr_log_testduration'] = 0

        # --- Status Normalization ---
        status_mapping = {
            'passed': 'success',
            'failed': 'failed',
            'errored': 'failed',
            'error': 'failed',
            'canceled': 'canceled',
            'timeout': 'failed',
            'running': 'in_progress'
        }
        df['tr_log_status'] = df['tr_log_status'].str.lower().str.strip().map(status_mapping).fillna('unknown')

        status_codes = {
            'success': 1,
            'failed': 0,
            'canceled': -1,
            'in_progress': -2,
            'unknown': -3
        }
        df['status_code'] = df['tr_log_status'].map(status_codes)

        # --- Missing Data Flags ---
        missing_flags = {
            'setup_time': 'tr_log_setup_time',
            'frameworks': 'tr_log_frameworks',
            'tests_failed': 'tr_log_bool_tests_failed',
            'tests_ok': 'tr_log_num_tests_ok',
            'test_duration': 'tr_log_testduration'
        }

        for flag_name, col in missing_flags.items():
            df[f'missing_{flag_name}'] = df[col].isnull().astype(int) if col in df.columns else 1

        # --- Fill Missing Values (Safe Imputing) ---
        default_fill = {
            'tr_log_num_tests_ok': 0,
            'tr_log_num_tests_failed': 0,
            'tr_log_num_tests_run': 0,
            'tr_log_testduration': 0,
            'tr_log_setup_time': 0,
            'tr_log_bool_tests_failed': 0,
            'tr_log_frameworks': 'unknown',
            'tr_log_buildduration': 0
        }
        for col, default in default_fill.items():
            df[col] = df[col].fillna(default) if col in df.columns else default

        # --- Derived Features ---
        if 'tr_log_buildduration' in df.columns:
            df['build_success_per_minute'] = df['tr_log_num_tests_ok'] / (df['tr_log_buildduration'] / 60 + 1)
            df['build_success_per_minute'].replace([np.inf, -np.inf], 0, inplace=True)
        else:
            df['build_success_per_minute'] = 0

        # --- Outlier Detection ---
        for col in ['tr_log_testduration', 'tr_log_num_tests_run', 'tr_log_buildduration']:
            if col in df.columns:
                q1, q3 = df[col].quantile([0.25, 0.75])
                iqr = q3 - q1
                lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
                df[f'{col}_outlier'] = ((df[col] < lower) | (df[col] > upper)).astype(int)
            else:
                df[f'{col}_outlier'] = 0

        # --- Project Name ---
        df['gh_project_name'] = os.path.basename(os.path.dirname(csv_path))

        # --- Select Metadata Columns ---
        metadata_columns = [
            'tr_build_id', 'gh_project_name', 'tr_log_status', 'status_code',
            'tr_log_num_tests_ok', 'tr_log_num_tests_failed', 'tr_log_num_tests_run',
            'tr_log_bool_tests_failed', 'tr_log_testduration', 'tr_log_setup_time',
            'tr_log_buildduration', 'build_success_per_minute',
            'missing_setup_time', 'missing_frameworks', 'missing_tests_failed',
            'missing_tests_ok', 'missing_test_duration',
            'tr_log_testduration_outlier', 'tr_log_num_tests_run_outlier',
            'tr_log_buildduration_outlier'
        ]
        metadata_df = df[[col for col in metadata_columns if col in df.columns]]

        # --- Save Output ---
        out_path = os.path.join(os.path.dirname(csv_path), 'buildlog-metadata.csv')
        metadata_df.to_csv(out_path, index=False)
        return True

    except Exception as e:
        failed_files.append((csv_path, str(e)))
        print(f"❌ Error processing {csv_path}: {str(e)}")
        return False


def process_all_buildlogs(base_dir):
    """
    Recursively process all buildlog-data-travis.csv files under the given base directory.
    """
    exclude_dirs = {'.venv', '.git', '__pycache__'}
    total_files = 0

    for root, dirs, files in os.walk(base_dir):
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        if 'buildlog-data-travis.csv' in files:
            total_files += 1

    processed_count = 0
    with tqdm(total=total_files, desc="Processing build logs") as pbar:
        for root, dirs, files in os.walk(base_dir):
            dirs[:] = [d for d in dirs if d not in exclude_dirs]
            if 'buildlog-data-travis.csv' in files:
                csv_path = os.path.join(root, 'buildlog-data-travis.csv')
                if process_single_buildlog_csv(csv_path):
                    processed_count += 1
                pbar.update(1)

    print(f"\n✅ Processing complete. Successfully processed {processed_count} of {total_files} build log files.")

    if failed_files:
        print("\n⚠️ Failed Files:")
        for path, reason in failed_files:
            print(f"- {path} → {reason}")


# ---------- ENTRY POINT ----------
if __name__ == "__main__":
    base_directory = r"C:\Users\Chethan\OneDrive\Desktop\Capstone-CICD\dataset\MINORS_CAPSTONE-travistorrent-java-ci-build-dataset"
    process_all_buildlogs(base_directory)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['build_success_per_minute'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['build_success_per_minute'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work becaus


✅ Processing complete. Successfully processed 19 of 19 build log files.



