# Code Quality Dataset - Feature Engineering Pipeline

This notebook provides a complete feature engineering pipeline for the code quality dataset:
- Data extraction and JSON parsing
- Feature engineering and transformation
- Missing value handling and outlier detection
- Normalization and encoding
- Production-ready dataset creation

In [5]:
import pandas as pd
import numpy as np
import json
import ast
import warnings
from pathlib import Path
from typing import Dict, List, Tuple, Any
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## Class Definition: CodeQualityFeatureBuilder

In [6]:
class CodeQualityFeatureBuilder:
    """Build and engineer features from raw code quality dataset."""
    
    def __init__(self, input_csv: str, output_csv: str = None):
        """
        Initialize the feature builder.
        
        Args:
            input_csv: Path to input CSV file
            output_csv: Path to output CSV file (default: input_csv_processed.csv)
        """
        self.input_csv = input_csv
        self.output_csv = output_csv or input_csv.replace('.csv', '_processed.csv')
        self.df = None
        self.original_shape = None
        
    def load_data(self) -> pd.DataFrame:
        """Load CSV data with error handling."""
        logger.info(f"Loading data from {self.input_csv}")
        try:
            self.df = pd.read_csv(self.input_csv, low_memory=False)
            self.original_shape = self.df.shape
            logger.info(f"Data loaded successfully. Shape: {self.original_shape}")
            return self.df
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise
    
    def identify_data_types(self) -> Dict[str, str]:
        """Identify and categorize columns by type."""
        type_mapping = {}
        
        for col in self.df.columns:
            if col in ['y_binary', 'unit_test_presence', 'vcs_available']:
                type_mapping[col] = 'boolean'
            elif col in ['file_path', 'vcs_top_coupled']:
                type_mapping[col] = 'categorical'
            elif col in ['coupled_file_changes', 'cross_file_call_edges', 'smells', 'pep8_examples']:
                type_mapping[col] = 'json_dict'
            elif self.df[col].dtype == 'object':
                type_mapping[col] = 'object'
            else:
                type_mapping[col] = 'numeric'
        
        return type_mapping
    
    def parse_json_columns(self):
        """Parse JSON/dict-like columns safely."""
        json_like_cols = ['coupled_file_changes', 'cross_file_call_edges', 'smells', 
                         'pep8_examples', 'indentation_irregularity', 'god_class_proxies', 'pep8_violations']
        
        for col in json_like_cols:
            if col in self.df.columns:
                logger.info(f"Parsing JSON column: {col}")
                self.df[col] = self.df[col].apply(self._safe_parse_json)
    
    @staticmethod
    def _safe_parse_json(val: Any) -> Any:
        """Safely parse JSON/dict strings."""
        if pd.isna(val) or val == '':
            return None
        if isinstance(val, (int, float)):
            return val
        if isinstance(val, (dict, list)):
            return val
        try:
            return json.loads(val)
        except (json.JSONDecodeError, ValueError):
            try:
                return ast.literal_eval(str(val))
            except (ValueError, SyntaxError):
                return None
    
    def handle_missing_values(self):
        """Handle missing and invalid values."""
        logger.info("Handling missing values...")
        
        # Count missing values
        missing_counts = self.df.isnull().sum()
        if missing_counts.sum() > 0:
            logger.info(f"Missing values found:\n{missing_counts[missing_counts > 0]}")
        
        # Fill numeric columns with median
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if self.df[col].isnull().sum() > 0:
                median_val = self.df[col].median()
                self.df[col].fillna(median_val, inplace=True)
                logger.info(f"Filled {col} with median: {median_val}")
        
        # Fill categorical columns with 'Unknown'
        categorical_cols = self.df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            if self.df[col].isnull().sum() > 0:
                self.df[col].fillna('Unknown', inplace=True)
        
        # Fill boolean columns with False
        boolean_cols = self.df.select_dtypes(include=['bool']).columns
        for col in boolean_cols:
            if self.df[col].isnull().sum() > 0:
                self.df[col].fillna(False, inplace=True)
    
    def create_derived_features(self):
        """Engineer new features from existing ones."""
        logger.info("Creating derived features...")
        
        # Code quality complexity score
        complexity_cols = ['average_cyclomatic_complexity', 'max_cyclomatic_ratio', 'mean_cyclomatic_ratio']
        self.df['complexity_score'] = self.df[complexity_cols].mean(axis=1)
        
        # Code health indicator
        self.df['code_health'] = (
            (100 - self.df['pep8_violations'].fillna(0)) * 0.3 +
            self.df['maintainability_score'].fillna(50) * 0.4 +
            (100 - self.df['comment_code_mismatch_score'].fillna(0) * 100) * 0.3
        )
        
        # Documentation quality
        self.df['doc_quality'] = (
            self.df['documentation_coverage'].fillna(0) * 0.5 +
            (100 - self.df['comment_percentage'].fillna(0)) * 0.5
        )
        
        # Testing coverage indicator
        if 'test_to_source_ratio' in self.df.columns:
            self.df['has_tests'] = (self.df['test_to_source_ratio'].fillna(0) > 0).astype(int)
        
        # Coupling complexity
        if 'inter_file_coupling' in self.df.columns and 'call_graph_density' in self.df.columns:
            self.df['coupling_complexity'] = (
                self.df['inter_file_coupling'].fillna(0) * 0.5 +
                self.df['call_graph_density'].fillna(0) * 0.5
            )
        
        # Code smell density
        self.df['smell_density'] = self.df['smells'].apply(
            lambda x: len(x) if isinstance(x, list) else 0
        ) / (self.df['lines_of_code'].fillna(1) / 100)
        
        # Effort-to-impact ratio
        self.df['effort_impact_ratio'] = (
            self.df['halstead_effort'].fillna(0) / 
            (self.df['halstead_estimated_bugs'].fillna(1) + 1)
        )
        
        # File maturity (based on age and changes)
        self.df['file_maturity'] = np.log1p(self.df['file_age_days'].fillna(0)) * \
                                   (1 + self.df['lines_added'].fillna(0) / 
                                   (self.df['lines_of_code'].fillna(1)))
        
        logger.info(f"Created 8 new features")
    
    def encode_categorical_features(self):
        """Encode categorical columns."""
        logger.info("Encoding categorical features...")
        
        # Binary encoding for boolean columns
        boolean_cols = ['unit_test_presence', 'vcs_available']
        for col in boolean_cols:
            if col in self.df.columns:
                self.df[col] = self.df[col].astype(int)
        
        # One-hot encode file_path directory (top-level project)
        if 'file_path' in self.df.columns:
            self.df['project'] = self.df['file_path'].apply(
                lambda x: str(x).split('\\')[5] if isinstance(x, str) and len(str(x).split('\\')) > 5 else 'unknown'
            )
            # Keep top 10 projects, rest as 'other'
            top_projects = self.df['project'].value_counts().head(10).index
            self.df['project'] = self.df['project'].apply(
                lambda x: x if x in top_projects else 'other'
            )
            self.df = pd.get_dummies(self.df, columns=['project'], drop_first=True)
    
    def handle_outliers(self):
        """Detect and handle outliers using IQR method."""
        logger.info("Handling outliers...")
        
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        outliers_found = {}
        
        for col in numeric_cols:
            if col.startswith('project_'):  # Skip one-hot encoded columns
                continue
            
            Q1 = self.df[col].quantile(0.25)
            Q3 = self.df[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_bound = Q1 - 3 * IQR
            upper_bound = Q3 + 3 * IQR
            
            outlier_count = ((self.df[col] < lower_bound) | (self.df[col] > upper_bound)).sum()
            
            if outlier_count > 0:
                outliers_found[col] = outlier_count
                # Cap outliers instead of removing
                self.df[col] = self.df[col].clip(lower=lower_bound, upper=upper_bound)
        
        if outliers_found:
            logger.info(f"Outliers capped: {outliers_found}")
    
    def normalize_features(self):
        """Normalize numeric features to 0-1 scale."""
        logger.info("Normalizing features...")
        
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        skip_cols = [col for col in numeric_cols if col.startswith('project_')] + \
                   ['y_binary', 'has_tests']  # Don't normalize binary target/flags
        
        for col in numeric_cols:
            if col not in skip_cols:
                min_val = self.df[col].min()
                max_val = self.df[col].max()
                
                if max_val > min_val:
                    self.df[col] = (self.df[col] - min_val) / (max_val - min_val)
    
    def remove_low_variance_features(self, threshold: float = 0.01):
        """Remove features with very low variance."""
        logger.info(f"Removing low variance features (threshold: {threshold})...")
        
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        skip_cols = [col for col in numeric_cols if col.startswith('project_')]
        
        removed = []
        for col in numeric_cols:
            if col not in skip_cols:
                variance = self.df[col].var()
                if variance < threshold:
                    removed.append(col)
                    self.df.drop(col, axis=1, inplace=True)
        
        if removed:
            logger.info(f"Removed low variance features: {removed}")
    
    def drop_unnecessary_columns(self):
        """Drop columns that are not useful for modeling."""
        logger.info("Dropping unnecessary columns...")
        
        cols_to_drop = []
        
        # Drop file_path (replaced with project feature)
        if 'file_path' in self.df.columns:
            cols_to_drop.append('file_path')
        
        # Drop raw JSON columns (features already extracted)
        json_cols = ['coupled_file_changes', 'cross_file_call_edges', 'smells', 
                    'pep8_examples', 'indentation_irregularity', 'god_class_proxies', 
                    'pep8_violations', 'vcs_top_coupled']
        for col in json_cols:
            if col in self.df.columns:
                cols_to_drop.append(col)
        
        # Drop redundant raw columns (we have encoded versions)
        redundant_cols = []
        if 'project' in self.df.columns:
            cols_to_drop.append('project')
        
        self.df.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
        logger.info(f"Dropped {len(cols_to_drop)} columns")
    
    def validate_final_data(self):
        """Validate the final dataset."""
        logger.info("Validating final dataset...")
        
        # Check for remaining NaN values
        nan_count = self.df.isnull().sum().sum()
        if nan_count > 0:
            logger.warning(f"Found {nan_count} NaN values remaining")
            # Fill remaining NaNs with 0
            self.df.fillna(0, inplace=True)
        
        # Check for infinite values
        inf_count = np.isinf(self.df.select_dtypes(include=[np.number])).sum().sum()
        if inf_count > 0:
            logger.warning(f"Found {inf_count} infinite values, replacing with 0")
            self.df = self.df.replace([np.inf, -np.inf], 0)
        
        # Validate target variable
        if 'y_binary' in self.df.columns:
            unique_targets = self.df['y_binary'].unique()
            logger.info(f"Target variable distribution: {self.df['y_binary'].value_counts().to_dict()}")
        
        logger.info(f"Final dataset shape: {self.df.shape}")
        logger.info(f"Final columns: {list(self.df.columns)}")
    
    def generate_summary_statistics(self):
        """Generate and log summary statistics."""
        logger.info("\n" + "="*60)
        logger.info("DATASET SUMMARY STATISTICS")
        logger.info("="*60)
        logger.info(f"\nOriginal shape: {self.original_shape}")
        logger.info(f"Final shape: {self.df.shape}")
        logger.info(f"\nNumeric columns: {len(self.df.select_dtypes(include=[np.number]).columns)}")
        logger.info(f"Categorical columns: {len(self.df.select_dtypes(include=['object']).columns)}")
        logger.info("="*60 + "\n")
    
    def build(self) -> pd.DataFrame:
        """Execute the complete feature building pipeline."""
        logger.info("Starting feature engineering pipeline...")
        
        # Load data
        self.load_data()
        
        # Parse JSON columns
        self.parse_json_columns()
        
        # Handle missing values
        self.handle_missing_values()
        
        # Create derived features
        self.create_derived_features()
        
        # Encode categorical features
        self.encode_categorical_features()
        
        # Handle outliers
        self.handle_outliers()
        
        # Remove low variance features
        self.remove_low_variance_features()
        
        # Drop unnecessary columns
        self.drop_unnecessary_columns()
        
        # Normalize features
        self.normalize_features()
        
        # Validate final data
        self.validate_final_data()
        
        # Generate summary
        self.generate_summary_statistics()
        
        logger.info(f"Pipeline complete! Saving to {self.output_csv}")
        
        return self.df
    
    def save(self):
        """Save processed data to CSV."""
        try:
            self.df.to_csv(self.output_csv, index=False)
            logger.info(f"Data successfully saved to {self.output_csv}")
        except Exception as e:
            logger.error(f"Error saving data: {e}")
            raise


print("✓ CodeQualityFeatureBuilder class defined")

✓ CodeQualityFeatureBuilder class defined


## Configuration and Execution

In [7]:
# Configuration - Adjust these paths as needed
input_file = "../../data/processed/dataset.csv"  # Input CSV file
output_file = "../../data/processed/dataset_processed.csv"  # Output file

print(f"Input file: {input_file}")
print(f"Output file: {output_file}")

Input file: ../../data/processed/dataset.csv
Output file: ../../data/processed/dataset_processed.csv


## Run the Pipeline

In [8]:
# Initialize the feature builder
builder = CodeQualityFeatureBuilder(input_file, output_file)

# Build features
processed_df = builder.build()

2025-12-26 01:19:58,772 - INFO - Starting feature engineering pipeline...
2025-12-26 01:19:58,774 - INFO - Loading data from ../../data/processed/dataset.csv
2025-12-26 01:19:58,850 - INFO - Data loaded successfully. Shape: (3088, 67)
2025-12-26 01:19:58,851 - INFO - Parsing JSON column: coupled_file_changes
2025-12-26 01:19:58,859 - INFO - Parsing JSON column: cross_file_call_edges
2025-12-26 01:19:59,007 - INFO - Parsing JSON column: smells
2025-12-26 01:19:59,110 - INFO - Parsing JSON column: pep8_examples
2025-12-26 01:19:59,137 - INFO - Parsing JSON column: indentation_irregularity
2025-12-26 01:19:59,194 - INFO - Parsing JSON column: god_class_proxies
2025-12-26 01:19:59,200 - INFO - Parsing JSON column: pep8_violations
2025-12-26 01:19:59,203 - INFO - Handling missing values...
2025-12-26 01:19:59,212 - INFO - Creating derived features...
2025-12-26 01:19:59,217 - INFO - Created 8 new features
2025-12-26 01:19:59,218 - INFO - Encoding categorical features...
2025-12-26 01:19:59,

## Save the Processed Dataset

In [9]:
# Save to CSV
builder.save()

print("\n" + "="*70)
print("✓ Feature engineering complete!")
print(f"✓ Output saved to: {output_file}")
print(f"✓ Ready for machine learning models")
print("="*70)

2025-12-26 01:19:59,457 - INFO - Data successfully saved to ../../data/processed/dataset_processed.csv



✓ Feature engineering complete!
✓ Output saved to: ../../data/processed/dataset_processed.csv
✓ Ready for machine learning models


## Feature Engineering Summary

### 8 Engineered Features Created:

1. **complexity_score** - Aggregated code complexity (mean of cyclomatic complexity metrics)
   - Combines: average_cyclomatic_complexity, max_cyclomatic_ratio, mean_cyclomatic_ratio

2. **code_health** - Overall health indicator
   - Weighted combination: 30% PEP8 compliance + 40% maintainability + 30% comment accuracy

3. **doc_quality** - Documentation quality score
   - Balanced: 50% documentation coverage + 50% comment percentage

4. **has_tests** - Test presence flag
   - Binary indicator: 1 if test_to_source_ratio > 0, else 0

5. **coupling_complexity** - Inter-file coupling metric
   - Weighted average: 50% inter_file_coupling + 50% call_graph_density

6. **smell_density** - Code smell frequency
   - Normalized count: number of smells per 100 lines of code

7. **effort_impact_ratio** - Development effort vs impact
   - Ratio: halstead_effort / (halstead_estimated_bugs + 1)

8. **file_maturity** - Combined age and change metrics
   - Log-age × change frequency: log(file_age_days + 1) × (1 + lines_added / LOC)

## Explore the Results

In [10]:
# Display first few rows
print("First 5 rows:")
display(processed_df.head())

print("\nDataset shape:", processed_df.shape)
print("\nColumn names:")
print(processed_df.columns.tolist())

First 5 rows:


Unnamed: 0,abbreviation_density,average_cyclomatic_complexity,avg_line_length,comment_code_mismatch_score,comment_lines,comment_percentage,decision_density,documentation_coverage,external_vs_internal_field_access_ratio,functions,...,test_to_source_ratio,total_imports,y_binary,complexity_score,code_health,doc_quality,has_tests,coupling_complexity,smell_density,effort_impact_ratio
0,0.0,0.666324,0.568106,0.0,1.0,1.0,0.623314,0.0,0.458333,0.125,...,0.0,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0]",0.718428,0.880992,0.4165,0,0.541667,0.150928,0.44942
1,0.0,0.306982,0.531561,0.0,0.25,1.0,0.326629,0.0,0.125,0.0625,...,1.0,0.230769,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",0.35237,0.932637,0.4525,1,0.083333,0.402474,0.0
2,0.333,0.204312,0.348837,0.0,0.0,0.0,0.1632,0.0,0.041667,0.0,...,0.0,0.230769,"[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0]",0.30012,0.929269,0.5,0,0.083333,0.551215,0.0
3,0.5,0.204312,0.325581,0.0,0.0,0.0,0.104,0.0,0.041667,0.0,...,0.0,0.230769,"[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]",0.27922,0.932637,0.5,0,0.083333,0.291447,0.019656
4,0.5,0.204312,0.320598,0.0,0.0,0.0,0.099429,0.0,0.041667,0.0,...,0.0,0.307692,"[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]",0.27922,0.932637,0.5,0,0.083333,0.272644,0.0



Dataset shape: (3088, 46)

Column names:
['abbreviation_density', 'average_cyclomatic_complexity', 'avg_line_length', 'comment_code_mismatch_score', 'comment_lines', 'comment_percentage', 'decision_density', 'documentation_coverage', 'external_vs_internal_field_access_ratio', 'functions', 'global_usages_total', 'globals_declared', 'halstead_difficulty', 'halstead_effort', 'halstead_estimated_bugs', 'halstead_volume', 'inter_file_coupling', 'large_parameter_list_indicator', 'lazy_class_indicator', 'lines_added', 'lines_of_code', 'long_method_indicator', 'maintainability_score', 'max_cyclomatic_ratio', 'max_line_length', 'max_lines_per_function', 'max_nesting_level', 'mean_cyclomatic_ratio', 'mean_lines_per_function', 'mean_param_entropy', 'nesting_variance', 'percent_lines_over_80', 'source_lines', 'test_files_found', 'test_function_count', 'test_lines', 'test_to_source_ratio', 'total_imports', 'y_binary', 'complexity_score', 'code_health', 'doc_quality', 'has_tests', 'coupling_complex

In [11]:
# Data types
print("Data types:")
display(processed_df.dtypes)

print("\nBasic statistics:")
display(processed_df.describe())

Data types:


abbreviation_density                       float64
average_cyclomatic_complexity              float64
avg_line_length                            float64
comment_code_mismatch_score                float64
comment_lines                              float64
comment_percentage                         float64
decision_density                           float64
documentation_coverage                     float64
external_vs_internal_field_access_ratio    float64
functions                                  float64
global_usages_total                        float64
globals_declared                           float64
halstead_difficulty                        float64
halstead_effort                            float64
halstead_estimated_bugs                    float64
halstead_volume                            float64
inter_file_coupling                        float64
large_parameter_list_indicator                bool
lazy_class_indicator                          bool
lines_added                    


Basic statistics:


Unnamed: 0,abbreviation_density,average_cyclomatic_complexity,avg_line_length,comment_code_mismatch_score,comment_lines,comment_percentage,decision_density,documentation_coverage,external_vs_internal_field_access_ratio,functions,...,test_lines,test_to_source_ratio,total_imports,complexity_score,code_health,doc_quality,has_tests,coupling_complexity,smell_density,effort_impact_ratio
count,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,...,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0
mean,0.154492,0.206129,0.402521,0.339418,0.198267,0.196313,0.209951,0.331658,0.205455,0.184545,...,0.203447,0.22641,0.235627,0.253421,0.813694,0.653175,0.349741,0.16851,0.246824,0.198678
std,0.22529,0.202331,0.172328,0.42143,0.33954,0.340474,0.242047,0.44174,0.29673,0.240025,...,0.366866,0.387395,0.22694,0.199899,0.142631,0.22139,0.476965,0.217356,0.201782,0.328418
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.075975,0.342193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.076923,0.100057,0.699478,0.5,0.0,0.0,0.112095,0.0
50%,0.053,0.101643,0.431894,0.0,0.0,0.0,0.1208,0.0,0.083333,0.125,...,0.0,0.0,0.153846,0.195622,0.898956,0.5,0.0,0.083333,0.211299,0.0
75%,0.25,0.306982,0.506645,0.846,0.25,0.25,0.381029,1.0,0.25,0.25,...,0.25,0.25,0.307692,0.325043,0.932637,0.938125,1.0,0.25,0.334071,0.25
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Check for missing values
print("Missing values:")
missing = processed_df.isnull().sum()
print(missing[missing > 0])
print(f"\nTotal missing values: {processed_df.isnull().sum().sum()}")

Missing values:
Series([], dtype: int64)

Total missing values: 0


In [13]:
# Target variable distribution
if 'y_binary' in processed_df.columns:
    print("Target variable distribution:")
    print(processed_df['y_binary'].value_counts())
    print(f"\nTarget balance:")
    print(processed_df['y_binary'].value_counts(normalize=True))

Target variable distribution:
y_binary
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]    403
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]    175
[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0]    149
[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]     97
[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]     89
                                          ... 
[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]      1
[1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]      1
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0]      1
[0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0]      1
[1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0]      1
Name: count, Length: 294, dtype: int64

Target balance:
y_binary
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]    0.130505
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]    0.056671
[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0]    0.048251
[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]    0.031412
[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]    0.028821
                                             ...   
[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]    0.000324
[1, 1, 0, 0, 0,

## Analyze Engineered Features

In [14]:
# Display engineered features statistics
engineered_features = ['complexity_score', 'code_health', 'doc_quality', 'has_tests', 
                       'coupling_complexity', 'smell_density', 'effort_impact_ratio', 'file_maturity']

existing_features = [f for f in engineered_features if f in processed_df.columns]

if existing_features:
    print("Engineered Features Statistics:")
    display(processed_df[existing_features].describe())
    
    print("\n\nEngineered Features Correlation Matrix:")
    display(processed_df[existing_features].corr())

Engineered Features Statistics:


Unnamed: 0,complexity_score,code_health,doc_quality,has_tests,coupling_complexity,smell_density,effort_impact_ratio
count,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0,3088.0
mean,0.253421,0.813694,0.653175,0.349741,0.16851,0.246824,0.198678
std,0.199899,0.142631,0.22139,0.476965,0.217356,0.201782,0.328418
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.100057,0.699478,0.5,0.0,0.0,0.112095,0.0
50%,0.195622,0.898956,0.5,0.0,0.083333,0.211299,0.0
75%,0.325043,0.932637,0.938125,1.0,0.25,0.334071,0.25
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0




Engineered Features Correlation Matrix:


Unnamed: 0,complexity_score,code_health,doc_quality,has_tests,coupling_complexity,smell_density,effort_impact_ratio
complexity_score,1.0,-0.410814,0.553501,-0.233257,0.053731,0.05048,0.253557
code_health,-0.410814,1.0,-0.614805,0.120926,-0.339119,0.220539,-0.607025
doc_quality,0.553501,-0.614805,1.0,-0.205074,-0.155969,-0.083146,0.310195
has_tests,-0.233257,0.120926,-0.205074,1.0,0.093369,-0.157772,-0.04895
coupling_complexity,0.053731,-0.339119,-0.155969,0.093369,1.0,-0.094573,0.409782
smell_density,0.05048,0.220539,-0.083146,-0.157772,-0.094573,1.0,-0.290651
effort_impact_ratio,0.253557,-0.607025,0.310195,-0.04895,0.409782,-0.290651,1.0


In [15]:
# Feature importance insights
print("\nEngineered Features Insights:")
print("="*70)

if 'complexity_score' in processed_df.columns:
    print(f"✓ complexity_score: min={processed_df['complexity_score'].min():.4f}, "
          f"max={processed_df['complexity_score'].max():.4f}, "
          f"mean={processed_df['complexity_score'].mean():.4f}")

if 'code_health' in processed_df.columns:
    print(f"✓ code_health: min={processed_df['code_health'].min():.4f}, "
          f"max={processed_df['code_health'].max():.4f}, "
          f"mean={processed_df['code_health'].mean():.4f}")

if 'doc_quality' in processed_df.columns:
    print(f"✓ doc_quality: min={processed_df['doc_quality'].min():.4f}, "
          f"max={processed_df['doc_quality'].max():.4f}, "
          f"mean={processed_df['doc_quality'].mean():.4f}")

if 'has_tests' in processed_df.columns:
    print(f"✓ has_tests distribution: {processed_df['has_tests'].value_counts().to_dict()}")

if 'coupling_complexity' in processed_df.columns:
    print(f"✓ coupling_complexity: min={processed_df['coupling_complexity'].min():.4f}, "
          f"max={processed_df['coupling_complexity'].max():.4f}, "
          f"mean={processed_df['coupling_complexity'].mean():.4f}")

if 'smell_density' in processed_df.columns:
    print(f"✓ smell_density: min={processed_df['smell_density'].min():.4f}, "
          f"max={processed_df['smell_density'].max():.4f}, "
          f"mean={processed_df['smell_density'].mean():.4f}")

if 'effort_impact_ratio' in processed_df.columns:
    print(f"✓ effort_impact_ratio: min={processed_df['effort_impact_ratio'].min():.4f}, "
          f"max={processed_df['effort_impact_ratio'].max():.4f}, "
          f"mean={processed_df['effort_impact_ratio'].mean():.4f}")

if 'file_maturity' in processed_df.columns:
    print(f"✓ file_maturity: min={processed_df['file_maturity'].min():.4f}, "
          f"max={processed_df['file_maturity'].max():.4f}, "
          f"mean={processed_df['file_maturity'].mean():.4f}")

print("="*70)


Engineered Features Insights:
✓ complexity_score: min=0.0000, max=1.0000, mean=0.2534
✓ code_health: min=0.0000, max=1.0000, mean=0.8137
✓ doc_quality: min=0.0000, max=1.0000, mean=0.6532
✓ has_tests distribution: {0: 2008, 1: 1080}
✓ coupling_complexity: min=0.0000, max=1.0000, mean=0.1685
✓ smell_density: min=0.0000, max=1.0000, mean=0.2468
✓ effort_impact_ratio: min=0.0000, max=1.0000, mean=0.1987
