In [53]:
import os
import scipy

import pandas as pd
import polars as pl
import numpy as np

from tqdm import trange

In [26]:
dataset_path = "../../dataset/old_complete"

In [None]:
def check_dataset(dataset_folder_path):
    """
    This function checks the dataset folder and prints the total number of files within it.

    Args:
    - dataset_folder_path (str): The path to the dataset folder containing multiple metric files from different repositories.

    Returns:
    - None
    """
    folders_in_dataset = set(os.listdir(dataset_folder_path))

    # info print
    print(f"Dataset has {len(folders_in_dataset)} repos in total")
    return None

check_dataset(dataset_path)

Dataset has 100 repos in total


In [None]:
df = pd.read_csv(r"..\..\dataset\new\python-poetry_poetry_metrics.csv")
def df_transformation(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms the given DataFrame by dropping unnecessary columns.

    Args:
    - df (pd.DataFrame): Input DataFrame containing multiple columns.

    Returns:
    - pd.DataFrame: Transformed DataFrame with specified columns removed.
    """
    df.drop(["Unnamed: 0", "Name", "LongName", "Parent", "Component", "Path", "Line", "Column", "EndLine", "EndColumn", "ID"],
            axis=1, inplace=True)
    return df
trans = df_transformation(df)
columns_list = list(df.columns)

# exclude the 0 variance columns that happen in all repos
global_columns_with_variance = set()
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        file_path = os.path.join(root, file)
        df = pl.read_csv(file_path, schema_overrides={col: pl.Float32 for col in columns_list})
        df = df.drop(["", "ID","Name","LongName","Parent","Component","Path","Line","Column","EndLine","EndColumn"])
        for col in columns_list:
            if df[col].var() > 0:
                global_columns_with_variance.add(col)
METHOD_METRICS_LIST = list(global_columns_with_variance)
zero_col_list = [metric for metric in columns_list if metric not in METHOD_METRICS_LIST]

In [None]:
def get_datasets_stats(folder_path):
    """
    Gathers statistics on null and NaN values for specific metrics in CSV files within a folder structure.

    Args:
    - folder_path (str): Path to the root folder containing the datasets.

    Returns:
    - pl.DataFrame: A DataFrame containing the count of null and NaN values for each metric in the files processed,
      along with the repository name.
    """
    suffix = "_metrics.csv"
    results = []
    columns_list = METHOD_METRICS_LIST
    num_columns = len(columns_list)
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(suffix):
                file_path = os.path.join(root, file)
                df = pl.read_csv(file_path, columns=columns_list, schema_overrides=[pl.Float32]*num_columns)
                if len(df) == 0:
                    print(f"Skipping empty {file}")
                    continue
                repo_name = file.split('_metrics')[0] # only repo name
                nulls_list = df.null_count().row(0)
                try:
                    nans_list = df.select(pl.all().is_nan().sum()).row(0)
                except Exception as e:
                    print(repo_name)
                    print(df.schema)
                    print(e)
                results.append((repo_name, *nulls_list, *nans_list))

    null_colnames = [x+"_nulls" for x in columns_list]
    nan_colnames = [x+"_NANs" for x in columns_list]
    df_schema = ["repo_name", *null_colnames, *nan_colnames]
    res_df = pl.DataFrame(results, schema=df_schema, orient="row")
    return res_df

### Method metrics null analysis

In [30]:
df_null_stats_methods = get_datasets_stats(dataset_path)

In [31]:
df_null_stats_methods

repo_name,HDIF_nulls,HVOL_nulls,Complexity Metric Rules_nulls,HTRP_nulls,CLOC_nulls,WarningInfo_nulls,LLOC_nulls,DLOC_nulls,Coupling Metric Rules_nulls,NOI_nulls,HNDB_nulls,TLOC_nulls,TCLOC_nulls,HEFF_nulls,NOS_nulls,Size Metric Rules_nulls,McCC_nulls,LOC_nulls,NUMPAR_nulls,TNOS_nulls,HPL_nulls,TLLOC_nulls,HPV_nulls,Anti Pattern_nulls,HCPL_nulls,HDIF_NANs,HVOL_NANs,Complexity Metric Rules_NANs,HTRP_NANs,CLOC_NANs,WarningInfo_NANs,LLOC_NANs,DLOC_NANs,Coupling Metric Rules_NANs,NOI_NANs,HNDB_NANs,TLOC_NANs,TCLOC_NANs,HEFF_NANs,NOS_NANs,Size Metric Rules_NANs,McCC_NANs,LOC_NANs,NUMPAR_NANs,TNOS_NANs,HPL_NANs,TLLOC_NANs,HPV_NANs,Anti Pattern_NANs,HCPL_NANs
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""aio-libs_aiohttp""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""aleju_imgaug""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""ansible_ansible""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""apache_tvm""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AUTOMATIC1111_stable-diffusion…",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""unifyai_ivy""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""XX-net_XX-Net""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""yt-dlp_yt-dlp""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""ytdl-org_youtube-dl""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# sum over columns, exclude first reponame column, sum
max_sum = df_null_stats_methods.select(
    pl.all().exclude("repo_name").sum()
).row(0)[0]
max_sum

0

### Calculating correlations

In [None]:
def calculate_and_save_per_repo_correlations(folder_path):
    """
    Calculates and saves correlation matrices for each repository's metrics data.

    Args:
    - folder_path (str): Path to the root folder containing metric files for repositories.

    Returns:
    - correlations (list): List of correlation matrices (as pandas DataFrames) for each repository.
    - lengths (list): List of row counts (lengths) of the data for each repository processed.
    """
    suffix = "_metrics.csv"
    columns_list = [x for x in METHOD_METRICS_LIST]
    correlations = []
    lengths = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(suffix):
                file_path = os.path.join(root, file)
                df = pl.read_csv(file_path, columns=columns_list, schema_overrides={col: pl.Float32 for col in columns_list})
                df = df.with_columns(
                    [pl.col(col).fill_nan(0).alias(col) for col in df.columns]
                )
                df = df.with_columns(
                    [pl.col(col).fill_null(0).alias(col) for col in df.columns]
                )
                # df = df.select([col for col in df.columns if df[col].var() > 0])
                # ensure no full zero columns remain
                df = df.with_columns(
                    [pl.col(col) + np.random.normal(0, 1e-6, len(df)) for col in df.columns]
                )
                if len(df) == 0:
                    print(f"Skipping empty {file}")
                    continue
                if len(df) == 1:
                    print(f"Skipping oneline {file}")
                    continue
                repo_name = file.split('_metrics')[0] # only repo name
                try:
                    df_corr = df.corr().to_pandas() #.fill_nan(0).cast(pl.Float32) # XXX
                    os.makedirs("outputs/file_correlations", exist_ok=True)
                    filepath = f"outputs/file_correlations/{file}_correlations.csv"
                    df_corr.to_csv(filepath)
                    correlations.append(df_corr)
                    lengths.append(df.shape[0])
                except Exception as e:
                    print(repo_name)
                    print(df_corr)
                    # print(df.schema)
                    print(e)

    return correlations, lengths

In [34]:
correlations_df_list, lengths_list = calculate_and_save_per_repo_correlations(dataset_path)

In [35]:
cols = correlations_df_list[0].columns

for df in correlations_df_list:
    for col in df.columns:
        if col not in cols:
            print(col)
            print(df.head(5))
            break

In [36]:
correlations_df_list[5].head()


Unnamed: 0,CLOC,DLOC,LLOC,LOC,McCC,NOI,NOS,NUMPAR,TCLOC,TLLOC,...,Coupling Metric Rules,Size Metric Rules,HCPL,HDIF,HEFF,HNDB,HPL,HPV,HTRP,HVOL
0,1.0,0.771421,0.217951,0.451634,0.327449,0.129194,0.335043,0.201601,0.999365,0.218133,...,0.011119,0.097587,0.173317,0.232343,0.14176,0.153349,0.172176,0.198299,0.14176,0.15335
1,0.771421,1.0,0.079896,0.272275,0.228429,0.060025,0.146748,0.203328,0.7702,0.080467,...,0.018055,0.114961,0.075147,0.114098,0.059984,0.069279,0.078494,0.089293,0.059984,0.069279
2,0.217951,0.079896,1.0,0.961856,0.394928,0.220735,0.770537,0.032588,0.21796,0.999147,...,-0.009031,0.341225,0.398641,0.282946,0.365925,0.4255,0.413329,0.370809,0.365925,0.425498
3,0.451634,0.272275,0.961856,1.0,0.457715,0.236566,0.810008,0.087061,0.45194,0.961582,...,-0.00674,0.344648,0.414771,0.316283,0.387435,0.43875,0.426459,0.389311,0.387435,0.438749
4,0.327449,0.228429,0.394928,0.457715,1.0,0.21062,0.574123,0.332042,0.326557,0.395914,...,-0.021704,0.333632,0.596449,0.554672,0.682334,0.637604,0.594694,0.554893,0.682334,0.637607


In [37]:
correlations_df_list[0].columns

Index(['CLOC', 'DLOC', 'LLOC', 'LOC', 'McCC', 'NOI', 'NOS', 'NUMPAR', 'TCLOC',
       'Complexity Metric Rules', 'Coupling Metric Rules', 'Size Metric Rules',
       'HCPL', 'HDIF', 'HEFF', 'HNDB', 'HPL', 'HPV', 'HTRP', 'HVOL'],
      dtype='object')

In [38]:
lengths_list[0]

1573

In [None]:
def dataset_correlations(folder_path):
    """
    Combines datasets from multiple repositories into a single DataFrame for correlation analysis.

    Args:
    - folder_path (str): Path to the root folder containing metrics CSV files for repositories.

    Returns:
    - pl.DataFrame: A concatenated DataFrame containing data from all repositories.
    """
    suffix = "_metrics.csv"
    columns_list = [x for x in METHOD_METRICS_LIST]
    df_list = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(suffix):
                file_path = os.path.join(root, file)
                df = pl.read_csv(file_path, columns=columns_list, schema_overrides={col: pl.Float32 for col in columns_list})
                if len(df) == 0:
                    print(f"Skipping empty {file}")
                    continue
                if len(df) == 1:
                    print(f"Skipping oneline {file}")
                    continue
                df_list.append(df)

    df_all = pl.concat(df_list)
    return df_all

In [40]:
df_all_dataset = dataset_correlations(dataset_path)
df_all_dataset

CLOC,DLOC,LLOC,LOC,McCC,NOI,NOS,NUMPAR,TCLOC,TLLOC,TLOC,TNOS,WarningInfo,Anti Pattern,Complexity Metric Rules,Coupling Metric Rules,Size Metric Rules,HCPL,HDIF,HEFF,HNDB,HPL,HPV,HTRP,HVOL
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0.0,0.0,2.0,2.0,1.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,6.0,1.0,7.0,1.0,0.0,0.0,2.0,6.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,3.0,3.0,1.0,0.0,1.0,1.0,0.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,1.0,2.0,3.0,1.0,0.0,1.0,1.0,1.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,1.0,2.0,3.0,1.0,0.0,0.0,2.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,0.0,12.0,12.0,2.0,0.0,9.0,2.0,0.0,12.0,12.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,10.0,14.0,4.0,0.0,8.0,1.0,4.0,10.0,14.0,8.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,15.509775,0.00517,6.0,6.0,0.861654,15.509775
8.0,0.0,17.0,30.0,5.0,0.0,13.0,2.0,8.0,17.0,30.0,13.0,0.0,0.0,0.0,0.0,0.0,81.096512,2.647059,318.718201,0.040135,27.0,22.0,17.706566,120.404655
3.0,3.0,2.0,5.0,1.0,0.0,1.0,2.0,3.0,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
[col for col in df_all_dataset.columns if df_all_dataset[col].n_unique() == 1]

[]

In [42]:
df_all_dataset = df_all_dataset.select([col for col in df_all_dataset.columns if df_all_dataset[col].std() > 0])

df_all_corr = df_all_dataset.corr()

In [43]:
output_dir = 'outputs'
file_path = os.path.join(output_dir, 'method-all-correlations.csv')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

df_all_corr.write_csv(file_path)
print(f"File saved successfully to {file_path}")

File saved successfully to outputs\method-all-correlations.csv


In [44]:
df_all_corr

CLOC,DLOC,LLOC,LOC,McCC,NOI,NOS,NUMPAR,TCLOC,TLLOC,TLOC,TNOS,WarningInfo,Anti Pattern,Complexity Metric Rules,Coupling Metric Rules,Size Metric Rules,HCPL,HDIF,HEFF,HNDB,HPL,HPV,HTRP,HVOL
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,0.925949,0.358402,0.615358,0.347203,0.163798,0.342106,0.301332,0.99591,0.349854,0.599397,0.331333,0.415082,0.338745,0.262441,0.047716,0.417071,0.21945,0.262351,0.105025,0.175628,0.220285,0.258241,0.105025,0.175628
0.925949,1.0,0.186624,0.447534,0.202257,0.094544,0.151737,0.279526,0.919936,0.179368,0.432634,0.145315,0.288092,0.252641,0.154334,0.017789,0.30179,0.120978,0.153175,0.048763,0.091285,0.117372,0.143639,0.048763,0.091285
0.358402,0.186624,1.0,0.950789,0.642688,0.246924,0.796353,0.373064,0.361877,0.979198,0.933538,0.773239,0.695159,0.560807,0.46601,0.087687,0.683244,0.468528,0.454307,0.253668,0.402623,0.472174,0.517346,0.253668,0.402623
0.615358,0.447534,0.950789,1.0,0.666492,0.271703,0.802531,0.403813,0.61755,0.934246,0.983013,0.781773,0.724997,0.582549,0.487801,0.092378,0.710985,0.473515,0.480283,0.251204,0.401322,0.477357,0.530756,0.251204,0.401322
0.347203,0.202257,0.642688,0.666492,1.0,0.285188,0.791915,0.268293,0.349888,0.632744,0.656692,0.767182,0.647761,0.549994,0.706462,0.120567,0.545926,0.565703,0.553567,0.30192,0.469049,0.550744,0.630765,0.30192,0.469049
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.175628,0.091285,0.402623,0.401322,0.469049,0.112393,0.495992,0.144394,0.183114,0.415081,0.413036,0.504884,0.367185,0.274031,0.323266,0.052207,0.334843,0.933597,0.502085,0.848191,1.0,0.957956,0.81282,0.848191,1.0
0.220285,0.117372,0.472174,0.477357,0.550744,0.145733,0.578498,0.191706,0.228645,0.485495,0.489556,0.587013,0.435635,0.351116,0.400179,0.058119,0.390332,0.946988,0.690935,0.724138,0.957956,1.0,0.918976,0.724138,0.957956
0.258241,0.143639,0.517346,0.530756,0.630765,0.192324,0.624189,0.237951,0.268217,0.535963,0.547391,0.638926,0.480882,0.414957,0.47495,0.068326,0.418724,0.930406,0.803393,0.497699,0.81282,0.918976,1.0,0.497699,0.81282
0.105025,0.048763,0.253668,0.251204,0.30192,0.057584,0.346151,0.066667,0.109534,0.260179,0.25753,0.348645,0.213188,0.135698,0.168477,0.03297,0.201634,0.657036,0.270064,1.0,0.848191,0.724138,0.497699,1.0,0.848191


In [45]:
correlations_rows = (
    df_all_dataset.corr()
    .with_columns(pl.Series(name="index", values=df_all_dataset.columns))
    .unpivot(index = "index")
    .filter(pl.col('index') != pl.col('variable'))
)
correlations_rows

index,variable,value
str,str,f64
"""DLOC""","""CLOC""",0.925949
"""LLOC""","""CLOC""",0.358402
"""LOC""","""CLOC""",0.615358
"""McCC""","""CLOC""",0.347203
"""NOI""","""CLOC""",0.163798
…,…,…
"""HEFF""","""HVOL""",0.848191
"""HNDB""","""HVOL""",1.0
"""HPL""","""HVOL""",0.957956
"""HPV""","""HVOL""",0.81282


In [46]:
correlations_rows_important = correlations_rows.filter(pl.col('value').abs() > 0.7).sort(by='value', descending=True).gather_every(2)
correlations_rows_important.write_csv('outputs/method-all-important.csv')
correlations_rows_important

index,variable,value
str,str,f64
"""HVOL""","""HNDB""",1.0
"""HTRP""","""HEFF""",1.0
"""TCLOC""","""CLOC""",0.99591
"""LOC""","""TLOC""",0.983013
"""TLLOC""","""LLOC""",0.979198
…,…,…
"""HPL""","""HEFF""",0.724138
"""WarningInfo""","""TLLOC""",0.717936
"""TLOC""","""Size Metric Rules""",0.716519
"""Size Metric Rules""","""LOC""",0.710985


### Meta Analysis

In [59]:
def r_to_z(r_colname) -> pl.Expr:
    """
    Applies the Fisher transformation to a column of correlation coefficients in a Polars DataFrame.

    The Fisher transformation stabilizes the variance of correlation coefficients by converting
    them into z-scores.

    Args:
    - r_colname (str): The name of the column containing correlation coefficients (r values).

    Returns:
    - pl.Expr: A Polars expression that computes the Fisher-transformed z-scores.
    """
    return (0.5 * pl.Expr.log( (1 + pl.col(r_colname)) / (1 - pl.col(r_colname))))


def variance_z(n_colname) -> pl.Expr:
    """
    Computes the variance of the Fisher-transformed z-scores for correlation coefficients.

    Args:
    - n_colname (str): The column name in a Polars DataFrame representing the sample size (n).

    Returns:
    - pl.Expr: A Polars expression that calculates the variance of z-scores.
    """
    return 1/(pl.col(n_colname) - 3)

def z_to_r(z):
    """
    Converts a Fisher-transformed z-score back to the Pearson correlation coefficient (r).

    Args:
    - z (float or numpy array): The Fisher-transformed z-score(s).

    Returns:
    - float or numpy array: The corresponding Pearson correlation coefficient(s).
    """
    return (np.exp(2*z)-1)/(np.exp(2*z)+1)


def std_z(n_colname) -> pl.Expr:
    """
    Computes the standard deviation of Fisher-transformed z-scores for correlation coefficients.

    Args:
    - n_colname (str): The column name in a Polars DataFrame representing the sample size (n).

    Returns:
    - pl.Expr: A Polars expression that calculates the standard deviation of z-scores.
    """
    return pl.Expr.sqrt(variance_z(n_colname))


def making_fixed_effect_df(df):
    """
    Prepares a DataFrame with columns needed for fixed-effects meta-analysis calculations.

    Args:
    - df (Polars DataFrame): Input DataFrame containing columns such as 'correlation' and 'n' (sample size).

    Returns:
    - Polars DataFrame: A DataFrame augmented with columns used in fixed-effects meta-analysis calculations.
    """
    # transform input DataFrame by adding effect size and variance
    df_fixed = df.with_columns(
        # compute the effect size (Y) as the Fisher-transformed z-score of the 'correlation' column
        effect_size_Y=r_to_z('correlation'),
        # compute the within-study variance (V) as 1 / (n - 3), using the sample size 'n'
        variance_within_V=variance_z('n'),
    ).with_columns(
        # compute the raw weight (W), which is the inverse of the within-study variance
        raw_weight_W=1/pl.col('variance_within_V')
    ).with_columns(
        # calculate weighted effect size (WY) = W * Y
        WY=(pl.col('raw_weight_W') * pl.col('effect_size_Y')),
        # # calculate weighted squared effect size (WY^2) = W * Y^2
        WY_2=(pl.col('raw_weight_W') * (pl.col('effect_size_Y'))**2),
        # calculate squared weight (W^2)
        W_2=(pl.col('raw_weight_W')**2),
    )
    return df_fixed

def fixed_effect(df, alpha=0.05):
    """
    Conducts a fixed-effect meta-analysis on a given Polars DataFrame of correlations.

    Args:
    - df (Polars DataFrame): The input DataFrame containing correlation data.
    - alpha (float, optional): The significance level for confidence intervals and hypothesis testing (default is 0.05).

    Returns:
    - tuple: (p-value, fixed-effect correlation, lower confidence bound, upper confidence bound, significance flag)
    """
    # prepare the dataframe for meta-analysis calculations
    fixed_effect_df = making_fixed_effect_df(df)

    # calculate the fixed-effect meta-analytic z-score (fixed_effect_z)
    # this is the weighted average of the Fisher-transformed effect sizes (Y)
    # weighted sum of effect sizes (sum(WY)) divided by the sum of weights (sum(W))
    fixed_effect_z = fixed_effect_df.select(pl.sum('WY')/pl.sum('raw_weight_W')).item()

    # calculate the variance of the fixed-effect z-score (fixed_variance_z)
    # Variance is the inverse of the total weight (sum(W)).
    fixed_variance_z = fixed_effect_df.select(1/pl.sum('raw_weight_W')).item()

    # compute the std of the fixed-effect z-score (fixed_std_z)
    # std is the square root of the variance.
    fixed_std_z = np.sqrt(fixed_variance_z)

    # determine the z-value for the confidence interval
    # std_interval corresponds to the critical z-value for a two-tailed test at the given alpha.
    std_interval = scipy.stats.norm.isf(alpha/2)

    # compute the confidence interval in z-space (fixed_int_z)
    # use the standard normal critical value (std_interval) to compute the interval
    fixed_int_z = (fixed_effect_z-std_interval*fixed_std_z, fixed_effect_z+std_interval*fixed_std_z)

    # compute the test statistic z-value for hypothesis testing
    # z_value is the meta-analytic z-score divided by its standard deviation.
    z_value = fixed_effect_z/fixed_std_z

    # compute the p-value
    # the p-value is twice the area in the tail of the standard normal distribution beyond |z_value|.
    p_value = 2*scipy.stats.norm.sf(z_value)

    # convert the fixed-effect z-score back to the correlation scale (r)
    # the fixed-effect correlation coefficient is computed using the z_to_r function.
    fixed_effect_cor = z_to_r(fixed_effect_z)

    # convert the confidence interval from z-space back to the correlation scale
    # Transform the lower and upper bounds of the z-confidence interval back to r.
    fixed_int_corr = (z_to_r(fixed_int_z[0]), z_to_r(fixed_int_z[1]))
    return (p_value, fixed_effect_cor, fixed_int_corr[0], fixed_int_corr[1], p_value <= alpha)

def making_random_effects_df(df):
    """
    Prepares a DataFrame for random-effects meta-analysis by incorporating between-study variance.

    Args:
    - df (Polars DataFrame): Input DataFrame containing correlation data and sample sizes.

    Returns:
    - Polars DataFrame: A DataFrame augmented with columns required for random-effects meta-analysis.
    """
    # compute fixed-effect meta-analysis components
    # use the fixed-effect meta-analysis preparation function to calculate:
    # - Effect sizes (Y) and variances (V).
    # - Weighted sums and other fixed-effect calculations
    fixed_effect_df = making_fixed_effect_df(df)

    # calculate the Q statistic
    # Q statistic measures heterogeneity in effect sizes and is given by:
    # Q = sum(WY^2) - (sum(WY)^2 / sum(W))
    Q = fixed_effect_df.select(pl.sum('WY_2') - pl.sum('WY')**2 / pl.sum('raw_weight_W')).item()

    # calculate the C constant
    # C is a weighting term used in the random-effects model, calculated as:
    # C = sum(W) - (sum(W^2) / sum(W))
    C = fixed_effect_df.select(pl.sum('raw_weight_W') - pl.sum('W_2')/pl.sum('raw_weight_W')).item()

    # compute between-study variance (T^2)
    # T^2 is the between-study variance, also called tau-squared, and is given by:
    # T^2 = (Q - (k - 1)) / C
    # where k is the number of studies.
    T_2 = (Q - (len(fixed_effect_df)-1))/C

    fixed_effect_df = (
        # update the variance (V_star) to account for between-study variance
        # in random-effects meta-analysis, the total variance (V_star) includes both within-study
        # variance (V) and between-study variance (T^2):
        # V_star = V + T^2
        fixed_effect_df.with_columns(V_star=pl.col('variance_within_V') + T_2)
        # calculate updated weights (W_star)
        # the new weights (W_star) are the inverse of the updated variances (V_star):
        # W_star = 1 / V_star
        .with_columns(W_star=1/pl.col('V_star'))
        # compute the weighted effect sizes (W_star_Y)
        # weighted effect sizes are updated using the new weights:
        # W_star_Y = W_star * Y
        .with_columns(W_star_Y=pl.col('W_star')*pl.col('effect_size_Y'))
    )
    return fixed_effect_df

def random_effects(df, alpha=0.05):
    """
    Conducts random-effects meta-analysis on a given Polars DataFrame of correlations.

    Args:
    - df (Polars DataFrame): Input DataFrame containing correlation data and sample sizes.
    - alpha (float, optional): The significance level for confidence intervals and hypothesis testing (default is 0.05).

    Returns:
    - tuple: (p-value, pooled effect size, lower CI, upper CI, significance flag)
    """
    # Prepare the DataFrame for random-effects meta-analysis
    # the function 'making_random_effects_df' computes key components, including:
    # - Between-study variance (T^2).
    # - Updated variances (V*), weights (W*), and weighted effect sizes (W*Y).
    random_effects_df = making_random_effects_df(df)

    # calculate the pooled effect size in z-space
    # the random-effects pooled effect size (random_effect_z) is the weighted average
    # of Fisher-transformed z-scores using updated weights (W*):
    # random_effect_z = sum(W*Y) / sum(W*)
    random_effect_z = random_effects_df.select(pl.sum('W_star_Y')/pl.sum('W_star')).item()

    # calculate the variance of the pooled effect size
    # variance of the random-effects pooled effect size is the inverse of the sum of weights (W*):
    # random_variance_z = 1 / sum(W*)
    random_variance_z = random_effects_df.select(1/pl.sum('W_star')).item()

    # compute the standard deviation
    # standard deviation is the square root of the variance
    random_std_z = np.sqrt(random_variance_z)

    # determine the confidence interval in z-space
    # calculate the critical z-value for the specified alpha level (two-tailed test)
    # std_interval corresponds to the critical value of the standard normal distribution
    std_interval = scipy.stats.norm.isf(alpha/2)

    # compute the confidence interval for the pooled effect size in z-space:
    # [random_effect_z - z_critical * std_dev, random_effect_z + z_critical * std_dev]
    random_int_z = (random_effect_z-std_interval*random_std_z, random_effect_z+std_interval*random_std_z)

    # perform the hypothesis testing
    # compute the test statistic (z_value) for the pooled effect size:
    # z_value = random_effect_z / std_dev
    z_value = random_effect_z/random_std_z

    # calculate the two-tailed p-value:
    # p_value = 2 * P(Z > |z_value|)
    p_value = 2*scipy.stats.norm.sf(z_value)

    # convert pooled effect size back to correlation scale
    # the pooled effect size is computed in z-space; transform it back to the correlation scale:
    # random_effect_cor = z_to_r(random_effect_z).
    random_effect_cor = z_to_r(random_effect_z)

    # transform the confidence interval back to the correlation scale:
    # random_int_corr = [z_to_r(lower_bound), z_to_r(upper_bound)].
    random_int_corr = (z_to_r(random_int_z[0]), z_to_r(random_int_z[1]))
    return (p_value, random_effect_cor, random_int_corr[0], random_int_corr[1], p_value <= alpha)

In [54]:
def calculate_Q_and_test(df, alpha=0.05):
    """
    Calculates the Q-statistic, performs a significance test for heterogeneity,
    and returns whether the test was significant.

    Args:
    - df (Polars DataFrame): Input DataFrame containing correlation data and sample sizes.
    - alpha (float, optional): Significance level for the Q-test (default is 0.05).

    Returns:
    - tuple: (Q-statistic, p-value for Q, significance flag)
    """
    # compute fixed-effect meta-analysis components
    fixed_effect_df = making_fixed_effect_df(df)

    # calculate the Q-statistic
    Q = fixed_effect_df.select(pl.sum('WY_2') - pl.sum('WY') ** 2 / pl.sum('raw_weight_W')).item()

    # degrees of freedom for Q test (k - 1)
    k = len(fixed_effect_df)  # number of studies
    df_Q = k - 1

    # calculate the p-value for the Q-statistic
    p_value_Q = 1 - scipy.stats.chi2.cdf(Q, df_Q)

    # determine significance of the test
    is_significant = p_value_Q <= alpha

    # return the Q-statistic, p-value, and significance flag
    return Q, p_value_Q, is_significant


def count_significant_Qs(correlations_list, lengths_list, alpha=0.05):
    """
    Counts the number of significant and non-significant Q-statistics
    for correlations between pairs of metrics in a list of correlation matrices.

    Args:
    - correlations_list (list of Polars DataFrame): List of correlation matrices.
    - lengths_list (list of int): List of sample sizes corresponding to each correlation matrix.
    - alpha (float, optional): Significance level for Q-test (default is 0.05).

    Returns:
    - dict: A dictionary with counts of significant and non-significant Q-statistics.
    """
    columns_names = correlations_list[0].columns  # column names of the correlation matrices
    significant_count = 0  # count of significant Q-tests
    not_significant_count = 0  # count of non-significant Q-tests

    # iterate over each unique pair of metrics (columns)
    for i in trange(len(columns_names)):
        for j in range(i + 1, len(columns_names)):
            # prepare data for the pair (Metric A, Metric B)
            data = []
            for corr, length in zip(correlations_list, lengths_list):
                # only include valid correlations for analysis
                if not corr.isna().iloc[i, j] and abs(corr.iloc[i, j]) != 1 and length > 3:
                    data.append((corr.iloc[i, j], length))

            # create a Polars DataFrame for the pair
            df_data = pl.DataFrame(data, schema=['correlation', 'n'], orient='row')

            if len(df_data) == 0:
                continue

            # calculate Q-statistic and test its significance
            _, p_value_Q, is_significant = calculate_Q_and_test(df_data, alpha=alpha)

            if is_significant:
                significant_count += 1
            else:
                not_significant_count += 1

    return {
        'significant_count': significant_count,
        'not_significant_count': not_significant_count
    }

In [None]:
def meta_analysis_correlations(correlations_list, lengths_list, fixed=True):
    """
    Performs a meta-analysis on correlations between all unique pairs of metrics
    across multiple datasets.

    Args:
    - correlations_list (list of Polars DataFrame): List of correlation matrices.
    - lengths_list (list of int): List of sample sizes corresponding to each correlation matrix.
    - fixed (bool, optional): If True, performs a fixed-effects meta-analysis; otherwise, random-effects.

    Returns:
    - Polars DataFrame: Results of the meta-analysis for each pair of metrics, including:
        - Metric A: Name of the first metric in the pair.
        - Metric B: Name of the second metric in the pair.
        - p-value: Statistical significance of the pooled correlation.
        - effect: Pooled correlation effect size.
        - interval_l: Lower bound of the confidence interval.
        - interval_r: Upper bound of the confidence interval.
        - significant: Whether the pooled correlation is significant (p-value <= alpha).
    """
    columns_names = correlations_list[0].columns
    results = []

    # iterate over all unique pairs of columns (metrics)
    for i in trange(len(columns_names)):
        for j in range(i + 1, len(columns_names)):
            # analyzing correlations between columns i and j
            data = []
            for corr, length in zip(correlations_list, lengths_list):
                # include only valid correlation data points:
                # - Not missing (not NaN).
                # - Not equal to ±1 (perfect correlation).
                # - Sample size greater than 3 (to avoid invalid variances).
                if not corr.isna().iloc[i,j] and abs(corr.iloc[i, j]) != 1 and length > 3:
                    data.append((corr.iloc[i, j], length))
            # print(data)
            df_data = pl.DataFrame(data, schema=['correlation', 'n'], orient='row')
            if fixed:
                result = fixed_effect(df_data)
            else:
                result = random_effects(df_data)
            results.append((columns_names[i], columns_names[j], *result))

    return pl.DataFrame(results, schema=['Metric A', 'Metric B', 'p-value', 'effect', 'interval_l', 'interval_r', 'significant'], orient='row')

In [56]:
count_significant_Qs(correlations_df_list, lengths_list)

100%|██████████| 25/25 [00:02<00:00, 11.06it/s]


{'significant_count': 300, 'not_significant_count': 0}

Using random effects is more justified then.

In [26]:
meta_analysis_fixed_results = meta_analysis_correlations(correlations_df_list, lengths_list, fixed=True)
meta_analysis_fixed_results.write_csv("outputs/method-meta-analysis-fixed.csv")

100%|██████████| 25/25 [00:03<00:00,  6.57it/s]


In [27]:
meta_analysis_fixed_results.filter(
    (pl.col('significant') == 1) & (pl.col('effect').abs() > 0.7)
).sort(pl.col('effect').abs(), descending=True).write_csv('outputs/method-fixed-important.csv')

In [60]:
meta_analysis_random_results = meta_analysis_correlations(correlations_df_list, lengths_list, fixed=False)
meta_analysis_random_results.write_csv("outputs/method-meta-analysis-random.csv")

100%|██████████| 25/25 [00:02<00:00, 11.43it/s]


In [29]:
meta_analysis_random_results.filter(
    (pl.col('significant') == 1) & (pl.col('effect').abs() > 0.7)
).sort(pl.col('effect').abs(), descending=True).write_csv('outputs/method-random-important.csv')

In [30]:
meta_analysis_fixed_results["p-value"].unique()

p-value
f64
0.0
3.5460000000000004e-289
6.9391e-271
1.9315000000000002e-224
2.8950999999999997e-168
1.7351000000000002e-127
1.5798e-25
4.2199e-24


In [31]:
meta_analysis_fixed_results.sort(by='p-value', descending=True).gather_every(100).head(10)

Metric A,Metric B,p-value,effect,interval_l,interval_r,significant
str,str,f64,f64,f64,f64,i64
"""NUMPAR""","""Coupling Metric Rules""",4.2199e-24,0.015867,0.012796,0.018937,1
"""McCC""","""TLOC""",0.0,0.667346,0.665639,0.669046,1
"""TLOC""","""Anti Pattern""",0.0,0.573065,0.570999,0.575124,1


In [32]:
i = 2
j = -6
# analyzing correlations between columns i and j
data = []
for corr, length in zip(correlations_df_list, lengths_list):
    if not corr.isna().iloc[i,j] and abs(corr.iloc[i, j]) != 1 and length > 3:
        data.append((corr.iloc[i, j], length))
# print(data)
df_data = pl.DataFrame(data, schema=['correlation', 'n'], orient='row')
print(df_data)

shape: (100, 2)
┌─────────────┬──────┐
│ correlation ┆ n    │
│ ---         ┆ ---  │
│ f64         ┆ i64  │
╞═════════════╪══════╡
│ 0.594921    ┆ 1573 │
│ 0.698745    ┆ 7343 │
│ 0.589581    ┆ 5560 │
│ 0.382893    ┆ 5961 │
│ 0.336824    ┆ 1052 │
│ …           ┆ …    │
│ 0.35063     ┆ 3375 │
│ 0.696698    ┆ 4099 │
│ 0.362922    ┆ 4965 │
│ 0.44018     ┆ 2916 │
│ 0.827112    ┆ 457  │
└─────────────┴──────┘


In [33]:
result = fixed_effect(df_data)
result

(0.0, 0.44587188267942246, 0.44340784677268497, 0.44832917944237216, True)

In [34]:
import pandas as pd

def find_columns_to_drop(file_path, correlation_threshold=0.9):
    df = pd.read_csv(file_path)
    correlations = df[["Metric A", "Metric B", "effect"]]
    high_correlations = correlations[correlations["effect"].abs() >= correlation_threshold]
    columns_to_drop = set()
    processed_columns = set()
    
    for _, row in high_correlations.iterrows():
        col_a, col_b = row["Metric A"], row["Metric B"]
        if col_a not in processed_columns and col_b not in processed_columns:
            columns_to_drop.add(col_b)
            processed_columns.add(col_a)
            processed_columns.add(col_b)
        elif col_a in processed_columns:
            columns_to_drop.add(col_b)
            processed_columns.add(col_b)
        elif col_b in processed_columns:
            columns_to_drop.add(col_a)
            processed_columns.add(col_a)
    
    return list(columns_to_drop)

file_path = "outputs/method-random-important.csv"
columns_to_drop = find_columns_to_drop(file_path, correlation_threshold=0.9)
print("Columns to drop:", columns_to_drop)


Columns to drop: ['HTRP', 'TCLOC', 'TNOS', 'HPL', 'LOC', 'HCPL', 'Size Metric Rules', 'HPV', 'TLLOC', 'HNDB', 'HVOL', 'TLOC']


In [35]:
empty_cols = zero_col_list

final_list_90 = empty_cols.copy()
final_list_80 = empty_cols.copy()
final_list_70 = empty_cols.copy()

final_list_90.extend(find_columns_to_drop(file_path, 0.9))
final_list_80.extend(find_columns_to_drop(file_path, 0.8))
final_list_70.extend(find_columns_to_drop(file_path, 0.7))

In [36]:
final_list_90

['CD',
 'NII',
 'NL',
 'NLE',
 'TCD',
 'Documentation Metric Rules',
 'HTRP',
 'TCLOC',
 'TNOS',
 'HPL',
 'LOC',
 'HCPL',
 'Size Metric Rules',
 'HPV',
 'TLLOC',
 'HNDB',
 'HVOL',
 'TLOC']

In [37]:
final_list_80

['CD',
 'NII',
 'NL',
 'NLE',
 'TCD',
 'Documentation Metric Rules',
 'HTRP',
 'TCLOC',
 'TNOS',
 'HPL',
 'McCC',
 'HEFF',
 'LOC',
 'HDIF',
 'HCPL',
 'Size Metric Rules',
 'HPV',
 'TLLOC',
 'NOS',
 'HNDB',
 'HVOL',
 'TLOC',
 'DLOC']

In [38]:
final_list_70

['CD',
 'NII',
 'NL',
 'NLE',
 'TCD',
 'Documentation Metric Rules',
 'HTRP',
 'TNOS',
 'HPL',
 'HEFF',
 'Anti Pattern',
 'TLLOC',
 'McCC',
 'HDIF',
 'HPV',
 'TLOC',
 'LOC',
 'Size Metric Rules',
 'HVOL',
 'DLOC',
 'TCLOC',
 'HCPL',
 'NOS',
 'HNDB']

In [39]:
print(f"With threshold 0.9 we drop {len(final_list_90)} columns, with threshold 0.8 we drop {len(final_list_80)} columns, with threshold 0.7 we drop {len(final_list_70)} columns.")

With threshold 0.9 we drop 22 columns, with threshold 0.8 we drop 27 columns, with threshold 0.7 we drop 29 columns.


In [40]:
[x for x in final_list_80 if x not in final_list_90]

['McCC', 'HEFF', 'HDIF', 'NOS', 'DLOC']

In [41]:
[x for x in final_list_70 if x not in final_list_80]

