In [12]:
import json
import os

import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
dataset_path = "dataset"

In [3]:
def check_dataset(dataset_folder_path):
    folders_in_dataset = set(os.listdir(dataset_folder_path))

    # info print
    print(f"Dataset has {len(folders_in_dataset)} repos in total")
    return None

check_dataset(dataset_path)

Dataset has 27 repos in total


In [17]:
df = pd.read_csv(f"dataset\python-poetry_poetry-Method.csv")
def df_transformation(df: pd.DataFrame) -> pd.DataFrame:
    df.drop(["Name", "LongName", "Parent", "Component", "Path", "Line", "Column", "EndLine", "EndColumn", "ID"],
            axis=1, inplace=True)
    return df
trans = df_transformation(df)
METHOD_METRICS_LIST = list(trans.columns)

In [8]:
def get_datasets_stats(folder_path, class_stats=True):
    if class_stats:
        suffix = "-Class.csv"
    else:
        suffix = "-Method.csv"
    results = []
    columns_list = [x for x in (CLASS_METRICS_LIST if class_stats else METHOD_METRICS_LIST)]
    num_columns = len(columns_list)
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(suffix):
                file_path = os.path.join(root, file)
                df = pl.read_csv(file_path, columns=columns_list, dtypes=[pl.Float32]*num_columns)
                if len(df) == 0:
                    print(f"Skipping empty {file}")
                    continue
                repo_name = file.split('-')[0] # only repo name
                nulls_list = df.null_count().row(0)
                try:
                    nans_list = df.select(pl.all().is_nan().sum()).row(0)
                except Exception as e:
                    print(repo_name)
                    print(df.schema)
                    print(e)
                results.append((repo_name, *nulls_list, *nans_list))

    null_colnames = [x+"_nulls" for x in columns_list]
    nan_colnames = [x+"_NANs" for x in columns_list]
    df_schema = ["repo_name", *null_colnames, *nan_colnames]
    res_df = pl.DataFrame(results, schema=df_schema)
    return res_df

### Method metrics null analysis

In [18]:
df_null_stats_methods = get_datasets_stats(dataset_path, class_stats=False)

  df = pl.read_csv(file_path, columns=columns_list, dtypes=[pl.Float32]*num_columns)
  res_df = pl.DataFrame(results, schema=df_schema)


In [19]:
df_null_stats_methods

repo_name,CD_nulls,CLOC_nulls,DLOC_nulls,LLOC_nulls,LOC_nulls,McCC_nulls,NII_nulls,NL_nulls,NLE_nulls,NOI_nulls,NOS_nulls,NUMPAR_nulls,TCD_nulls,TCLOC_nulls,TLLOC_nulls,TLOC_nulls,TNOS_nulls,WarningBlocker_nulls,WarningCritical_nulls,WarningInfo_nulls,WarningMajor_nulls,WarningMinor_nulls,Anti Pattern_nulls,Complexity Metric Rules_nulls,Coupling Metric Rules_nulls,Documentation Metric Rules_nulls,Size Metric Rules_nulls,CD_NANs,CLOC_NANs,DLOC_NANs,LLOC_NANs,LOC_NANs,McCC_NANs,NII_NANs,NL_NANs,NLE_NANs,NOI_NANs,NOS_NANs,NUMPAR_NANs,TCD_NANs,TCLOC_NANs,TLLOC_NANs,TLOC_NANs,TNOS_NANs,WarningBlocker_NANs,WarningCritical_NANs,WarningInfo_NANs,WarningMajor_NANs,WarningMinor_NANs,Anti Pattern_NANs,Complexity Metric Rules_NANs,Coupling Metric Rules_NANs,Documentation Metric Rules_NANs,Size Metric Rules_NANs
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""encode_django""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""explosion_spaCy""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""facebookresearch_detectron2""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""freqtrade_freqtrade""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""geekan_MetaGPT""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""python""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""python""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""ray""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""sqlmapproject_sqlmap""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# sum over columns, exclude first reponame column, sum
max_sum = df_null_stats_methods.select(
    pl.all().exclude("repo_name").sum()
).row(0)[0]
max_sum

0

### Calculating correlations

In [39]:
def calculate_and_save_per_repo_correlations(folder_path):
    suffix = "-Method.csv"
    columns_list = [x for x in METHOD_METRICS_LIST]
    num_columns = len(columns_list)
    correlations = []
    lengths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(suffix):
                file_path = os.path.join(root, file)
                df = pl.read_csv(file_path, columns=columns_list, schema_overrides={col: pl.Float32 for col in columns_list})
                df = df.with_columns(
                    [pl.col(col).fill_nan(0).alias(col) for col in df.columns]
                )
                df = df.with_columns(
                    [pl.col(col).fill_null(0).alias(col) for col in df.columns]
                )
                df = df.select([col for col in df.columns if df[col].var() > 0])
                if len(df) == 0:
                    print(f"Skipping empty {file}")
                    continue
                if len(df) == 1:
                    print(f"Skipping oneline {file}")
                    continue
                repo_name = file.split('-')[0] # only repo name
                try:
                    df_corr = df.corr().to_pandas() #.fill_nan(0).cast(pl.Float32) # XXX
                    filepath = f"{root}/correlations{suffix}"
                    df_corr.to_csv(filepath)
                    correlations.append(df_corr)
                    lengths.append(df.shape[0])
                except Exception as e:
                    print(repo_name)
                    print(df_corr)
                    # print(df.schema)
                    print(e)

    return correlations, lengths

In [40]:
correlations_df_list, lengths_list = calculate_and_save_per_repo_correlations(dataset_path)

In [43]:
correlations_df_list[0].head()


Unnamed: 0,CLOC,DLOC,LLOC,LOC,McCC,NOI,NOS,NUMPAR,TCLOC,TLLOC,TLOC,TNOS,WarningInfo,Anti Pattern,Complexity Metric Rules,Size Metric Rules
0,1.0,0.995885,0.597196,0.893689,0.483052,0.349519,0.43212,0.761423,0.999984,0.598584,0.888739,0.440221,0.817454,0.747172,0.536199,0.849381
1,0.995885,1.0,0.524926,0.85062,0.405224,0.291821,0.349372,0.736594,0.995397,0.526303,0.844806,0.357782,0.771897,0.705789,0.467658,0.809194
2,0.597196,0.524926,1.0,0.89325,0.91794,0.644668,0.949668,0.703076,0.601325,0.999804,0.897817,0.952356,0.866289,0.804567,0.866622,0.844293
3,0.893689,0.85062,0.89325,1.0,0.786475,0.559746,0.777489,0.810992,0.89601,0.894092,0.999879,0.78351,0.939945,0.863186,0.785983,0.944982
4,0.483052,0.405224,0.91794,0.786475,1.0,0.641254,0.954521,0.609482,0.487155,0.915781,0.790491,0.952203,0.83623,0.779994,0.95354,0.795239


In [44]:
lengths_list[0]

27

In [51]:
def dataset_correlations(folder_path):
    suffix = "-Method.csv"
    columns_list = [x for x in METHOD_METRICS_LIST]
    num_columns = len(columns_list)
    correlations = []
    lengths = []
    df_list = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(suffix):
                file_path = os.path.join(root, file)
                df = pl.read_csv(file_path, columns=columns_list, schema_overrides={col: pl.Float32 for col in columns_list})
                if len(df) == 0:
                    print(f"Skipping empty {file}")
                    continue
                if len(df) == 1:
                    print(f"Skipping oneline {file}")
                    continue
                df_list.append(df)

    df_all = pl.concat(df_list)
    return df_all

In [52]:
df_all_dataset = dataset_correlations(dataset_path)
df_all_dataset

CD,CLOC,DLOC,LLOC,LOC,McCC,NII,NL,NLE,NOI,NOS,NUMPAR,TCD,TCLOC,TLLOC,TLOC,TNOS,WarningBlocker,WarningCritical,WarningInfo,WarningMajor,WarningMinor,Anti Pattern,Complexity Metric Rules,Coupling Metric Rules,Documentation Metric Rules,Size Metric Rules
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0.0,2.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,1.0,0.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,3.0,3.0,2.0,5.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,3.0,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,5.0,5.0,2.0,7.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,5.0,2.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,4.0,4.0,20.0,29.0,7.0,0.0,0.0,0.0,2.0,17.0,2.0,0.0,4.0,20.0,29.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
[col for col in df_all_dataset.columns if df_all_dataset[col].n_unique() == 1]

['CD',
 'NII',
 'NL',
 'NLE',
 'TCD',
 'Documentation Metric Rules']

In [63]:
df_all_dataset = df_all_dataset.select([col for col in df_all_dataset.columns if df_all_dataset[col].std() > 0])

df_all_corr = df_all_dataset.corr()

In [None]:
output_dir = 'outputs'
file_path = os.path.join(output_dir, 'method-all-correlations.csv')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

df_all_corr.write_csv(file_path)
print(f"File saved successfully to {file_path}")

File saved successfully to outputs\method-all-correlations.csv


In [65]:
df_all_corr

CLOC,DLOC,LLOC,LOC,McCC,NOI,NOS,NUMPAR,TCLOC,TLLOC,TLOC,TNOS,WarningInfo,Anti Pattern,Complexity Metric Rules,Coupling Metric Rules,Size Metric Rules
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,0.94638,0.540243,0.76514,0.437091,0.271711,0.461628,0.442901,0.997118,0.525552,0.748858,0.450809,0.568132,0.409211,0.340437,0.142012,0.57108
0.94638,1.0,0.396,0.635322,0.329026,0.214876,0.300318,0.421979,0.942764,0.384052,0.6203,0.29309,0.457269,0.332369,0.244378,0.122331,0.468966
0.540243,0.396,1.0,0.951166,0.745786,0.345355,0.878052,0.560936,0.542029,0.975794,0.936754,0.859155,0.800181,0.64184,0.550363,0.166763,0.782283
0.76514,0.635322,0.951166,1.0,0.733383,0.370975,0.846091,0.576965,0.765387,0.929681,0.984557,0.829397,0.814787,0.633015,0.547857,0.183504,0.799885
0.437091,0.329026,0.745786,0.733383,1.0,0.3163,0.806831,0.429903,0.437221,0.725518,0.720085,0.785654,0.697642,0.591867,0.706676,0.166453,0.61252
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.568132,0.457269,0.800181,0.814787,0.697642,0.295331,0.712746,0.62409,0.574036,0.803426,0.820616,0.719678,1.0,0.80476,0.692919,0.181951,0.971163
0.409211,0.332369,0.64184,0.633015,0.591867,0.223781,0.555011,0.666592,0.412546,0.645049,0.638179,0.563285,0.80476,1.0,0.683213,0.09082,0.751519
0.340437,0.244378,0.550363,0.547857,0.706676,0.238682,0.59146,0.296979,0.345063,0.564557,0.561075,0.606371,0.692919,0.683213,1.0,0.110692,0.517573
0.142012,0.122331,0.166763,0.183504,0.166453,0.378982,0.165766,0.116902,0.141468,0.16052,0.178429,0.159574,0.181951,0.09082,0.110692,1.0,0.143107


In [67]:
correlations_rows = (
    df_all_dataset.corr()
    .with_columns(pl.Series(name="index", values=df_all_dataset.columns))
    .unpivot(index = "index")
    .filter(pl.col('index') != pl.col('variable'))
)
correlations_rows

index,variable,value
str,str,f64
"""DLOC""","""CLOC""",0.94638
"""LLOC""","""CLOC""",0.540243
"""LOC""","""CLOC""",0.76514
"""McCC""","""CLOC""",0.437091
"""NOI""","""CLOC""",0.271711
…,…,…
"""TNOS""","""Size Metric Rules""",0.665495
"""WarningInfo""","""Size Metric Rules""",0.971163
"""Anti Pattern""","""Size Metric Rules""",0.751519
"""Complexity Metric Rules""","""Size Metric Rules""",0.517573


In [68]:
correlations_rows_important = correlations_rows.filter(pl.col('value').abs() > 0.7).sort(by='value', descending=True).gather_every(2)
correlations_rows_important.write_csv('outputs/method-all-important.csv')
correlations_rows_important

index,variable,value
str,str,f64
"""CLOC""","""TCLOC""",0.997118
"""LOC""","""TLOC""",0.984557
"""TLLOC""","""LLOC""",0.975794
"""TNOS""","""NOS""",0.975302
"""Size Metric Rules""","""WarningInfo""",0.971163
…,…,…
"""TLLOC""","""McCC""",0.725518
"""TLOC""","""McCC""",0.720085
"""WarningInfo""","""TNOS""",0.719678
"""WarningInfo""","""NOS""",0.712746


### Meta Analysis

In [69]:
import numpy as np
import polars as pl
import scipy

def r_to_z(r_colname) -> pl.Expr:
    return (0.5 * pl.Expr.log( (1 + pl.col(r_colname)) / (1 - pl.col(r_colname))))


def variance_z(n_colname) -> pl.Expr:
    return 1/(pl.col(n_colname) - 3)

def z_to_r(z):
    return (np.exp(2*z)-1)/(np.exp(2*z)+1)


def std_z(n_colname) -> pl.Expr:
    return pl.Expr.sqrt(variance_z(n_colname))


def making_fixed_effect_df(df):
    df_fixed = df.with_columns(
        effect_size_Y=r_to_z('correlation'),
        variance_within_V=variance_z('n'),
    ).with_columns(
        raw_weight_W=1/pl.col('variance_within_V')
    ).with_columns(
        WY=(pl.col('raw_weight_W') * pl.col('effect_size_Y')),
        WY_2=(pl.col('raw_weight_W') * (pl.col('effect_size_Y'))**2),
        W_2=(pl.col('raw_weight_W')**2),
    )
    return df_fixed

def fixed_effect(df, alpha=0.05):
    """
    Fixed-effect meta-analysis on given correlations polars.DataFrame.
    """
    fixed_effect_df = making_fixed_effect_df(df)
    fixed_effect_z = fixed_effect_df.select(pl.sum('WY')/pl.sum('raw_weight_W')).item()
    fixed_variance_z = fixed_effect_df.select(1/pl.sum('raw_weight_W')).item()
    fixed_std_z = np.sqrt(fixed_variance_z)
    std_interval = scipy.stats.norm.isf(alpha/2)
    fixed_int_z = (fixed_effect_z-std_interval*fixed_std_z, fixed_effect_z+std_interval*fixed_std_z)
    z_value = fixed_effect_z/fixed_std_z
    p_value = 2*scipy.stats.norm.sf(z_value)
    fixed_effect_cor = z_to_r(fixed_effect_z)
    fixed_int_corr = (z_to_r(fixed_int_z[0]), z_to_r(fixed_int_z[1]))
    return (p_value, fixed_effect_cor, fixed_int_corr[0], fixed_int_corr[1], p_value <= alpha)

def making_random_effects_df(df):
    fixed_effect_df = making_fixed_effect_df(df)
    Q = fixed_effect_df.select(pl.sum('WY_2') - pl.sum('WY')**2 / pl.sum('raw_weight_W')).item()
    C = fixed_effect_df.select(pl.sum('raw_weight_W') - pl.sum('W_2')/pl.sum('raw_weight_W')).item()
    T_2 = (Q - (len(fixed_effect_df)-1))/C
    fixed_effect_df = (
        fixed_effect_df.with_columns(V_star=pl.col('variance_within_V') + T_2)
        .with_columns(W_star=1/pl.col('V_star'))
        .with_columns(W_star_Y=pl.col('W_star')*pl.col('effect_size_Y'))
    )
    return fixed_effect_df

def random_effects(df, alpha=0.05):
    random_effects_df = making_random_effects_df(df)
    random_effect_z = random_effects_df.select(pl.sum('W_star_Y')/pl.sum('W_star')).item()
    random_variance_z = random_effects_df.select(1/pl.sum('W_star')).item()
    random_std_z = np.sqrt(random_variance_z)
    std_interval = scipy.stats.norm.isf(alpha/2)
    random_int_z = (random_effect_z-std_interval*random_std_z, random_effect_z+std_interval*random_std_z)
    z_value = random_effect_z/random_std_z
    p_value = 2*scipy.stats.norm.sf(z_value)
    random_effect_cor = z_to_r(random_effect_z)
    random_int_corr = (z_to_r(random_int_z[0]), z_to_r(random_int_z[1]))
    return (p_value, random_effect_cor, random_int_corr[0], random_int_corr[1], p_value <= alpha)

In [70]:
from tqdm import trange

def meta_analysis_correlations(correlations_list, lengths_list, fixed=True):
    columns_names = correlations_list[0].columns
    results = []
    for i in trange(len(columns_names)):
        for j in range(i + 1, len(columns_names)):
            # analyzing correlations between columns i and j
            data = []
            for corr, length in zip(correlations_list, lengths_list):
                if not corr.isna().iloc[i,j] and abs(corr.iloc[i, j]) != 1 and length > 3:
                    data.append((corr.iloc[i, j], length))
            # print(data)
            df_data = pl.DataFrame(data, schema=['correlation', 'n'], orient='row')
            if fixed:
                result = fixed_effect(df_data)
            else:
                result = random_effects(df_data)
            results.append((columns_names[i], columns_names[j], *result))

    return pl.DataFrame(results, schema=['Metric A', 'Metric B', 'p-value', 'effect', 'interval_l', 'interval_r', 'significant'], orient='row')

In [71]:
meta_analysis_fixed_results = meta_analysis_correlations(correlations_df_list, lengths_list, fixed=True)
meta_analysis_fixed_results.write_csv("outputs/method-meta-analysis-fixed.csv")

100%|██████████| 16/16 [00:00<00:00, 27.14it/s]


In [72]:
meta_analysis_fixed_results.filter(
    (pl.col('significant') == 1) & (pl.col('effect').abs() > 0.7)
).sort(pl.col('effect').abs(), descending=True).write_csv('outputs/method-fixed-important.csv')

In [73]:
meta_analysis_random_results = meta_analysis_correlations(correlations_df_list, lengths_list, fixed=False)
meta_analysis_random_results.write_csv("outputs/method-meta-analysis-random.csv")

100%|██████████| 16/16 [00:00<00:00, 22.37it/s]


In [74]:
meta_analysis_random_results.filter(
    (pl.col('significant') == 1) & (pl.col('effect').abs() > 0.7)
).sort(pl.col('effect').abs(), descending=True).write_csv('outputs/method-random-important.csv')

In [84]:
meta_analysis_fixed_results["p-value"].unique()

p-value
f64
0.0


In [75]:
meta_analysis_fixed_results.sort(by='p-value', descending=True).gather_every(100).head(10)

Metric A,Metric B,p-value,effect,interval_l,interval_r,significant
str,str,f64,f64,f64,f64,i64
"""CLOC""","""DLOC""",0.0,0.929271,0.928466,0.930068,1
"""TLLOC""","""TNOS""",0.0,0.902864,0.901773,0.903943,1


In [77]:
i = 2
j = -6
# analyzing correlations between columns i and j
data = []
for corr, length in zip(correlations_df_list, lengths_list):
    if not corr.isna().iloc[i,j] and abs(corr.iloc[i, j]) != 1 and length > 3:
        data.append((corr.iloc[i, j], length))
# print(data)
df_data = pl.DataFrame(data, schema=['correlation', 'n'], orient='row')
print(df_data)

shape: (28, 2)
┌─────────────┬───────┐
│ correlation ┆ n     │
│ ---         ┆ ---   │
│ f64         ┆ i64   │
╞═════════════╪═══════╡
│ 0.897817    ┆ 27    │
│ 0.933046    ┆ 2592  │
│ 0.963229    ┆ 522   │
│ 0.903754    ┆ 2012  │
│ 0.953661    ┆ 1657  │
│ …           ┆ …     │
│ 0.981204    ┆ 1021  │
│ 0.900973    ┆ 4189  │
│ 0.89214     ┆ 25244 │
│ 0.99126     ┆ 1945  │
│ 0.818055    ┆ 3975  │
└─────────────┴───────┘


In [78]:
result = fixed_effect(df_data)
result

(0.0, 0.9089264423058883, 0.9079006546413128, 0.9099413442099106, True)

In [2]:
import pandas as pd

def find_columns_to_drop(file_path, correlation_threshold=0.9):
    df = pd.read_csv(file_path)
    correlations = df[["Metric A", "Metric B", "effect"]]
    high_correlations = correlations[correlations["effect"].abs() >= correlation_threshold]
    columns_to_drop = set()
    processed_columns = set()
    
    for _, row in high_correlations.iterrows():
        col_a, col_b = row["Metric A"], row["Metric B"]
        if col_a not in processed_columns and col_b not in processed_columns:
            columns_to_drop.add(col_b)
            processed_columns.add(col_a)
            processed_columns.add(col_b)
        elif col_a in processed_columns:
            columns_to_drop.add(col_b)
            processed_columns.add(col_b)
        elif col_b in processed_columns:
            columns_to_drop.add(col_a)
            processed_columns.add(col_a)
    
    return list(columns_to_drop)

file_path = "outputs/method-random-important.csv"
columns_to_drop = find_columns_to_drop(file_path, correlation_threshold=0.9)
print("Columns to drop:", columns_to_drop)


Columns to drop: ['LOC', 'TCLOC', 'NOS', 'TLLOC', 'TNOS', 'DLOC', 'TLOC']


In [3]:
empty_cols = ['CD', 'NII', 'NL', 'NLE', 'TCD', 'WarningBlocker', 'WarningCritical', 'WarningMajor', 'WarningMinor', 'Documentation Metric Rules']

final_list_90 = empty_cols.copy()
final_list_80 = empty_cols.copy()
final_list_70 = empty_cols.copy()

final_list_90.extend(find_columns_to_drop(file_path, 0.9))
final_list_80.extend(find_columns_to_drop(file_path, 0.8))
final_list_70.extend(find_columns_to_drop(file_path, 0.7))

In [9]:
final_list_90

['CD',
 'NII',
 'NL',
 'NLE',
 'TCD',
 'Documentation Metric Rules',
 'LOC',
 'TCLOC',
 'NOS',
 'TLLOC',
 'TNOS',
 'DLOC',
 'TLOC']

In [5]:
final_list_80

['CD',
 'NII',
 'NL',
 'NLE',
 'TCD',
 'Documentation Metric Rules',
 'LOC',
 'TCLOC',
 'NOS',
 'McCC',
 'TLLOC',
 'TNOS',
 'DLOC',
 'Size Metric Rules',
 'TLOC']

In [6]:
final_list_70

['CD',
 'NII',
 'NL',
 'NLE',
 'TCD',
 'Documentation Metric Rules',
 'LOC',
 'TCLOC',
 'NOS',
 'Complexity Metric Rules',
 'McCC',
 'TLLOC',
 'Anti Pattern',
 'TNOS',
 'DLOC',
 'Size Metric Rules',
 'TLOC']

In [7]:
print(f"With threshold 0.9 we drop {len(final_list_90)} columns, with threshold 0.8 we drop {len(final_list_80)} columns, with threshold 0.7 we drop {len(final_list_70)} columns.")

With threshold 0.9 we drop 17 columns, with threshold 0.8 we drop 19 columns, with threshold 0.7 we drop 22 columns.


In [10]:
[x for x in final_list_80 if x not in final_list_90]

['McCC', 'Size Metric Rules']

In [11]:
[x for x in final_list_70 if x not in final_list_80]

