## Filtering and Saving Results

In [1]:
# Import required modules for file processing and comparison
import os
from scsc import simple_process_files
from scsc import full_comparison_output

# Absolute, normalised form of the directory the kernel is running in
files_path = os.path.abspath(os.getcwd())
# One level up: every other path in this notebook is built relative to it
parent_path = os.path.dirname(files_path)
# <parent_path>/datasets/medium is the directory handed to the loader below
target_path = os.path.join(
    parent_path,
    'datasets',
    'medium',
)
# simple_process_files returns two values, kept here as names and contents
# (presumably parallel sequences, one entry per file -- confirm against scsc)
file_names, file_contents = simple_process_files(target_path)

In [2]:
# Destination CSV for the 'ted' comparison report
res_ted = os.path.join(
    parent_path,
    'generate_balanced_dataset',
    'res_md_dataset_ted.csv',
)
# Run the comparison with method='ted'; the call is left as the cell's last
# expression so that its return value is shown as the cell output
full_comparison_output(
    file_names,
    file_contents,
    method='ted',
    csv_file=res_ted,
)

'Full comparison report generated: /Users/edsoneddy/Projects/Personal/scsc/notebooks/generate_balanced_dataset/res_md_dataset_ted.csv'

In [3]:
# Destination CSV for the 'lf' comparison report
res_lf = os.path.join(
    parent_path,
    'generate_balanced_dataset',
    'res_md_dataset_lf.csv',
)
# Run the comparison with method='lf'; the call is left as the cell's last
# expression so that its return value is shown as the cell output
full_comparison_output(
    file_names,
    file_contents,
    method='lf',
    csv_file=res_lf,
)

'Full comparison report generated: /Users/edsoneddy/Projects/Personal/scsc/notebooks/generate_balanced_dataset/res_md_dataset_lf.csv'

In [4]:
# Destination CSV for the 'mdiff' comparison report
res_mdiff = os.path.join(
    parent_path,
    'generate_balanced_dataset',
    'res_md_dataset_mdiff.csv',
)
# Run the comparison with method='mdiff'; the call is left as the cell's last
# expression so that its return value is shown as the cell output
full_comparison_output(
    file_names,
    file_contents,
    method='mdiff',
    csv_file=res_mdiff,
)

'Full comparison report generated: /Users/edsoneddy/Projects/Personal/scsc/notebooks/generate_balanced_dataset/res_md_dataset_mdiff.csv'

In [5]:
import pandas as pd
from pathlib import Path

def fixed_path(path):
    """Return the path of ``<path>.py`` inside ``<parent_path>/datasets/medium``.

    ``path`` is a name without extension (here: a row/column label of the
    result matrices); ``parent_path`` is read from the notebook's globals.
    """
    script_name = path + '.py'
    return os.path.join(parent_path, 'datasets', 'medium', script_name)

# Read the three result CSVs (order: lf, mdiff, ted). The first CSV column is
# used as the row labels, and all labels are forced to strings so they can be
# turned into file paths by fixed_path().
paths = [Path(res_lf), Path(res_mdiff), Path(res_ted)]
dfs = []
for p in paths:
    df = pd.read_csv(p, index_col=0)
    df.index = df.index.astype(str)
    df.columns = df.columns.astype(str)
    dfs.append(df)

# The first matrix defines the shape and the labels of the combined result
rows, cols = dfs[0].shape
df_index = dfs[0].index
df_columns = dfs[0].columns

# Cells are combined by position below, so every matrix must carry exactly the
# same labels in the same order. Fail loudly instead of silently comparing
# scores that belong to different file pairs.
for p, df in zip(paths, dfs):
    if not (df.index.equals(df_index) and df.columns.equals(df_columns)):
        raise ValueError(
            f"Row/column labels of {p} do not match those of {paths[0]}; "
            "the result matrices cannot be combined cell by cell."
        )

# A method "votes" for a cell when its score reaches THRESHOLD; a cell is
# marked 1 when at least MIN_METHODS methods vote for it (3 of 3 here, i.e.
# all methods must agree).
THRESHOLD = 0.75
MIN_METHODS = 3

# Per-cell vote count, computed on the whole matrices at once.
# NaN scores compare as False and therefore never vote.
votes = sum((df.to_numpy() >= THRESHOLD).astype(int) for df in dfs)
consensus = votes >= MIN_METHODS

# 0/1 consensus matrix with the labels of the first input
df_result = pd.DataFrame(consensus.astype("int64"), index=df_index, columns=df_columns)

# File path for every row / column label, computed once up front
row_paths = [fixed_path(label) for label in df_index]
col_paths = [fixed_path(label) for label in df_columns]

# Unique files that take part in at least one consensus pair
res_set = set()

# File pairs above the diagonal, split by consensus outcome
res_pairs_true = set()
res_pairs_false = set()

# Only cells strictly above the diagonal (j > i) are collected, so each
# unordered pair is recorded once and self-comparisons are skipped.
for i in range(rows):
    for j in range(i + 1, cols):
        pair = (row_paths[i], col_paths[j])
        if consensus[i, j]:
            res_set.update(pair)
            res_pairs_true.add(pair)
        else:
            res_pairs_false.add(pair)

# Save the combined 0/1 matrix to a CSV file
combined_res_path = os.path.join(parent_path, "generate_balanced_dataset", "res_all_methods.csv")
df_result.to_csv(combined_res_path)

In [8]:
import shutil

# Turn the set of unique files into a list; sorted so that the copy order
# (and any later use of results_array) is the same on every run
results_array = sorted(res_set)

# The selected files are copied into a separate directory
output_dir = os.path.join(parent_path, "datasets", "balanced")
# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Copy each unique file to the output directory under its original file name.
# shutil.copyfile copies the bytes as-is, so the result does not depend on the
# platform's default text encoding or newline translation.
for file_path in results_array:
    file_name = os.path.basename(file_path)
    output_path = os.path.join(output_dir, file_name)
    shutil.copyfile(file_path, output_path)


In [9]:
import random

# At most this many pairs are kept per class
MAX_PAIRS = 100
# Fixed seed: the same pairs are selected on every run
PAIR_SAMPLE_SEED = 42

# Slicing list(set) would pick an arbitrary subset that changes between kernel
# restarts (set iteration order depends on per-process string hashing).
# Instead: sort the pairs to get a stable order, draw a seeded random sample,
# and sort the sample so the written files are stable as well.
pair_rng = random.Random(PAIR_SAMPLE_SEED)

true_candidates = sorted(res_pairs_true)
res_pairs_true = sorted(
    pair_rng.sample(true_candidates, min(MAX_PAIRS, len(true_candidates)))
)

false_candidates = sorted(res_pairs_false)
res_pairs_false = sorted(
    pair_rng.sample(false_candidates, min(MAX_PAIRS, len(false_candidates)))
)

# Save the true and false pairs to text files, one pair per line with the two
# paths separated by a single space.
# NOTE(review): a path that itself contains a space makes a line ambiguous for
# whatever reads these files -- confirm that the consumer can cope.
true_positive_pairs_path = os.path.join(parent_path, "generate_balanced_dataset", "true_positive_pairs.txt")
true_negative_pairs_path = os.path.join(parent_path, "generate_balanced_dataset", "true_negative_pairs.txt")
with open(true_positive_pairs_path, 'w') as true_file:
    true_file.writelines(f"{first} {second}\n" for first, second in res_pairs_true)
with open(true_negative_pairs_path, 'w') as false_file:
    false_file.writelines(f"{first} {second}\n" for first, second in res_pairs_false)