### Generation of CSV files for datasets
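This notebook generates the CSV files for the datasets used in the project. It compares every pair of files in the dataset directory, labels each pair as similar or dissimilar, and caps the number of pairs recorded for each class to keep the dataset balanced. The result is saved in a structured format — one CSV with the labelled pairs and one that also includes the similarity scores — for further analysis and model training.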

In [3]:
# Import required modules for file processing and comparison
import os
from scsc import Compare, simple_process_files
import pandas as pd
import random

# Resolve <parent of the current working directory>/datasets/small.
# The notebook is therefore expected to be run from a directory that sits
# next to "datasets".
files_path = os.path.abspath(os.getcwd())
parent_path = os.path.dirname(files_path)
target_path = os.path.join(parent_path, "datasets", "small")

# Load file names and contents from the target directory.
file_names, file_contents = simple_process_files(target_path)

# Keep each name attached to its content while reordering.
arr_shuffled = list(zip(file_names, file_contents))
if not arr_shuffled:
    # zip(*[]) below would otherwise fail with an opaque
    # "not enough values to unpack" error.
    raise ValueError(f"No files were loaded from {target_path!r}")

# Fixed seed so that a fresh "Restart & Run All" regenerates the same pairs.
# A dedicated Random instance is used so the global `random` state that other
# cells may rely on is left untouched.
SHUFFLE_SEED = 42
# Sort by name first: the order returned by simple_process_files is not
# visible here and may depend on directory listing order, which would make
# the seeded shuffle start from a different arrangement on each machine.
arr_shuffled.sort(key=lambda pair: pair[0])
random.Random(SHUFFLE_SEED).shuffle(arr_shuffled)

# Unzip back into two parallel tuples, still aligned index-for-index.
file_names, file_contents = zip(*arr_shuffled)

# Maximum limits and threshold for pair classification
# ----------------------------------------------------
# To keep the dataset of similar and dissimilar pairs balanced, the number of
# pairs recorded for each class is capped.
MAX_POSITIVE_PAIRS = 500  # cap on pairs recorded as positive (similar)
MAX_NEGATIVE_PAIRS = 500  # cap on pairs recorded as negative (dissimilar)
THRESHOLD = 0.85  # similarity score at or above which a pair counts as similar

# Parallel accumulators: index k of every list describes the k-th recorded pair.
labels, files = [], []
scores_ted, scores_mdiff, scores_lf = [], [], []

# Running counts of recorded pairs per class, checked against the caps above.
positive_cases = negative_cases = 0

def fixed_file_name(file_name):
    """
    Normalize a file path to a bare ``<stem>.py`` name.

    The directory part and the last extension (if any) are stripped, then
    ".py" is appended, so the same file referenced through different paths
    or extensions maps to one name.

    Args:
        file_name: Path or name of a file.

    Returns:
        The base name with its last extension replaced by ".py"
        (e.g. "dir/foo.txt" -> "foo.py").
    """
    base_name = os.path.basename(file_name)
    stem, _extension = os.path.splitext(base_name)
    return stem + ".py"

# Visit every unordered pair (i < j) once, score it with the three similarity
# methods, and record it under the matching label until that label's quota
# is full.
for i in range(len(file_names)):
    # Once both quotas are full no further pair can be recorded, so stop
    # instead of paying for comparisons whose results would be discarded.
    if positive_cases >= MAX_POSITIVE_PAIRS and negative_cases >= MAX_NEGATIVE_PAIRS:
        break

    for j in range(i + 1, len(file_names)):
        if positive_cases >= MAX_POSITIVE_PAIRS and negative_cases >= MAX_NEGATIVE_PAIRS:
            break

        score_ted = Compare(file_contents[i], file_contents[j], method="ted")
        score_mdiff = Compare(file_contents[i], file_contents[j], method="mdiff")
        score_lf = Compare(file_contents[i], file_contents[j], method="lf")

        # A pair is similar only when all three scores reach the threshold.
        is_similar = (
            score_ted >= THRESHOLD
            and score_mdiff >= THRESHOLD
            and score_lf >= THRESHOLD
        )

        # The quota check is kept separate from the similarity check: a
        # similar pair that arrives after the positive quota is full must be
        # skipped, not recorded as a negative, otherwise pairs that meet the
        # threshold end up in the dataset with label 0.
        if is_similar:
            if positive_cases >= MAX_POSITIVE_PAIRS:
                continue
            positive_cases += 1
        else:
            if negative_cases >= MAX_NEGATIVE_PAIRS:
                continue
            negative_cases += 1

        # Append to every parallel list together so they stay index-aligned.
        labels.append(1 if is_similar else 0)
        files.append((fixed_file_name(file_names[i]), fixed_file_name(file_names[j])))
        scores_ted.append(score_ted)
        scores_mdiff.append(score_mdiff)
        scores_lf.append(score_lf)


# Split the recorded (first, second) name tuples into two parallel columns.
# Comprehensions are used instead of zip(*files) so that an empty `files`
# list still yields two empty columns.
first_files = [first for first, _ in files]
second_files = [second for _, second in files]

# Labels-only table: one row per recorded pair, written next to the dataset files.
output_file = os.path.join(target_path, 'small_dataset.csv')
data = {'File_1': first_files, 'File_2': second_files, 'Label': labels}
df = pd.DataFrame(data)
df.to_csv(output_file, index=False)

# Feature table: the same pairs plus the three similarity scores.
# NOTE(review): the label column is 'Labels' here but 'Label' in the table
# above -- confirm which name downstream consumers expect before unifying.
features_output_file = os.path.join(target_path, 'small_features_dataset.csv')
features_data = {
    'File_1': first_files,
    'File_2': second_files,
    'TED': scores_ted,
    'Myers Diff': scores_mdiff,
    'Local Fingerprint': scores_lf,
    'Labels': labels,
}
features_df = pd.DataFrame(features_data)
features_df.to_csv(features_output_file, index=False)