In [1]:
# imports

# standard library
import itertools
# to import data of the found clones
import json
import re

# data handling
import networkx as nx
import numpy as np
import pandas as pd
import squarify

# to make visualisations 
import ipywidgets as widgets
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [2]:
# Load the two clone reports (JSON) that every visualisation below reads.
# NOTE(review): both paths are absolute and machine-specific; they must be
# adjusted before this notebook can run on another machine.
with open("C:/SE_master/rascall_series2_working/smallsq_clone_report_type1.json") as json_file:
    # Parsed report for the "smallsql" choice in the widgets below.
    smallsql_type1 = json.load(json_file)

with open("C:/SE_master/rascall_series2_working/hsqldb_clone_report_type1.json") as json_file:
    # Parsed report for the "hsqldb" choice in the widgets below.
    hsql_type1 = json.load(json_file)

In [3]:
def extract_exact_filename(entry):
    """Return the bare file name (e.g. ``Foo.java``) contained in ``entry``.

    The source path is taken as the shortest leading text of ``entry`` that
    ends in ``.java`` and is directly followed by ``|``; the file name is
    whatever follows the last ``/`` of that path.

    Raises:
        AttributeError: if ``entry`` contains no ``...java|`` segment.
    """
    java_location = re.search(r'(.+?\.java)\|', entry)
    source_path = java_location.group(1)
    # rpartition keeps the whole string as the tail when there is no "/".
    _, _, base_name = source_path.rpartition("/")
    return base_name

## Heatmap

In [4]:
def create_heatmap(choice) -> None:
    """Plot a file-by-file heatmap of shared clone classes for one project.

    Cell (a, b) of the heatmap counts the clone classes that contain at least
    one fragment from file ``a`` and at least one from file ``b``. Files are
    identified by their base name only (via ``extract_exact_filename``), so
    same-named files in different directories are treated as one file.

    Args:
        choice: Project to visualise, either "smallsql" or "hsqldb".

    Returns:
        None. The figure is shown with ``fig.show()``. A message is printed
        instead when the choice or report is unusable, or when no two files
        share a clone class.
    """
    if choice == "smallsql":
        data = smallsql_type1
    elif choice == "hsqldb":
        data = hsql_type1
    else:
        # Without this branch an unknown choice left `data` unbound and the
        # membership test below raised UnboundLocalError.
        print("Wrong input data")
        return

    if "example_clone_classes" not in data:
        print("Wrong input data")
        return

    # One (file_a, file_b) tuple per clone class shared by two distinct files.
    pairs = []
    for clones in data["example_clone_classes"].values():
        unique_files_in_class = sorted({extract_exact_filename(e) for e in clones})
        pairs.extend(itertools.combinations(unique_files_in_class, 2))

    if not pairs:
        # An empty matrix has no maximum, so there is nothing to plot.
        print("No clone classes are shared between different files")
        return

    files = sorted({f for p in pairs for f in p})

    # Create empty matrix / dataframe
    df = pd.DataFrame(0, index=files, columns=files)

    # Fill the matrix symmetrically with the number of shared clone classes
    for a, b in pairs:
        df.loc[a, b] += 1
        df.loc[b, a] += 1

    # Plot
    fig = px.imshow(df, width=1000, height=800, color_continuous_scale="Reds",
                    labels=dict(color="Shared Clone Classes"), zmin=0, zmax=df.values.max())

    fig.update_xaxes(tickangle=45)

    fig.update_layout(title=f"Clone Heatmap for {choice.upper()} (Shared Clone Classes)")
    fig.update_layout(xaxis_title="Files (x-axis)", yaxis_title="Files (y axis)")
    fig.show()

widgets.interact(create_heatmap, choice=["smallsql", "hsqldb"])

interactive(children=(Dropdown(description='choice', options=('smallsql', 'hsqldb'), value='smallsql'), Output…

<function __main__.create_heatmap(choice) -> None>

## Dot plot

In [5]:
def extract_last_line(data, filename) -> int:
    """Look up the last line number recorded for ``filename``.

    Scans ``data['last_line_per_file']`` for keys that end with
    ``filename + '|'``. The match must start at a path boundary (start of the
    key, ``/``, ``\\`` or ``|``) so that ``Foo.java`` does not pick up the
    entry of ``BarFoo.java``.

    Args:
        data: Clone report containing a ``'last_line_per_file'`` mapping.
        filename: Bare file name, e.g. ``"Foo.java"``.

    Returns:
        The value stored for the matching key (the last matching key wins if
        several match), or 0 when no key matches.
    """
    suffix = filename + '|'
    last_line = 0
    for key, value in data['last_line_per_file'].items():
        if not key.endswith(suffix):
            continue
        # Character right before the file name; empty when the key is just the suffix.
        preceding = key[:len(key) - len(suffix)][-1:]
        if preceding in ('', '/', '\\', '|'):
            last_line = value

    return last_line

# NOTE(review): module-level default; make_dot_plots binds its own local `data`.
data = smallsql_type1
def make_dot_plots(choice):
    """Build an interactive clone dot plot for one project.

    Parses every fragment of every clone class in the chosen report, then
    shows two dropdowns (File X / File Y). For the selected pair, each pair of
    fragments belonging to the same clone class is drawn as a short run of
    dots from the fragments' start lines to their end lines.

    Args:
        choice: Project to visualise, either "smallsql" or "hsqldb".

    Returns:
        None. Prints "Wrong choice!" and returns for any other choice.
    """
    if choice == "smallsql":
        data = smallsql_type1
    elif choice == "hsqldb":
        data = hsql_type1
    else:
        print("Wrong choice!")
        return

    # Same pattern for both projects; only the project name inside the path
    # differs. Group 2 and 3 are used as the fragment's start and end line.
    FRAGMENT_REGEX = re.compile(
        r'\|.*?(' + re.escape(choice) + r'.*?\.java)\|\(.*?<(\d+),\d+>,<(\d+),\d+>\)'
    )
    # Number of dots drawn per matching fragment pair.
    points_per_fragment = 20

    parsed_clones = []
    all_files = set()

    for clone_class_id, fragments in data["example_clone_classes"].items():
        for fragment in fragments:
            match = FRAGMENT_REGEX.search(fragment)
            if not match:
                print(f"Warning: regex at {fragment} goes wrong.")
                # Skip the fragment; reading groups from a failed match would crash.
                continue

            file_name = extract_exact_filename(fragment)
            parsed_clones.append({
                'clone_class': clone_class_id,
                'file_name': file_name,
                'start_line': int(match.group(2)),
                'end_line': int(match.group(3))
            })
            all_files.add(file_name)

    file_list = sorted(all_files)

    def create_dot_plot(file_x_name, file_y_name):
        """Render the dot plot for the selected pair of files."""
        file_x_fragments = [c for c in parsed_clones if c['file_name'] == file_x_name]
        file_y_fragments = [c for c in parsed_clones if c['file_name'] == file_y_name]

        rows = []

        # parsed_clones is ordered by clone class, so the rows come out grouped
        # per clone class.
        for fx in file_x_fragments:
            for fy in file_y_fragments:
                if fx['clone_class'] != fy['clone_class']:
                    continue

                x_vals = np.linspace(fx['start_line'], fx['end_line'], points_per_fragment)
                y_vals = np.linspace(fy['start_line'], fy['end_line'], points_per_fragment)

                rows.extend(
                    {"x": x, "y": y, "clone_class": fx['clone_class']}
                    for x, y in zip(x_vals, y_vals)
                )

        if len(rows) == 0:
            print("No clones")
            return
        df = pd.DataFrame(rows)

        fig = px.scatter(
            df,
            x="x",
            y="y",
            color="clone_class",
            title=f"Clone Dot Plot: {file_x_name} vs {file_y_name}",
            labels={
                "x": f"Lines in {file_x_name}",
                "y": f"Lines in {file_y_name}"
            }
        )

        # Important zoom-related settings
        fig.update_layout(
            dragmode="zoom",
            hovermode="closest"
        )

        # Axis spans the whole file. extract_last_line returns 0 when the file
        # is not listed, so never go below the last plotted line; otherwise the
        # range would collapse to [0, 0] and hide every dot.
        max_x = max(extract_last_line(data, file_x_name),
                    max(f['end_line'] for f in file_x_fragments))
        max_y = max(extract_last_line(data, file_y_name),
                    max(f['end_line'] for f in file_y_fragments))

        fig.update_xaxes(range=[0, max_x], autorange=False)
        fig.update_yaxes(range=[0, max_y], autorange=False)

        fig.show()

    # Set up interactive plot
    file_x_widget = widgets.Dropdown(options=file_list, description='File X')
    file_y_widget = widgets.Dropdown(options=file_list, description='File Y')

    print("Select two filenames below to generate the Dot Plot showing shared code fragments.")

    # Display the interactive control
    widgets.interact(
        create_dot_plot,
        file_x_name=file_x_widget,
        file_y_name=file_y_widget
    )

widgets.interact(make_dot_plots, choice=["smallsql", "hsqldb"])

interactive(children=(Dropdown(description='choice', options=('smallsql', 'hsqldb'), value='smallsql'), Output…

<function __main__.make_dot_plots(choice)>

## Tree map

In [6]:
# Helper functions, some using regex

def extract_filename(entry, choice):
    """Split a source-location entry into file name and project-relative directory.

    The path is the text between ``///`` and the first ``.java|`` in ``entry``.
    The returned directory starts at the last path component equal to
    ``choice`` (the project root name), which gives the Treemap a consistent
    hierarchy.

    Args:
        entry: Location string containing ``///<path>.java|``.
        choice: Project root directory name, e.g. "smallsql".

    Returns:
        ``(filename, directory)``. ``(None, None)`` is returned, after printing
        a message, when ``entry`` contains no such path. When ``choice`` does
        not occur in the path, the full directory of the file is returned
        instead of raising ``ValueError``.
    """
    match = re.search(r'///(.+?\.java)\|', entry)

    if not match:
        print(f"Extraction failed for entry: {entry}")
        return None, None

    full_path = match.group(1)
    parts = full_path.split("/")
    filename = parts[-1]

    # Last occurrence of the project root; fall back to the start of the path
    # when the root name is not part of it.
    root_positions = [i for i, part in enumerate(parts) if part == choice]
    start_index = root_positions[-1] if root_positions else 0

    directory_with_root = "/".join(parts[start_index:-1])

    return filename, directory_with_root

def extract_total_lines(data, filename, choice) -> int:
    """Return the line count recorded for ``filename`` in ``data["lines_per_file"]``.

    Every key of the mapping is resolved to a file name with
    ``extract_filename``; the values of all keys resolving to ``filename`` are
    collected. A single hit is returned as is, otherwise the hits are summed
    (which yields 0 when nothing matches).
    """
    matching_counts = []
    for path_key, line_count in data["lines_per_file"].items():
        candidate_name = extract_filename(path_key, choice)[0]
        if candidate_name == filename:
            matching_counts.append(line_count)

    if len(matching_counts) == 1:
        return matching_counts[0]
    return sum(matching_counts)

def extract_clone_lines(data, filename, choice) -> int:
    """Look up the number of cloned lines recorded for ``filename``.

    Scans ``data['lines_per_clone']`` for keys that end with ``filename``. The
    match must start at a path boundary (start of the key, ``/``, ``\\`` or
    ``|``) so that ``Foo.java`` does not pick up the entry of ``BarFoo.java``.

    Args:
        data: Clone report containing a ``'lines_per_clone'`` mapping.
        filename: Bare file name, e.g. ``"Foo.java"``.
        choice: Unused; kept so existing callers keep working.

    Returns:
        The value stored for the matching key (the last matching key wins if
        several match). When no key matches, a message is printed and 0 is
        returned.
    """
    lines_per_clone = None
    for key, value in data['lines_per_clone'].items():
        if not key.endswith(filename):
            continue
        # Character right before the file name; empty when the key is the file name itself.
        preceding = key[:len(key) - len(filename)][-1:]
        if preceding in ('', '/', '\\', '|'):
            lines_per_clone = value

    if lines_per_clone is None:
        print("extract_clone_lines goes wrong")
        return 0
    return lines_per_clone

# Create Treemap

def create_tree_maps(choice) -> None:
    """Plot a treemap of cloned code per file for one project.

    Every file that appears in a clone class becomes one tile, nested under
    "All" > project > module. Tile area is the file's number of cloned lines;
    tile colour is the percentage of the file's lines that are cloned.

    Args:
        choice: Project to visualise, either "smallsql" or "hsqldb".

    Returns:
        None. The figure is shown with ``fig.show()``. A message is printed
        instead for an unknown choice or when no file could be collected.
    """
    if choice == "smallsql":
        data = smallsql_type1
    elif choice == "hsqldb":
        data = hsql_type1
    else:
        # Without this branch an unknown choice left `data` unbound.
        print("Wrong choice!")
        return

    file_metrics = dict()

    for fragments in data["example_clone_classes"].values():
        for fragment in fragments:
            filename, directory = extract_filename(fragment, choice)
            if filename is None:
                # extract_filename already reported the unparseable entry.
                continue

            key = f"{directory}/{filename}"
            if key in file_metrics:
                # Metrics are per file, so look them up only once per file.
                continue

            file_metrics[key] = {
                'directory': directory,
                'total_lines': extract_total_lines(data, filename, choice),
                'cloned_lines': extract_clone_lines(data, filename, choice),
                'filename': filename
            }

    if not file_metrics:
        print("No clones")
        return

    # Convert metrics to a pd.DataFrame
    df_list = []
    for metrics in file_metrics.values():
        dir_parts = metrics['directory'].split('/')
        project_name = dir_parts[0]
        if len(dir_parts) > 1:
            module_name = dir_parts[1]
        else:
            # If the file is in the root, label the module as 'Project Root'
            module_name = "Project Root"
        subpath = "/".join(dir_parts[2:]) if len(dir_parts) > 2 else ""

        # A file whose total line count could not be determined (0) would
        # otherwise cause a division by zero.
        if metrics['total_lines']:
            cloned_percentage = (metrics['cloned_lines'] / metrics['total_lines']) * 100
        else:
            cloned_percentage = 0.0

        # Store data for Treemap in the DataFrame
        df_list.append({
            'Project': project_name,
            'Module': module_name,
            'Subpath': subpath if subpath else 'No subpath',  # placeholder label when there is no subpath
            'File': metrics['filename'],
            'Total Lines': metrics['total_lines'],
            'Cloned Lines': metrics['cloned_lines'],
            'Cloned % (Color)': cloned_percentage
        })

    df = pd.DataFrame(df_list)

    path_levels = [px.Constant("All"), 'Project', 'Module', 'File']

    fig = px.treemap(
        df,
        path=path_levels,
        values='Cloned Lines',
        color='Cloned % (Color)',
        color_continuous_scale='Reds',
        range_color=[0, df['Cloned % (Color)'].max()],
        title='Clone Detection Treemap: Area = N Cloned Lines, Color = Cloned % ',
        hover_data={'Total Lines': True, 'Cloned Lines': True, 'Cloned % (Color)': ':.2f'},
        height=600,
        width=1400
    )

    fig.update_layout(
        margin=dict(t=50, l=10, r=10, b=10)
    )

    fig.show()

widgets.interact(create_tree_maps, choice=["smallsql", "hsqldb"])

interactive(children=(Dropdown(description='choice', options=('smallsql', 'hsqldb'), value='smallsql'), Output…

<function __main__.create_tree_maps(choice) -> None>