In [1]:
# imports

import numpy as np
import re
import pandas as pd
import networkx as nx
import squarify

# to import data of the found clones
import json

# to make visualisations 
import matplotlib.pyplot as plt
import plotly.express as px
import ipywidgets as widgets
import seaborn as sns

In [2]:
# Directory that holds the clone reports; change this one value when the
# data lives somewhere else.
REPORT_DIR = "C:/SE_master/rascall_series2_working"

# The files are decoded explicitly as UTF-8 so that parsing does not depend on
# the platform's locale encoding (which open() falls back to by default).
with open(f"{REPORT_DIR}/smallsq_clone_report_type1.json", encoding="utf-8") as json_file:
    smallsql_type1 = json.load(json_file)

with open(f"{REPORT_DIR}/hsqldb_clone_report_type1.json", encoding="utf-8") as json_file:
    hsql_type1 = json.load(json_file)

In [3]:
def extract_filename(entry):
    """Return only the file name from a source-location entry.

    The entry is searched for the shortest leading text that ends in
    ``.java`` and is immediately followed by ``|``; the file name is the last
    ``/``-separated component of that text.

    Args:
        entry: Source-location string containing a ``...java|`` path.

    Returns:
        The file name as a string, e.g. ``"Foo.java"``.

    Raises:
        ValueError: If ``entry`` contains no ``.java|`` path.
    """
    match = re.search(r'(.+?\.java)\|', entry)
    if match is None:
        # Fail with the offending entry instead of an opaque AttributeError
        # on None.group().
        raise ValueError(f"No '.java|' path found in entry: {entry!r}")
    return match.group(1).split("/")[-1]

## Heatmap

In [4]:
def create_heatmap(choice) -> None:
    """Show a file-by-file heatmap of shared clone classes for one report.

    For every clone class in the selected report, each unordered pair of
    distinct file names occurring in that class counts as one shared clone
    class. The counts are stored in a symmetric file x file matrix and
    rendered with plotly.

    Args:
        choice: ``"smallsql"`` or ``"hsql"``, selecting which loaded report
            is plotted.

    Returns:
        None. The figure is shown as a side effect. A message is printed
        instead when the choice is unknown, when the report has no
        ``"example_clone_classes"`` key, or when no clone class spans more
        than one file.
    """
    reports = {"smallsql": smallsql_type1, "hsql": hsql_type1}
    if choice not in reports:
        # Without this guard an unexpected choice left `data` unbound and the
        # function crashed on its first use.
        print(f"Unknown dataset: {choice!r}")
        return
    data = reports[choice]

    if "example_clone_classes" not in data:
        print("Wrong input data")
        return

    pairs = []
    for clones in data["example_clone_classes"].values():
        # De-duplicate per class so a file with several fragments of the same
        # class is paired only once; sorting fixes the order inside each pair.
        unique_files_in_class = sorted({extract_filename(e) for e in clones})
        pairs.extend(
            (first, second)
            for position, first in enumerate(unique_files_in_class)
            for second in unique_files_in_class[position + 1:]
        )

    if not pairs:
        # An empty matrix has no maximum, so the colour range below would fail.
        print("No clone class is shared between two different files")
        return

    files = sorted({f for p in pairs for f in p})

    # Square matrix of zeros, indexed by file name on both axes
    df = pd.DataFrame(0, index=files, columns=files)

    # Each pair contributes one shared clone class, mirrored across the diagonal
    for a, b in pairs:
        df.loc[a, b] += 1
        df.loc[b, a] += 1

    # Plot
    fig = px.imshow(df, width=1000, height=800, color_continuous_scale="Reds",
                    labels=dict(color="Shared Clone Classes"), zmin=0, zmax=df.values.max())

    fig.update_xaxes(tickangle=45)
    fig.update_layout(
        title=f"Clone Heatmap for {choice.upper()} (Shared Clone Classes)",
        xaxis_title="Files (x-axis)",
        yaxis_title="Files (y axis)",
    )
    fig.show()

# Dropdown-driven heatmap; the trailing semicolon keeps the notebook from
# echoing the function object that interact() returns.
widgets.interact(create_heatmap, choice=["smallsql", "hsql"]);

interactive(children=(Dropdown(description='choice', options=('smallsql', 'hsql'), value='smallsql'), Output()…

<function __main__.create_heatmap(choice) -> None>

## Dot plot

In [5]:
import pandas as pd
import numpy as np
import plotly.express as px

data = smallsql_type1

# Captures, from one fragment entry: (1) the path starting at "smallsql" and
# ending in ".java", (2) the start line and (3) the end line.
FRAGMENT_REGEX = re.compile(r'\|.*?(smallsql.*?\.java)\|\(.*?<(\d+),\d+>,<(\d+),\d+>\)')
parsed_clones = []
all_files = set()

for clone_class_id, fragments in data["example_clone_classes"].items():
    for fragment in fragments:
        match = FRAGMENT_REGEX.search(fragment)
        if match is None:
            print(f"Warning: regex at {fragment} goes wrong.")
            # Skip the malformed entry; reading groups from a failed match
            # would raise.
            continue

        full_path = match.group(1)
        # The file name is taken from the captured path rather than from
        # extract_filename(), because a later cell in this notebook redefines
        # that helper to return a (filename, directory) tuple, which would
        # corrupt the names whenever this cell is re-run afterwards.
        file_name = full_path.split("/")[-1]
        start_line = int(match.group(2))
        end_line = int(match.group(3))

        parsed_clones.append({
            'clone_class': clone_class_id,
            'file_name': file_name,
            'start_line': start_line,
            'end_line': end_line
        })
        all_files.add(file_name)

# Alphabetical list of every file that owns at least one parsed fragment
file_list = sorted(all_files)

def create_dot_plot(file_x_name, file_y_name):
    """Draw a dot plot of the clone fragments shared by two files.

    For each clone class, every fragment located in ``file_x_name`` is paired
    with every fragment of the same class located in ``file_y_name``. Each
    pairing is drawn as 20 evenly spaced points running from the start lines
    to the end lines of both fragments, coloured by clone class.

    Args:
        file_x_name: File whose line numbers go on the x axis.
        file_y_name: File whose line numbers go on the y axis.

    Returns:
        None. The figure is shown as a side effect; "No clones" is printed
        when the two files share no clone class.
    """
    x_fragments = [clone for clone in parsed_clones if clone['file_name'] == file_x_name]
    y_fragments = [clone for clone in parsed_clones if clone['file_name'] == file_y_name]

    points = []
    for class_id in data["example_clone_classes"]:
        x_in_class = [frag for frag in x_fragments if frag['clone_class'] == class_id]
        y_in_class = [frag for frag in y_fragments if frag['clone_class'] == class_id]

        for x_frag in x_in_class:
            # The x positions depend only on the x fragment, so compute them once
            x_positions = np.linspace(x_frag['start_line'], x_frag['end_line'], 20)
            for y_frag in y_in_class:
                y_positions = np.linspace(y_frag['start_line'], y_frag['end_line'], 20)
                points.extend(
                    {"x": x_pos, "y": y_pos, "clone_class": class_id}
                    for x_pos, y_pos in zip(x_positions, y_positions)
                )

    if not points:
        print("No clones")
        return

    axis_labels = {
        "x": f"Lines in {file_x_name}",
        "y": f"Lines in {file_y_name}",
    }
    fig = px.scatter(
        pd.DataFrame(points),
        x="x",
        y="y",
        color="clone_class",
        title=f"Clone Dot Plot: {file_x_name} vs {file_y_name}",
        labels=axis_labels,
    )

    # Box zoom on drag, and hover on the single nearest point
    fig.update_layout(dragmode="zoom", hovermode="closest")

    # Fix both axes to run from 0 to the last cloned line of each file
    x_limit = max([0] + [frag['end_line'] for frag in x_fragments])
    y_limit = max([0] + [frag['end_line'] for frag in y_fragments])
    fig.update_xaxes(range=[0, x_limit], autorange=False)
    fig.update_yaxes(range=[0, y_limit], autorange=False)

    fig.show()

# Set up interactive plot
# One dropdown per axis, both offering every file that has parsed fragments
file_x_widget = widgets.Dropdown(options=file_list, description='File X')
file_y_widget = widgets.Dropdown(options=file_list, description='File Y')

print("Select two filenames below to generate the Dot Plot showing shared code fragments.")

# Display the interactive control; the trailing semicolon keeps the notebook
# from echoing the function object that interact() returns.
widgets.interact(
    create_dot_plot,
    file_x_name=file_x_widget,
    file_y_name=file_y_widget
);

Select two filenames below to generate the Dot Plot showing shared code fragments.


interactive(children=(Dropdown(description='File X', options=('BasicTestCase.java', 'BenchTest.java', 'Express…

<function __main__.create_dot_plot(file_x_name, file_y_name)>

## Tree map

In [6]:
import pandas as pd
import re
import plotly.express as px

# Regex functions
def extract_filename(entry, root="smallsql"):
    """Extract the file name and its directory path from a location entry.

    NOTE: this definition replaces the earlier ``extract_filename`` in this
    notebook, which returns only the file name as a string. Code that runs
    after this cell gets the tuple form described here.

    The entry is searched for the shortest leading text that ends in
    ``.java`` and is immediately followed by ``|``. That path is split on
    ``/``; the last component is the file name and the directory is every
    component from the first one equal to ``root`` up to the file name.

    Args:
        entry: Source-location string containing a ``...java|`` path.
        root: Path component at which the returned directory starts.
            Defaults to ``"smallsql"``.

    Returns:
        A ``(filename, directory)`` tuple of strings.

    Raises:
        ValueError: If ``entry`` contains no ``.java|`` path, or if ``root``
            is not one of the path components.
    """
    match = re.search(r'(.+?\.java)\|', entry)
    if match is None:
        # Callers unpack or index the result, so returning None here only
        # moved the failure to a less informative place.
        raise ValueError(f"No '.java|' path found in entry: {entry!r}")

    parts = match.group(1).split("/")
    filename = parts[-1]
    if root not in parts:
        raise ValueError(f"Path component {root!r} not found in entry: {entry!r}")

    directory = "/".join(parts[parts.index(root):-1])
    return filename, directory

def extract_line_numbers(entry):
    """Return how many lines the clone fragment described by ``entry`` spans.

    The entry is searched for a ``<startLine,col>,<endLine,col>`` pair. The
    result is the inclusive line count ``endLine - startLine + 1``, or 0 when
    the entry contains no such pair.
    """
    span = re.search(r'<(\d+),\d+>,<(\d+),\d+>', entry)
    if span is not None:
        first_line, last_line = (int(number) for number in span.groups())
        # Both boundary lines belong to the fragment, hence the +1
        return last_line - first_line + 1
    return 0

def extract_total_lines(data, filename) -> int:
    """Look up the total number of lines recorded for ``filename``.

    Every key of ``data["lines_per_file"]`` is passed through
    ``extract_filename`` and the values whose file-name component equals
    ``filename`` are collected.

    Args:
        data: Report dictionary containing a ``"lines_per_file"`` mapping.
        filename: File name (without directory) to look up.

    Returns:
        The recorded value for the file. When several keys share the file
        name the first one is returned; when none does, 0 is returned. In
        both of those cases a warning is printed.
    """
    matches = [
        line_count
        for location, line_count in data["lines_per_file"].items()
        if extract_filename(location)[0] == filename
    ]
    if len(matches) != 1:
        print("Something went wrong in extract_total_lines")
    if not matches:
        # Indexing the empty list would raise; 0 is the value the treemap
        # code already checks for and skips.
        return 0
    return matches[0]

# Create Treemap

# Per-file clone metrics, keyed by file name
file_metrics = {}
# Sum of all fragment lengths; overlapping fragments are counted repeatedly here
all_cloned_lines = 0

for cc_id, fragments in data["example_clone_classes"].items():
    for fragment in fragments:
        filename, directory = extract_filename(fragment)

        if filename not in file_metrics:
            file_metrics[filename] = {
                'directory': directory,
                # Looked up once per file instead of once per fragment
                'total_lines': extract_total_lines(data, filename),
                'cloned_lines': 0,  # kept equal to the size of the set below
                'cloned_line_fragments': set(),  # distinct cloned line numbers
                'filename': filename
            }

        line_span = re.search(r'<(\d+),\d+>,<(\d+),\d+>', fragment)
        if line_span is None:
            # No line information: the file stays registered but gains no lines
            continue

        first_line = int(line_span.group(1))
        last_line = int(line_span.group(2))
        all_cloned_lines += last_line - first_line + 1

        # Collect line numbers in a set so that a line covered by several
        # fragments is counted once; summing fragment lengths could push the
        # cloned share of a file above 100 %.
        cloned_numbers = file_metrics[filename]['cloned_line_fragments']
        cloned_numbers.update(range(first_line, last_line + 1))
        file_metrics[filename]['cloned_lines'] = len(cloned_numbers)

# Convert metrics dictionary to a DataFrame
df_list = []
for metrics in file_metrics.values():

    # A total of 0 would divide by zero below, so such files are skipped
    if metrics['total_lines'] == 0:
        print("Something went wrong")
        continue

    cloned_percentage = (metrics['cloned_lines'] / metrics['total_lines']) * 100

    # Store data for Treemap
    df_list.append({
        # The stored directory already starts at the "smallsql" component, so
        # no extra prefix is added (it used to produce "smallsql/smallsql/...").
        'Path': f"{metrics['directory']}/{metrics['filename']}",
        # Second directory component when there is one, else the directory itself
        'Module': metrics['directory'].split('/')[1] if '/' in metrics['directory'] else metrics['directory'],
        'File': metrics['filename'],
        'Total Lines (Area)': metrics['total_lines'],
        'Cloned Lines': metrics['cloned_lines'],
        'Cloned % (Color)': cloned_percentage
    })

df = pd.DataFrame(df_list)

if df.empty:
    # Without rows the frame has no columns and the treemap cannot be built
    print("No file metrics available, so the treemap is not drawn.")
else:
    fig = px.treemap(
        df,
        path=[px.Constant("All"), 'Module', 'File'], # Hierarchy: Root -> Module -> File
        values='Total Lines (Area)',
        color='Cloned % (Color)',
        color_continuous_scale='Reds', # Use a heat map scale (low clone % = light, high clone % = dark red)
        range_color=[0, df['Cloned % (Color)'].max()],
        title='Clone Detection Treemap: Area=Total Lines, Color=Cloned % ',
        hover_data={'Total Lines (Area)': True, 'Cloned Lines': True, 'Cloned % (Color)': ':.2f'},
        height=600
    )

    fig.update_layout(
        margin=dict(t=50, l=10, r=10, b=10),
    )

    # Display a note about the interactive nature
    print("This is an interactive Treemap visualization. Hover over any rectangle to see the metrics, and double-click to zoom in/out of directories.")

    # Display the figure
    fig.show()

This is an interactive Treemap visualization. Hover over any rectangle to see the metrics, and double-click to zoom in/out of directories.
