In [None]:
import subprocess

import ipywidgets as widgets
import pandas as pd

from util.detection import metrics, metrics_without_count, add_basic_plagiarism_rules
from util.graph import create_plagiarism_graph, create_plotly_plagiarism_graph
from util.io import read_comparison_csv
from util.jupyter import SelectFileButton

%load_ext autoreload
%autoreload 2

### 1. Select comparison file

Run the following cell and select any comparison CSV file.

In [None]:
# Create the file-selection widget; later cells read the chosen path from `file_button.file`.
file_button = SelectFileButton()
# Leaving the widget as the cell's last expression makes the notebook render it.
file_button

Optionally, adjust `excluded_types` to match the selected comparison file. The types listed there are excluded from the plagiarism graph created in step 3.

In [None]:
# Types to leave out when reading the comparison file (an empty list excludes nothing).
excluded_types = []  # Might need to change depending on selected comparison file
# Reads the file chosen via the button above.
# NOTE(review): presumably a file must have been selected before this cell runs — confirm what `file_button.file` holds otherwise.
df = read_comparison_csv(file_button.file, excluded_types)
# Echo the selected path so the notebook records which file was loaded.
print(file_button.file)

### 2. Add plagiarism detection rules

This step is highly customizable. By default, the function `add_basic_plagiarism_rules` creates 4 rules (based on the default metrics) that are used for plagiarism detection:
- `rule0`: Matches if all metrics are $0$.
- `rule1`: Matches if `ASTCountDiffMetric` is $< 0.2$ and the sum of all metrics is $< 0.05$.
- `rule2`: Matches if `ASTCountDiffMetric` is $< 0.2$ and $\geq 3$ metrics (excluding `ASTCountDiffMetric`) are $0$.
- `rule3`: Matches if `ASTCountDiffMetric` is $< 0.2$ and `RenamedASTDiffMetric` is $< 0.01$.

Note that these rules are *not* mutually exclusive!

Select the checkboxes for the rules that should be applied in plagiarism detection.

In [None]:
# Apply the default rule set; the checkboxes below are built from the resulting
# columns whose name contains "rule".
df = add_basic_plagiarism_rules(df)

# One checkbox per rule column, with the first one ticked by default.
# The default is passed through the `value` constructor argument rather than
# assigned to `checkboxes[0]` afterwards, so the cell no longer raises an
# IndexError when no column name contains "rule".
checkboxes = [
    widgets.Checkbox(description=name, value=(position == 0))
    for position, name in enumerate(c for c in df.columns if "rule" in c)
]
widgets.HBox(checkboxes)

### 3. Create plagiarism graph

Based on the checkboxes selected above, plagiarism cases are detected and then visualized in a graph. Each edge of the graph (i.e., a plagiarism match between two students) can be clicked. In the implementation below, clicking an edge opens the diff tool [Meld](https://meldmerge.org/) with the two suspected files. Implementation note: the Meld executable must be on the `PATH` environment variable, which avoids having to specify absolute paths.

In [None]:
def show_diff(data: pd.DataFrame, blocking: bool = True, diff_tool: str = "Meld"):
    """Open a visual diff for every file pair listed in ``data``.

    Args:
        data: Frame with one row per file pair. The values of the ``file1``
            and ``file2`` columns are passed to the diff tool as its two
            arguments. An empty frame launches nothing.
        blocking: If true, each diff is started with ``subprocess.run``, so
            the next one only opens after the previous process has exited.
            Otherwise ``subprocess.Popen`` starts the processes without
            waiting for them.
        diff_tool: Name or path of the executable to launch. The default
            ``"Meld"`` keeps the previous behaviour; pass e.g. ``"meld"`` or
            an absolute path if the executable is named differently on the
            current system.

    Raises:
        FileNotFoundError: Propagated from ``subprocess`` if ``diff_tool``
            cannot be found.
    """
    # Pick the launcher once instead of branching inside the loop.
    launch = subprocess.run if blocking else subprocess.Popen
    for _, row in data.iterrows():
        launch([diff_tool, row["file1"], row["file2"]])


# Names of the rules whose checkbox is currently ticked.
selected_rules = [box.description for box in checkboxes if box.value]

# Work on a copy so the rule frame itself stays untouched. A row is flagged
# as soon as any one of the selected rule columns matches it.
df_ = df.copy()
df_["is_plag"] = False
for rule_name in selected_rules:
    df_["is_plag"] |= df[rule_name]

# Build the graph from the flagged rows only.
flagged_rows = df_[df_["is_plag"]]
graph = create_plagiarism_graph(flagged_rows)

# The click callback starts the diff tool without waiting for it to exit.
create_plotly_plagiarism_graph(
    graph,
    edge_click_callback=lambda edge_data: show_diff(edge_data, blocking=False),
)