# **INLP Semester Project**

## **C++ Source Code Plagiarism Checker**



In [None]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=f1980d378964213b8186abd185c3d8c6e7930f3f4d20d6028e8e879d9f651a63
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


This portion imports the necessary libraries for various tasks such as natural language processing, similarity calculation, machine learning, and PDF generation.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, logging
from difflib import SequenceMatcher
import spacy
import matplotlib.pyplot as plt
import os
import pickle
from pygments.lexers import get_lexer_by_name
from fpdf import FPDF
from google.colab import files

This part initializes the English language model for NLP tasks using SpaCy and a transformer model for code embeddings using the Hugging Face library.

In [None]:
# Silence transformers' informational output; only errors are still reported
logging.set_verbosity_error()

# spaCy English pipeline, used by extract_features_from_code
nlp = spacy.load("en_core_web_sm")

# One checkpoint supplies both the tokenizer and the embedding model
_CODE_MODEL_CHECKPOINT = "huggingface/CodeBERTa-small-v1"
tokenizer = AutoTokenizer.from_pretrained(_CODE_MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(_CODE_MODEL_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/994k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/483k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Function to get the sequence of token types from a C++ code snippet using Pygments

In [None]:
def get_token_sequence(code):
    """Lex ``code`` with the Pygments C++ lexer and return its token sequence.

    Only the first element of each pair yielded by the lexer is kept (the
    token type in Pygments' API); the second element is discarded.
    """
    cpp_lexer = get_lexer_by_name("cpp")
    sequence = []
    for token_type, _text in cpp_lexer.get_tokens(code):
        sequence.append(token_type)
    return sequence

This function extracts various features from the C++ code, including functions, variables, control flow constructs, loops, and basic code metrics.

In [None]:
def extract_features_from_code(code):
    """Collect heuristic features from a code sample using the spaCy pipeline.

    Each spaCy token is classified by the first rule that applies:

    * NOUN with dependency ``compound``: recorded as a variable, but only
      when its head is also a NOUN;
    * NOUN with dependency ``nsubj``: recorded as a function, but only when
      its head is a VERB;
    * text equal to ``if`` / ``else`` / ``switch``: recorded as control flow;
    * text equal to ``for`` / ``while`` / ``do``: recorded as a loop.

    Returns:
        ``(functions, variables, control_flow, loops, code_metrics)`` where the
        first four are lists of token texts (duplicates kept) and
        ``code_metrics`` is a dict with ``'lines_of_code'`` and
        ``'cyclomatic_complexity'``.
    """
    conditional_keywords = ('if', 'else', 'switch')
    loop_keywords = ('for', 'while', 'do')

    functions, variables, control_flow, loops = [], [], [], []

    for tok in nlp(code):
        is_noun = tok.pos_ == 'NOUN'

        if is_noun and tok.dep_ == 'compound':
            # A compound noun never falls through to the keyword checks,
            # even when its head is not a noun.
            if tok.head.pos_ == 'NOUN':
                variables.append(tok.text)
            continue

        if is_noun and tok.dep_ == 'nsubj':
            # Same for subject nouns whose head is not a verb.
            if tok.head.pos_ == 'VERB':
                functions.append(tok.text)
            continue

        word = tok.text
        if word in conditional_keywords:
            control_flow.append(word)
        elif word in loop_keywords:
            loops.append(word)

    # Complexity estimate: one path plus one per branching or looping keyword
    branch_points = len(control_flow) + len(loops)
    code_metrics = {
        'lines_of_code': code.count('\n') + 1,
        'cyclomatic_complexity': branch_points + 1,
    }

    return functions, variables, control_flow, loops, code_metrics

Function to calculate similarity between two C++ code samples

In [None]:
def _overlap_ratio(items1, items2):
    """Return shared-unique-items / larger-unique-count, or 0 if either side is empty."""
    unique1, unique2 = set(items1), set(items2)
    if not unique1 or not unique2:
        return 0
    return len(unique1 & unique2) / max(len(unique1), len(unique2))


def _metrics_similarity(metrics1, metrics2):
    """Return a symmetric score in [0, 1] for two metric dicts (1 means identical)."""
    total_difference = 0
    total_scale = 0
    # Pair metrics by key rather than by dict position, and scale each
    # difference by the larger of the two values so the score does not depend
    # on argument order and cannot drop below zero for non-negative metrics.
    for name, value1 in metrics1.items():
        value2 = metrics2.get(name, 0)
        total_difference += abs(value1 - value2)
        total_scale += max(abs(value1), abs(value2))
    if total_scale == 0:
        return 0
    return 1 - total_difference / total_scale


def calculate_similarity(code1, code2):
    """Compare two C++ code samples and score how similar they are.

    Args:
        code1: Source text of the first sample.
        code2: Source text of the second sample.

    Returns:
        A tuple ``(similarity_scores, features1, features2)``.
        ``similarity_scores`` maps each measure name to its score and ends
        with ``'overall_similarity'``, the plain average of the other entries.
        ``features1`` / ``features2`` are the tuples returned by
        ``extract_features_from_code`` for each sample.
    """
    features1 = extract_features_from_code(code1)
    features2 = extract_features_from_code(code2)

    similarity_scores = {}

    # Set-overlap scores for the four extracted feature lists, in the order
    # extract_features_from_code returns them.
    feature_names = ('function', 'variable', 'control_flow', 'loop')
    for index, name in enumerate(feature_names):
        similarity_scores[f'{name}_similarity'] = _overlap_ratio(features1[index], features2[index])

    # Code structure similarity: ratio of matching lexer token sequences
    tokens1 = get_token_sequence(code1)
    tokens2 = get_token_sequence(code2)
    similarity_scores['code_structure_similarity'] = SequenceMatcher(None, tokens1, tokens2).ratio()

    # Embedding similarity: cosine between the mean-pooled last hidden states.
    # Inputs are truncated to 512 tokens, so only the start of a long file
    # contributes to this score.
    inputs1 = tokenizer(code1, return_tensors='pt', truncation=True, max_length=512)
    inputs2 = tokenizer(code2, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        embeddings1 = model(**inputs1).last_hidden_state.mean(dim=1)
        embeddings2 = model(**inputs2).last_hidden_state.mean(dim=1)
    similarity_scores['embedding_similarity'] = torch.nn.functional.cosine_similarity(
        embeddings1, embeddings2
    ).item()

    # Code metrics similarity (symmetric and bounded, see _metrics_similarity)
    similarity_scores['code_metrics_similarity'] = _metrics_similarity(features1[4], features2[4])

    # Overall similarity: computed before its own key is added to the dict
    overall_similarity = sum(similarity_scores.values()) / len(similarity_scores)
    similarity_scores['overall_similarity'] = overall_similarity

    return similarity_scores, features1, features2

Function to generate a report

In [None]:
def _feature_section_lines(title, items1, items2):
    """Return the report lines for one feature category: Code 1's items, then the shared ones."""
    lines = [f'{title} (Total: {len(items1)} in Code 1, {len(items2)} in Code 2):']
    lines.extend(f'  {item}' for item in items1)
    lines.append('')
    lines.append(f'Shared {title}:')
    # Sorted so the report text is reproducible from run to run
    lines.extend(f'  {item}' for item in sorted(set(items1) & set(items2)))
    return lines


def generate_report(code1, code2, similarity_scores, features1, features2):
    """Build the plain-text plagiarism report for two code samples.

    Args:
        code1: Source text of the first sample.
        code2: Source text of the second sample.
        similarity_scores: Mapping of measure name to numeric score.
        features1: Feature tuple for ``code1``, laid out as
            ``(functions, variables, control_flow, loops, code_metrics)``.
        features2: Feature tuple for ``code2``, same layout.

    Returns:
        The report as a single newline-terminated string. It contains the
        scores, a per-category listing, both metric tables and every
        character range the two samples have in common.
    """
    lines = ['Plagiarism Report', '', 'Similarity Scores:']
    for score_name, score_value in similarity_scores.items():
        lines.append(f'{score_name.replace("_", " ").capitalize()}: {score_value:.2f}')

    lines.append('')
    lines.append('Detailed Comparison:')
    section_titles = ('Functions', 'Variables', 'Conditionals', 'Loops')
    for index, title in enumerate(section_titles):
        if index:
            # Blank line between categories; the first follows the heading directly
            lines.append('')
        lines.extend(_feature_section_lines(title, features1[index], features2[index]))

    for label, features in (('Code 1', features1), ('Code 2', features2)):
        lines.append('')
        lines.append(f'Code Metrics for {label}:')
        lines.extend(f'  {metric}: {value}' for metric, value in features[4].items())

    # Display the plagiarized regions
    lines.append('')
    lines.append('Plagiarized Regions:')
    for match in SequenceMatcher(None, code1, code2).get_matching_blocks():
        if match.size == 0:
            # The final block is a zero-length end marker
            continue
        # match.a / match.b are absolute offsets into code1 / code2, so they
        # are used as-is instead of being added to a running position.
        end1 = match.a + match.size
        end2 = match.b + match.size
        lines.append(f'Code 1 ({match.a}:{end1}):')
        lines.append(code1[match.a:end1])
        lines.append(f'Code 2 ({match.b}:{end2}):')
        lines.append(code2[match.b:end2])

    return '\n'.join(lines) + '\n'

Function to generate graphs for the similarity scores and the feature comparison

In [None]:
def _draw_feature_pie(subplot_index, counts, labels, title):
    """Draw one feature-distribution pie in a 1x2 grid, or a placeholder if all counts are zero."""
    plt.subplot(1, 2, subplot_index)
    if sum(counts) > 0:
        plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140)
    else:
        # Wedge sizes are fractions of the total, so an all-zero input
        # (e.g. an empty source file) has nothing to draw.
        plt.text(0.5, 0.5, 'No features detected', ha='center', va='center')
        plt.axis('off')
    plt.title(title)


def generate_graphs(similarity_scores, features1, features2):
    """Render the comparison charts to PNG files in the working directory.

    Args:
        similarity_scores: Mapping of measure name to numeric score.
        features1: Feature tuple for the first sample; the first four entries
            are the function, variable, control-flow and loop lists.
        features2: Feature tuple for the second sample, same layout.

    Returns:
        ``(similarity_scores_path, feature_comparison_path)``, the file names
        of the two images that were written.
    """
    # Horizontal bar chart with one bar per similarity measure
    plt.figure(figsize=(10, 6))
    plt.barh(list(similarity_scores.keys()), list(similarity_scores.values()), color='blue')
    plt.xlabel('Scores')
    plt.ylabel('Similarity Measures')
    plt.title('Similarity Scores for Code Comparison')
    plt.tight_layout()
    similarity_scores_path = 'similarity_scores.png'
    plt.savefig(similarity_scores_path)
    plt.close()

    # Side-by-side pie charts of how many items of each feature kind were found
    labels = ['Functions', 'Variables', 'Control Flow', 'Loops']
    values_code1 = [len(feature) for feature in features1[:4]]
    values_code2 = [len(feature) for feature in features2[:4]]

    plt.figure(figsize=(10, 6))
    _draw_feature_pie(1, values_code1, labels, 'Feature Distribution in Code 1')
    _draw_feature_pie(2, values_code2, labels, 'Feature Distribution in Code 2')
    plt.tight_layout()
    feature_comparison_path = 'feature_comparison.png'
    plt.savefig(feature_comparison_path)
    plt.close()

    return similarity_scores_path, feature_comparison_path

Function to save the report as a PDF with beautification

In [None]:
def save_report_as_pdf(report, similarity_scores_path, feature_comparison_path, filename):
    """Write the text report and both charts to a PDF file.

    Args:
        report: Report text as produced by ``generate_report``.
        similarity_scores_path: Path of the similarity bar-chart image.
        feature_comparison_path: Path of the feature pie-chart image.
        filename: Destination path of the PDF.
    """
    # Report line -> (heading text, vertical gap in mm inserted before it).
    # Lines are matched exactly so that ordinary report content which merely
    # contains one of these phrases is still printed instead of being
    # replaced by a heading.
    headings = {
        "Similarity Scores:": ("Similarity Scores", 0),
        "Detailed Comparison:": ("Detailed Comparison", 5),
        "Plagiarized Regions:": ("Plagiarized Regions", 5),
    }

    pdf = FPDF()
    pdf.add_page()

    pdf.set_font("Arial", 'B', 16)
    pdf.cell(200, 10, txt="Plagiarism Report", ln=True, align='C')

    pdf.set_font("Arial", size=12)
    pdf.ln(10)

    for line in report.split('\n'):
        heading = headings.get(line.strip())
        if heading is not None:
            title, gap = heading
            if gap:
                pdf.ln(gap)
            pdf.set_font("Arial", 'B', 14)
            pdf.cell(200, 10, txt=title, ln=True, align='L')
            pdf.set_font("Arial", size=12)
            continue

        # The built-in Arial font is Latin-1 only; substitute anything outside
        # that range so a stray character cannot abort PDF generation.
        printable = line.encode('latin-1', 'replace').decode('latin-1')
        # multi_cell wraps at the right margin, whereas cell() lets long code
        # lines run off the page.
        pdf.multi_cell(0, 10, txt=printable, align='L')

    # Each chart goes on its own page
    pdf.add_page()
    pdf.image(similarity_scores_path, x=10, y=10, w=190)
    pdf.add_page()
    pdf.image(feature_comparison_path, x=10, y=10, w=190)

    pdf.output(filename)

Function to read code from files

In [None]:
def read_code_from_file(file_path):
    """Return the text content of the source file at ``file_path``.

    The file is decoded as UTF-8 regardless of the platform default. Bytes
    that are not valid UTF-8 become U+FFFD instead of raising, so a stray
    legacy-encoded character in a comment cannot abort the comparison.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()

Prompt user to upload files

In [None]:
def upload_files():
    """Show the Colab upload dialog until exactly two files are provided.

    Returns:
        A list with the two keys of the mapping returned by ``files.upload()``.
    """
    while True:
        names = list(files.upload())
        count = len(names)
        if count == 2:
            return names
        if count < 2:
            print("Upload at least 2 files for plagiarism detection.")
        else:
            print("You can upload a maximum of 2 files for plagiarism detection.")

Open the upload dialog for the two files, then call all the necessary functions

In [None]:
# Ask for exactly two source files and load their contents
file1, file2 = upload_files()
code_sample1, code_sample2 = (read_code_from_file(path) for path in (file1, file2))

# Score the pair, then turn the scores into text, charts and a PDF
similarity_scores, features1, features2 = calculate_similarity(code_sample1, code_sample2)
report = generate_report(
    code_sample1, code_sample2, similarity_scores, features1, features2
)
similarity_scores_path, feature_comparison_path = generate_graphs(
    similarity_scores, features1, features2
)
save_report_as_pdf(
    report, similarity_scores_path, feature_comparison_path, 'plagiarism_report.pdf'
)

Saving 1.cpp to 1 (1).cpp
Saving 2.cpp to 2 (1).cpp


This section prints the report.

In [None]:
# Show the plain-text report built by generate_report in the cell output
print(report)

Plagiarism Report

Similarity Scores:
Function similarity: 1.00
Variable similarity: 1.00
Control flow similarity: 1.00
Loop similarity: 1.00
Code structure similarity: 1.00
Embedding similarity: 1.00
Code metrics similarity: 1.00
Overall similarity: 1.00

Detailed Comparison:
Functions (Total: 9 in Code 1, 9 in Code 2):
  #
  node
  head
  cin>>quantity
  quantity
  Sale"<<endl
  case
  case
  cout<<"Enter

Shared Functions:
  head
  quantity
  case
  #
  cout<<"Enter
  node
  Sale"<<endl
  cin>>quantity

Variables (Total: 15 in Code 1, 15 in Code 2):
  int
  int
  string
  float
  int
  =
  int
  head
  head
  list
  bool
  int
  cout<<"Total
  current->qty
  float

Shared Variables:
  =
  cout<<"Total
  bool
  head
  current->qty
  int
  list
  float
  string

Conditionals (Total: 9 in Code 1, 9 in Code 2):
  if
  else
  else
  if
  else
  if
  if
  else
  switch

Shared Conditionals:
  switch
  if
  else

Loops (Total: 5 in Code 1, 5 in Code 2):
  while
  for
  while
  while
  whil