# **INLP Semester Project**

## **C++ Source Code Plagiarism Checker**

### **Group members:**
1. Muhammad Ahtasham
2. Muhammad Aly Yousaf


This portion imports the necessary libraries for various tasks such as natural language processing, similarity calculation, machine learning, and PDF generation.

In [None]:
import os
import pickle
import re
from difflib import SequenceMatcher

import spacy
import torch
from fpdf import FPDF
from pygments.lexers import get_lexer_by_name
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    jaccard_score,
    precision_score,
    recall_score,
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from transformers import AutoTokenizer, AutoModel

This part initializes the English language model for NLP tasks using SpaCy and a transformer model for code embeddings using the Hugging Face library.

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the encoder below
_CODE_EMBEDDING_CHECKPOINT = "huggingface/CodeBERTa-small-v1"

# spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")

# Tokenizer / encoder pair used to embed code snippets
tokenizer = AutoTokenizer.from_pretrained(_CODE_EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(_CODE_EMBEDDING_CHECKPOINT)

These functions define various tasks such as extracting features from code, calculating similarity, generating reports, saving reports as PDFs, and training machine learning models.




In [None]:
# Lex a code snippet as C++ with Pygments and collect its token types
def get_token_sequence(code):
    """Return the Pygments token type of every token in ``code``.

    The snippet is lexed with the ``cpp`` lexer. Only the first element of
    each pair yielded by the lexer (the token type) is kept; the matched
    text is discarded.
    """
    cpp_lexer = get_lexer_by_name("cpp")
    token_types = []
    for token_type, _text in cpp_lexer.get_tokens(code):
        token_types.append(token_type)
    return token_types

This function extracts various features from the C++ code, including functions, variables, control flow constructs, loops, and basic code metrics.

In [None]:
# Re-declares the helper from the preceding cell with identical behavior:
# lex the snippet as C++ with Pygments and keep each token's type.
def get_token_sequence(code):
    """Return the list of Pygments token types for ``code`` lexed as C++."""
    type_text_pairs = get_lexer_by_name("cpp").get_tokens(code)
    return [token_type for token_type, _text in type_text_pairs]

# Matches C++ comments and string/character literals so that keywords that
# merely appear inside them are not mistaken for real statements.
_CPP_COMMENT_OR_LITERAL = re.compile(
    r"//[^\n]*"                 # line comment
    r"|/\*.*?\*/"               # block comment
    r'|"(?:\\.|[^"\\\n])*"'     # string literal
    r"|'(?:\\.|[^'\\\n])*'",    # character literal
    re.DOTALL,
)
# Whole-word branching / looping keywords.
_CPP_FLOW_KEYWORD = re.compile(r"\b(if|else|switch|for|while|do)\b")
_CPP_BRANCH_KEYWORDS = frozenset({"if", "else", "switch"})


# Function to extract features from C++ code
def extract_features_from_code(code):
    """Extract coarse structural features from a C++ snippet.

    Args:
        code: C++ source text.

    Returns:
        A 5-tuple ``(functions, variables, control_flow, loops, code_metrics)``:

        * ``functions`` / ``variables`` -- token texts picked by part-of-speech
          and dependency heuristics of the spaCy pipeline ``nlp``. The
          pipeline is an English-language model, so these are rough guesses
          rather than parsed C++ identifiers.
        * ``control_flow`` -- every ``if`` / ``else`` / ``switch`` keyword, in
          source order.
        * ``loops`` -- every ``for`` / ``while`` / ``do`` keyword, in source
          order (a ``do ... while`` loop contributes both keywords).
        * ``code_metrics`` -- ``lines_of_code`` and an approximate
          ``cyclomatic_complexity`` (branch keywords + loop keywords + 1).
    """
    doc = nlp(code)

    functions = []
    variables = []
    for token in doc:
        if token.pos_ != 'NOUN':
            continue
        if token.dep_ == 'compound':
            if token.head.pos_ == 'NOUN':
                variables.append(token.text)
        elif token.dep_ == 'nsubj':
            if token.head.pos_ == 'VERB':
                functions.append(token.text)

    # Keywords are matched on the raw source rather than on spaCy tokens:
    # the English tokenizer can glue a keyword to adjacent punctuation
    # (e.g. ``switch(op``), and words inside comments or string literals
    # (e.g. "check if a number is prime") must not count as statements.
    code_only = _CPP_COMMENT_OR_LITERAL.sub(' ', code)
    control_flow = []
    loops = []
    for keyword_match in _CPP_FLOW_KEYWORD.finditer(code_only):
        keyword = keyword_match.group(1)
        if keyword in _CPP_BRANCH_KEYWORDS:
            control_flow.append(keyword)
        else:
            loops.append(keyword)

    code_metrics = {
        'lines_of_code': len(code.split('\n')),
        'cyclomatic_complexity': len(control_flow) + len(loops) + 1
    }

    return functions, variables, control_flow, loops, code_metrics

def _overlap_ratio(items1, items2):
    """Return |A & B| / max(|A|, |B|) for the two collections viewed as sets.

    Returns 0 when either collection is empty.
    """
    set1, set2 = set(items1), set(items2)
    if not set1 or not set2:
        return 0
    return len(set1 & set2) / max(len(set1), len(set2))


def _mean_code_embedding(code):
    """Return the mean-pooled last hidden state of the encoder for ``code``."""
    # truncation=True caps the input at the tokenizer's configured maximum
    # length, so long source files do not overflow the encoder's input size.
    inputs = tokenizer(code, return_tensors='pt', truncation=True)
    with torch.no_grad():
        return model(**inputs).last_hidden_state.mean(dim=1)


# Function to calculate similarity between two C++ code samples
def calculate_similarity(code1, code2):
    """Score how similar two C++ snippets are along several axes.

    Args:
        code1: First C++ source text.
        code2: Second C++ source text.

    Returns:
        ``(similarity_scores, features1, features2)`` where
        ``similarity_scores`` maps score names to numbers and ``features1`` /
        ``features2`` are the tuples returned by ``extract_features_from_code``
        for each snippet. ``overall_similarity`` is the plain average of the
        seven individual scores.

    Note:
        The four set-based scores are 0 when either snippet has no item of
        that kind, including when both have none.
    """
    features1 = extract_features_from_code(code1)
    features2 = extract_features_from_code(code2)

    similarity_scores = {
        'function_similarity': _overlap_ratio(features1[0], features2[0]),
        'variable_similarity': _overlap_ratio(features1[1], features2[1]),
        'control_flow_similarity': _overlap_ratio(features1[2], features2[2]),
        'loop_similarity': _overlap_ratio(features1[3], features2[3]),
    }

    # Code structure similarity: compare the Pygments token sequences
    tokens1 = get_token_sequence(code1)
    tokens2 = get_token_sequence(code2)
    similarity_scores['code_structure_similarity'] = SequenceMatcher(None, tokens1, tokens2).ratio()

    # Token embedding similarity using transformer model
    embeddings1 = _mean_code_embedding(code1)
    embeddings2 = _mean_code_embedding(code2)
    similarity_scores['embedding_similarity'] = cosine_similarity(embeddings1, embeddings2).item()

    # Code metrics similarity. Each metric's absolute difference is
    # normalised by the larger of the two values, which keeps the score
    # symmetric in (code1, code2) and inside [0, 1]; dividing by the first
    # snippet's totals alone could yield negative scores.
    metrics1 = features1[4]
    metrics2 = features2[4]
    metric_pairs = [(metrics1[name], metrics2[name]) for name in metrics1 if name in metrics2]
    scale = sum(max(value1, value2) for value1, value2 in metric_pairs)
    if scale > 0:
        difference = sum(abs(value1 - value2) for value1, value2 in metric_pairs)
        metrics_similarity = 1 - difference / scale
    else:
        metrics_similarity = 0
    similarity_scores['code_metrics_similarity'] = metrics_similarity

    # Overall similarity (average of all similarities)
    overall_similarity = sum(similarity_scores.values()) / len(similarity_scores)
    similarity_scores['overall_similarity'] = overall_similarity

    return similarity_scores, features1, features2

# Function to generate a report
def generate_report(code1, code2, similarity_scores, features1, features2, min_match_size=1):
    """Build the plain-text plagiarism report for two code samples.

    Args:
        code1: First source text.
        code2: Second source text.
        similarity_scores: Mapping of score name to numeric score.
        features1: Feature tuple for ``code1`` in the order
            ``(functions, variables, control_flow, loops, code_metrics)``.
        features2: Feature tuple for ``code2`` in the same order.
        min_match_size: Shortest character run (inclusive) reported under
            "Plagiarized Regions". The default of 1 reports every match.

    Returns:
        The report as a single newline-separated string.
    """
    parts = ['Plagiarism Report\n\n', 'Similarity Scores:\n']
    parts.extend(
        f'{score_name.replace("_", " ").capitalize()}: {score_value:.2f}\n'
        for score_name, score_value in similarity_scores.items()
    )

    parts.append('\nDetailed Comparison:\n')
    section_titles = ('Functions', 'Variables', 'Conditionals', 'Loops')
    for index, title in enumerate(section_titles):
        items1, items2 = features1[index], features2[index]
        # The first section follows the "Detailed Comparison" header directly;
        # later sections are separated from the previous one by a blank line.
        separator = '' if index == 0 else '\n'
        parts.append(f'{separator}{title} (Total: {len(items1)} in Code 1, {len(items2)} in Code 2):\n')
        parts.extend(f'  {item}\n' for item in items1)
        parts.append(f'\nShared {title}:\n')
        # Sorted so the report is identical from run to run (set order is not).
        parts.extend(f'  {item}\n' for item in sorted(set(items1) & set(items2)))

    parts.append('\nCode Metrics for Code 1:\n')
    parts.extend(f'  {metric}: {value}\n' for metric, value in features1[4].items())

    parts.append('\nCode Metrics for Code 2:\n')
    parts.extend(f'  {metric}: {value}\n' for metric, value in features2[4].items())

    # Display the plagiarized regions
    parts.append('\nPlagiarized Regions:\n')
    for block in SequenceMatcher(None, code1, code2).get_matching_blocks():
        # get_matching_blocks() ends with a zero-length sentinel; skip it and
        # anything shorter than the requested minimum.
        if block.size == 0 or block.size < min_match_size:
            continue
        # block.a / block.b are absolute offsets into code1 / code2, so they
        # are used directly rather than accumulated across matches.
        end1 = block.a + block.size
        end2 = block.b + block.size
        parts.append(f'Code 1 ({block.a}:{end1}):\n{code1[block.a:end1]}\n')
        parts.append(f'Code 2 ({block.b}:{end2}):\n{code2[block.b:end2]}\n')

    return ''.join(parts)


# Function to save the report as a PDF
def save_report_as_pdf(report, filename):
    """Write the plain-text ``report`` to a PDF file.

    Args:
        report: Report text; it is split on newlines and each line is written
            as its own paragraph.
        filename: Path of the PDF file to create.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for line in report.split('\n'):
        # The core Arial font only covers Latin-1. Characters outside that
        # range (which can appear in quoted source code) are replaced with
        # "?" so that writing the PDF does not fail on them.
        printable = line.encode('latin-1', 'replace').decode('latin-1')
        # multi_cell with width 0 extends to the right margin and wraps, so
        # long lines stay on the page.
        pdf.multi_cell(0, 10, txt=printable, align='L')
    pdf.output(filename)

# Function to train and save a machine learning model
def train_and_save_model(dataset_folder):
    """Train a TF-IDF + Naive Bayes classifier on ``.cpp`` files and pickle it.

    Every ``.cpp`` file found under ``dataset_folder`` (searched recursively)
    becomes one sample. The sample is labelled 0 when the path of its
    directory contains ``'plagiarized'`` and 1 otherwise. The data is split
    80/20 with a fixed random seed, the classifier is evaluated on the held-out
    part, and the metrics are printed.

    The fitted vectorizer and classifier are written, in that order, as two
    consecutive pickles to ``trained_model.pkl`` in the current working
    directory; a reader must call ``pickle.load`` twice on the same file
    handle to get both back.

    Args:
        dataset_folder: Root directory of the labelled dataset.

    Raises:
        ValueError: If no ``.cpp`` file is found under ``dataset_folder``.
    """
    corpus = []
    labels = []
    for root, _, files in os.walk(dataset_folder):
        # Assign labels based on the folder structure:
        # 0 for plagiarized, 1 for non-plagiarized.
        # NOTE(review): this is a substring test on the whole directory path,
        # so a directory named e.g. "non-plagiarized" (or a dataset root whose
        # own path contains "plagiarized") is also labelled 0 -- confirm
        # against the actual dataset layout.
        label = 0 if 'plagiarized' in root else 1
        for file in files:
            if not file.endswith('.cpp'):
                continue
            file_path = os.path.join(root, file)
            # Explicit encoding: source files with stray non-UTF-8 bytes
            # should not abort the whole training run.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                corpus.append(f.read())
            labels.append(label)

    if not corpus:
        raise ValueError(f"No .cpp files found under {dataset_folder!r}")

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    y = labels

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Naive Bayes classifier (named so it does not shadow the
    # module-level transformer ``model``)
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    # Evaluate the model
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    # Save the trained model and vectorizer
    with open('trained_model.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)
        pickle.dump(classifier, f)

These are examples of C++ code snippets that will be used to demonstrate the functionalities of the provided functions.

In [None]:
# Example usage with sample code inputs.
# Sample 1: a C++ program with two helper functions (factorial, isPrime) and a
# main() that exercises them with several for-loops and if/else branches.
code_sample1 = '''
#include <iostream>
using namespace std;

// Function to calculate the factorial of a number
int factorial(int n) {
    if (n <= 1)
        return 1;
    else
        return n * factorial(n - 1);
}

// Function to check if a number is prime
bool isPrime(int n) {
    if (n <= 1)
        return false;
    for (int i = 2; i * i <= n; i++) {
        if (n % i == 0)
            return false;
    }
    return true;
}

int main() {
    int num1 = 10;
    int num2 = 20;
    int sum = num1 + num2;

    // Loop to print numbers from 1 to 10
    for (int i = 1; i <= 10; i++) {
        cout << i << " ";
    }
    cout << endl;

    // Loop to calculate and print the factorial of numbers from 1 to 5
    for (int i = 1; i <= 5; i++) {
        cout << "Factorial of " << i << " is " << factorial(i) << endl;
    }

    // Loop to check and print prime numbers between 1 and 20
    cout << "Prime numbers between 1 and 20: ";
    for (int i = 1; i <= 20; i++) {
        if (isPrime(i))
            cout << i << " ";
    }
    cout << endl;

    // Conditional to check the sum of two numbers
    if (sum > 50) {
        cout << "Sum is greater than 50" << endl;
    } else {
        cout << "Sum is less than or equal to 50" << endl;
    }

    return 0;
}
'''

# Sample 2: a C++ calculator that reads two numbers and an operator from
# standard input and dispatches on the operator with a switch statement.
code_sample2 = '''
#include <iostream>
using namespace std;

int main() {
    int x, y;
    char op;

    cout << "Enter first number: ";
    cin >> x;
    cout << "Enter second number: ";
    cin >> y;
    cout << "Enter operation (+, -, *, /): ";
    cin >> op;

    switch(op) {
        case '+':
            cout << "Result: " << (x + y) << endl;
            break;
        case '-':
            cout << "Result: " << (x - y) << endl;
            break;
        case '*':
            cout << "Result: " << (x * y) << endl;
            break;
        case '/':
            if (y != 0)
                cout << "Result: " << (x / y) << endl;
            else
                cout << "Division by zero error!" << endl;
            break;
        default:
            cout << "Invalid operation!" << endl;
    }

    return 0;
}
'''

This part calculates the similarity between the two code samples, generates graphs, generates a plagiarism report, and saves the report as a PDF. The report itself is printed in a later section.

In [None]:
# Score the two samples; the call also returns the features extracted from each
similarity_scores, features1, features2 = calculate_similarity(code_sample1, code_sample2)
# NOTE(review): generate_graphs is neither defined nor imported in the visible
# part of this notebook; unless it is defined elsewhere this call raises
# NameError -- confirm.
generate_graphs(similarity_scores, features1, features2)
# Build the plain-text report and write it to a PDF file
report = generate_report(code_sample1, code_sample2, similarity_scores, features1, features2)
save_report_as_pdf(report, 'plagiarism_report.pdf')

This section involves training a machine learning model using a dataset of C++ code files and saving the trained model.

In [None]:
# Train and save the model
# NOTE(review): the path looks like a Google Drive mount inside Colab; adjust
# it when running in another environment.
dataset_folder = '/content/drive/MyDrive/cpd/dataset'
train_and_save_model(dataset_folder)

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


This section prints the report.

In [None]:
# Show the plain-text report that was built by generate_report in an earlier cell
print(report)

Plagiarism Report

Similarity Scores:
Function similarity: 0.00
Variable similarity: 0.00
Control flow similarity: 1.00
Loop similarity: 0.00
Code structure similarity: 0.10
Embedding similarity: 0.84
Code metrics similarity: 0.59
Overall similarity: 0.36

Detailed Comparison:
Functions (Total: 2 in Code 1, 1 in Code 2):
  Function
  =

Shared Functions:

Variables (Total: 1 in Code 1, 2 in Code 2):
  int

Shared Variables:

Conditionals (Total: 8 in Code 1, 2 in Code 2):
  if
  else
  if
  if
  if
  if
  if
  else

Shared Conditionals:
  if
  else

Loops (Total: 4 in Code 1, 0 in Code 2):
  for
  for
  for
  for

Shared Loops:

Code Metrics for Code 1:
  lines_of_code: 57
  cyclomatic_complexity: 13

Code Metrics for Code 2:
  lines_of_code: 38
  cyclomatic_complexity: 3

Plagiarized Regions:
Code 1 (0:43):

#include <iostream>
using namespace std;


Code 2 (0:43):

#include <iostream>
using namespace std;


Code 1 (112:113):
n
Code 2 (118:119):
;
Code 1 (184:186):
ri
Code 2 (221:223)

In [None]:
# Install the fpdf package that provides the FPDF class imported in the first
# code cell; it has to be installed before that import can succeed.
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... done
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40702 sha256=8b33ca693358c5f61644f9f2e822b4fb74bb58f3f711d8917be88c2f2f245d86
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
