In [9]:
import re
import json
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Function to clean code
def clean_code(code):
    code = re.sub(r'\/\*[\s\S]*?\*\/', '', code)  # Remove multiline comments
    code = re.sub(r'\/\/.*', '', code)  # Remove single line comments
    code = re.sub(r'\s*\n\s*', '\n', code)  # Remove extra whitespace around newlines
    return code.strip()

# Function to label code (0 for safe, 1 for unsafe)
def label_code(snippet):
    unsafe_patterns = {
        'Runtime.getRuntime().exec': 'Command Execution',
        'ProcessBuilder': 'Command Execution',
        'FileOutputStream': 'File I/O',
        'FileInputStream': 'File I/O',
        'BufferedWriter': 'File I/O',
        'BufferedReader': 'File I/O',
        'ObjectInputStream': 'Deserialization',
        'ObjectOutputStream': 'Serialization',
        'setAccessible': 'Reflection',
        'getDeclaredField': 'Reflection',
        'getDeclaredMethod': 'Reflection',
        'URLClassLoader': 'URL Class Loading',
        'URLConnection': 'Network Communication',
        'Socket': 'Network Communication',
        'ServerSocket': 'Network Communication',
        'printStackTrace': 'Information Disclosure',
        'getSystemProperty': 'Information Disclosure',
        'System.loadLibrary': 'Library Loading',
        'Class.forName': 'Dynamic Class Loading',
        'Method.invoke': 'Reflection',
        'ObjectInputStream.readObject': 'Deserialization',
        'DriverManager.getConnection': 'SQL Injection',
        'Statement.execute': 'SQL Injection',
        'ResultSet.getString': 'SQL Injection',
        'MessageDigest.getInstance("MD5")': 'Insecure Hashing',
        'MessageDigest.getInstance("SHA1")': 'Insecure Hashing',
        'Random': 'Insecure Randomness',
        'Math.random': 'Insecure Randomness',
        'Cipher.getInstance("DES")': 'Insecure Encryption',
        'Cipher.getInstance("Blowfish")': 'Insecure Encryption',
        'SSLContext': 'SSL Context',
        'TrustManager': 'SSL Trust Management',
        'HostnameVerifier': 'SSL Hostname Verification',
        'new File': 'File Creation',
        'XStream': 'XML Serialization',
        'JavaSerializer': 'Java Serialization',
        'System.out.println': 'Information Disclosure',
        'password': 'Sensitive Data Exposure',
        'FileWriter': 'Sensitive Data Exposure',
        'FileReader': 'Sensitive Data Exposure',
        'BufferedReader': 'Sensitive Data Exposure',
        'System.out.println': 'Information Disclosure',
        'ObjectInputStream': 'Unsafe Deserialization',
        'file.setReadable': 'Improper File Permissions'
    }

    unsafe_lines = []
    for line in snippet.split('\n'):
        for pattern, vuln_type in unsafe_patterns.items():
            if pattern in line:
                unsafe_lines.append((line.strip(), vuln_type))
    return unsafe_lines

# Load pre-trained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
model = RobertaModel.from_pretrained('microsoft/codebert-base')

# Load existing preprocessed dataset
with open('java.json', 'r') as f:
    data = json.load(f)

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Tokenize and get embeddings for the code snippets
inputs = tokenizer([d['code'] for d in data], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
embeddings = outputs.last_hidden_state.mean(dim=1).numpy()

In [12]:
# Create labels
labels = np.array([d['label'] for d in data])

# Train a simple classifier
clf = RandomForestClassifier()
clf.fit(embeddings, labels)

# Testing the classifier with a new Java code snippet
test_code = """
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.io.File;
    import javax.crypto.Cipher;
    import java.util.Random;

    public class Test {
        public static void main(String[] args) {
            // Unsafe code snippets
            File file = new File("test.txt");
            Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/mydb", "user", "pass");
            Cipher cipher = Cipher.getInstance("DES");
            Random rand = new Random();
            System.out.println("Hello World");
        }
    }
"""

cleaned_test_code = clean_code(test_code)
unsafe_lines = label_code(cleaned_test_code)

print("Unsafe Lines of Code with Vulnerability Types:\n")
for line, vuln_type in unsafe_lines:
    print(f"{line} - {vuln_type}")

test_inputs = tokenizer([cleaned_test_code], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    test_outputs = model(**test_inputs)
test_embeddings = test_outputs.last_hidden_state.mean(dim=1).numpy()
prediction = clf.predict(test_embeddings)
# print("\nPrediction:", prediction)  # Should output 1 for unsafe

Unsafe Lines of Code with Vulnerability Types:

import java.util.Random; - Insecure Randomness
File file = new File("test.txt"); - File Creation
Connection conn = DriverManager.getConnection("jdbc:mysql: - SQL Injection
Cipher cipher = Cipher.getInstance("DES"); - Insecure Encryption
Random rand = new Random(); - Insecure Randomness
System.out.println("Hello World"); - Information Disclosure


In [None]:
!pip install gradio




In [15]:
import gradio as gr

def detect_unsafe_code(code):
    cleaned_code = clean_code(code)
    unsafe_lines = label_code(cleaned_code)

    # Format the output
    formatted_output = "\n".join([f"{line[0]} - {line[1]}" for line in unsafe_lines])
    return formatted_output

iface = gr.Interface(
    fn=detect_unsafe_code,
    inputs="text",
    outputs="text",
    title="Java Code Safety Analyzer",
    description="Enter your Java code to detect unsafe lines."
)

iface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://d5eb353a85d6cc1a54.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


