In [12]:
import re
import json
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Matches either a string/template literal (captured, so it can be kept) or a
# comment (not captured, so it is dropped). Alternatives are tried left to right
# at each position, so a comment marker inside a string never starts a comment.
_STRING_OR_COMMENT_RE = re.compile(
    r'''
    (?P<string>
        "(?:\\[\s\S]|[^"\\\n])*"    # double-quoted string
      | '(?:\\[\s\S]|[^'\\\n])*'    # single-quoted string
      | `(?:\\[\s\S]|[^`\\])*`      # template literal (may span lines)
    )
    | /\*[\s\S]*?\*/                # multiline comment
    | //[^\n]*                      # single line comment
    | \#[^\n]*                      # python-style comment
    ''',
    re.VERBOSE,
)


# Function to clean code
def clean_code(code):
    """Strip comments and normalise whitespace around newlines.

    Removes ``/* ... */``, ``// ...`` and ``# ...`` comments, collapses any
    whitespace surrounding a newline into a single newline, and trims the
    result. Comment markers that occur inside quoted strings or template
    literals (for example ``"http://host"`` or ``'#output'``) are left
    untouched, so the code following them on the same line is preserved.

    This is a best-effort lexical pass, not a JavaScript parser: regex
    literals containing quote characters are not recognised.

    Args:
        code: Source text to clean.

    Returns:
        The cleaned source text.
    """
    def _keep_strings(match):
        literal = match.group('string')
        # Compare against None: an empty literal such as "" is falsy but must be kept.
        return literal if literal is not None else ''

    code = _STRING_OR_COMMENT_RE.sub(_keep_strings, code)
    code = re.sub(r'\s*\n\s*', '\n', code)  # Remove extra whitespace around newlines
    return code.strip()

# Tokens that flag a line as unsafe, mapped to the vulnerability type reported for them.
_UNSAFE_PATTERNS = {
    'eval': 'Code Injection',                        # Executes code from a string
    'innerHTML': 'Cross-Site Scripting (XSS)',       # Inserts HTML content into an element
    'outerHTML': 'Cross-Site Scripting (XSS)',       # Similar to innerHTML but for the element itself
    'document.write': 'Cross-Site Scripting (XSS)',  # Writes directly to the HTML document
    'document.writeln': 'Cross-Site Scripting (XSS)',  # Same sink as document.write, with a trailing newline
    'setTimeout': 'Potential Code Injection',        # Executes code after a delay
    'setInterval': 'Potential Code Injection',       # Repeatedly executes code at intervals
    'Function': 'Code Injection',                    # Creates new functions from strings
    'location.href': 'Open Redirect',                # Changes the URL of the current page
    'document.location': 'Open Redirect',            # Similar to location.href
    'XMLHttpRequest': 'Sensitive Data Exposure',     # Handles HTTP requests, can be misused
    '<script>': 'Cross-Site Scripting (XSS)',        # Directly injects JavaScript code
    'document.body.innerHTML': 'Cross-Site Scripting (XSS)'  # Sets the HTML content of the body element
}


def _compile_unsafe_pattern(token):
    """Compile ``token`` so it matches only as a whole identifier, not inside a longer one."""
    ident_char = r'[\w$]'
    # Only guard an edge that is itself an identifier character; '<script>' needs no guard.
    head = r'(?<![\w$])' if re.match(ident_char, token[0]) else ''
    tail = r'(?![\w$])' if re.match(ident_char, token[-1]) else ''
    return re.compile(head + re.escape(token) + tail)


_UNSAFE_MATCHERS = [
    (_compile_unsafe_pattern(token), vuln_type)
    for token, vuln_type in _UNSAFE_PATTERNS.items()
]


# Function to find the unsafe lines in a snippet
def label_code(snippet):
    """List the lines of ``snippet`` that use a known-unsafe JavaScript construct.

    Each line is checked against every entry of ``_UNSAFE_PATTERNS``. A token
    matches only when it is not embedded in a longer identifier, so ``eval(x)``
    is flagged while ``evaluate(x)`` or ``retrieval`` are not. A given
    vulnerability type is reported at most once per line, even when several
    overlapping tokens (e.g. ``innerHTML`` and ``document.body.innerHTML``)
    match it.

    Args:
        snippet: Source text, one statement or more per line.

    Returns:
        A list of ``(stripped_line, vulnerability_type)`` tuples in source
        order; empty when nothing is flagged.
    """
    unsafe_lines = []
    for line in snippet.split('\n'):
        reported_types = set()
        for matcher, vuln_type in _UNSAFE_MATCHERS:
            if vuln_type in reported_types:
                continue
            if matcher.search(line):
                reported_types.add(vuln_type)
                unsafe_lines.append((line.strip(), vuln_type))
    return unsafe_lines

# Load pre-trained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
model = RobertaModel.from_pretrained('microsoft/codebert-base')

# Load existing preprocessed dataset.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('javascript.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


In [8]:
# Tokenize and get embeddings for the code snippets
def embed_snippets(snippets, batch_size=16):
    """Return one mean-pooled embedding per snippet as a 2-D numpy array.

    Snippets are encoded in mini-batches so the whole dataset is never pushed
    through the model at once. Within a batch, shorter snippets are padded to
    the longest one; the attention mask is used to exclude those padding
    positions from the mean, so a snippet's embedding does not depend on what
    else happens to be in its batch.

    Args:
        snippets: List of source strings to embed.
        batch_size: Number of snippets encoded per forward pass.

    Returns:
        Array with one row per snippet.

    Raises:
        ValueError: If ``snippets`` is empty.
    """
    if not snippets:
        raise ValueError("snippets must contain at least one code string")

    pooled_batches = []
    for start in range(0, len(snippets), batch_size):
        batch = snippets[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; trailing axis added to broadcast over features.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append(((hidden * mask).sum(dim=1) / token_counts).numpy())
    return np.concatenate(pooled_batches, axis=0)


embeddings = embed_snippets([d['code'] for d in data])

In [9]:
# Create labels
labels = np.array([d['label'] for d in data])

# Train a simple classifier.
# A fixed random_state makes the fitted forest (and therefore its predictions)
# reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)
clf.fit(embeddings, labels)

# Testing the classifier with a new code snippet
test_code = """
    var userInput = '{"name": "John", "age": 30}';
    var userObj;
    try {
        userObj = JSON.parse(userInput);  // Safely parse JSON without using eval
    } catch (e) {
        console.error("Invalid JSON input");
    }

    var userInput = "alert('Hello, world!')";
    eval(userInput);  // Unsafe
    var safe = 1243;
    document.getElementById('output').innerHTML = userInput;  // Unsafe
    document.write(userInput);  // Unsafe


"""
# Comments are stripped before the pattern scan so that words inside comments
# (e.g. "without using eval") are not reported as findings.
cleaned_test_code = clean_code(test_code)
unsafe_lines = label_code(cleaned_test_code)

print("Unsafe Lines of Code with Vulnerability Types:\n")
for line, vuln_type in unsafe_lines:
    print(f"{line} - {vuln_type}")

# Embed the cleaned snippet and classify it. A single-snippet batch contains no
# padding tokens, so the plain mean over the sequence axis averages real tokens only.
test_inputs = tokenizer([cleaned_test_code], padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    test_outputs = model(**test_inputs)
test_embeddings = test_outputs.last_hidden_state.mean(dim=1).numpy()
# The prediction is kept in a variable for inspection; it is intentionally not printed.
prediction = clf.predict(test_embeddings)



Unsafe Lines of Code with Vulnerability Types:

eval(userInput); - Code Injection
document.getElementById('output').innerHTML = userInput; - Cross-Site Scripting (XSS)
document.write(userInput); - Cross-Site Scripting (XSS)


In [2]:
!pip install gradio


Defaulting to user installation because normal site-packages is not writeable
Collecting gradio
  Downloading gradio-4.39.0-py3-none-any.whl (12.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m473.8 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting fastapi
  Downloading fastapi-0.111.1-py3-none-any.whl (92 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 KB[0m [31m256.9 kB/s[0m eta [36m0:00:00[0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting pydantic>=2.0
  Downloading pydantic-2.8.2-py3-none-any.whl (423 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.9/423.9 KB[0m [31m643.4 kB/s[0m eta [36m0:00:00[0m1m641.1 kB/s[0m eta [36m0:00:01[0m
Collecting importlib-resources<7.0,>=1.3
  Downloading importlib_resources-6.4.0-py3-none-any.whl (38 kB)
Collecting typer<1.0,>=0.12
  Downloading typer-0.12.3-py3-none-any.wh

Collecting starlette<0.38.0,>=0.37.2
  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 KB[0m [31m691.9 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m
Collecting pygments<3.0.0,>=2.13.0
  Downloading pygments-2.18.0-py3-none-any.whl (1.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m971.9 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting markdown-it-py>=2.2.0
  Downloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.5/87.5 KB[0m [31m789.5 kB/s[0m eta [36m0:00:00[0m1m960.2 kB/s[0m eta [36m0:00:01[0m
Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0
  Downloading uvloop-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [1]:
import gradio as gr

# Function to detect unsafe code
def detect_unsafe_code(code):
    """Build a text report of the unsafe lines found in ``code``.

    The input is cleaned with ``clean_code`` and scanned with ``label_code``.
    Every finding becomes one ``"<line> - <vulnerability type>"`` row, and the
    rows are joined with newlines. The result is an empty string when nothing
    is flagged.
    """
    findings = label_code(clean_code(code))

    report_rows = []
    for flagged_line, vuln_type in findings:
        report_rows.append(f"{flagged_line} - {vuln_type}")
    return "\n".join(report_rows)

# Create and launch the Gradio interface
# Plain text in, plain text out: the pasted code goes to detect_unsafe_code and
# its report is shown as-is.
interface_options = {
    "fn": detect_unsafe_code,
    "inputs": "text",
    "outputs": "text",
    "title": "JavaScript Code Safety Analyzer",
    "description": "Enter your JavaScript code to detect unsafe lines.",
}
iface = gr.Interface(**interface_options)

iface.launch()


ModuleNotFoundError: No module named 'gradio'