In [5]:
import re
import csv
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint
import os

In [3]:
import pytesseract
from PIL import Image
from transformers import pipeline
from langid import classify
import regex

def analyze_code_snippet(image_path):
    """OCR a code-snippet image and report its language, an explanation and its author.

    Parameters
    ----------
    image_path : str
        Path of the image file to analyse.

    Returns
    -------
    dict or None
        ``{"language": ..., "author": ..., "confidence": ...}``.  ``language``
        and ``confidence`` are ``None`` when language detection fails.
        ``None`` is returned when the image cannot be opened or OCR'd.
    """
    # Step 1: Extract text from the image using OCR
    try:
        # The context manager releases the file handle as soon as OCR is done.
        with Image.open(image_path) as image:
            extracted_text = pytesseract.image_to_string(image)
        print(f"Extracted Text:\n{extracted_text}")
    except Exception as e:
        print("Error extracting text:", e)
        return

    # Step 2: Identify programming language
    language = None
    confidence = None
    try:
        code_detector = pipeline("text-classification", model="huggingface/CodeBERTa-language-id")
        # truncation=True keeps long OCR output within the model's input limit.
        language_prediction = code_detector(extracted_text, truncation=True)
        language = language_prediction[0]['label']
        confidence = language_prediction[0]['score']
    except Exception as e:
        print("Error identifying language:", e)
        language = None
        confidence = None

    # The explanation is optional: a failure here must not discard the
    # language that was already detected above.
    try:
        code_explainer = pipeline("summarization", model="ashwinR/CodeExplainer")
        explanation = code_explainer(extracted_text)[0]['summary_text']
        print(f"Code Explanation:\n{explanation}")
    except Exception as e:
        print("Error explaining code:", e)

    if language is not None:
        print(f"Detected Language: {language} (Confidence: {confidence:.2%})")

    # Step 3: Search for author's name
    # Only spaces/tabs are allowed around and inside the name so the match
    # cannot run past the end of the line into the following code.
    author_pattern = r'author[ \t]*[:=][ \t]*["\']?(\w[\w \t]*)["\']?'
    author_match = regex.search(author_pattern, extracted_text, regex.IGNORECASE)
    author = author_match.group(1).strip() if author_match else "Unknown"
    print(f"Author: {author}")

    # Step 4: Show accuracy metrics
    # NOTE: this is the classifier's confidence for this one prediction, not
    # an accuracy measured against ground truth.
    if confidence is not None:
        accuracy_score = confidence * 100
        print(f"Accuracy: {accuracy_score:.2f}%")
    else:
        print("Accuracy: N/A")

    return {"language": language, "author": author, "confidence": confidence}





# Example usage
if __name__ == "__main__":
    # Raw string: the Windows path contains backslash sequences (\A, \c, \d, \e)
    # that are invalid escapes in a normal string literal and trigger a
    # SyntaxWarning on recent Python versions. The resulting value is unchanged.
    image_path = r"D:\AI-Projects\code-classification\data\escline_InstallCert_InstallCert_part6.png"  # Replace with your image path
    analyze_code_snippet(image_path)


Extracted Text:
private static class SavingTrustManager implements X509TrustManager {

private final X509TrustManager tm;
private X509Certificate[] chain;

SavingTrustManager(X509TrustManager tm) {
this.tm = tm;
}

public X509Certificate[] getAcceptedIssuers() {
// This change has been done due to the following resolution advised for Java 1.7+
// http://infposs.blogspot.kr/2013/06/installcert-and-java-7.html
return new X509Certificate[0];
//throw new UnsupportedOperationException() ;

}

public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
throw new UnsupportedOperationException();

}

public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
this.chain = chain;
tm.checkServerTrusted(chain, authType) ;




Some weights of the model checkpoint at huggingface/CodeBERTa-language-id were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu


Code Explanation:

1. Create a private class SavingTrustManager that will hold the X509TrustManager and its chain.
2. Create an empty array.
3. In the constructor, we will return an empty list.
4. In this case we will call the getAcceptedIssuers() method of the class.
5. Return the empty array if there are no acceptable issuers.
6. In other case, the call to this method will return null.


Detected Language: java (Confidence: 99.99%)
Author: Unknown
Accuracy: 99.99%


In [2]:
import pytesseract

# Tell pytesseract which Tesseract executable to invoke.
# NOTE(review): this is a machine-specific Windows install path, and this cell
# presumably has to run before any cell that performs OCR — confirm cell order.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [None]:
#%pip install pytesseract

In [None]:
#%pip install transformers torch 

In [None]:
#%pip install langid

In [None]:
#%pip install regex

In [None]:
import os
import pytesseract
from PIL import Image
from transformers import pipeline
import re
import csv
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Step 1: Extract text and generate dataset
def extract_text(image_path):
    """Run OCR on one image and return the recognised text.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.

    Returns
    -------
    str or None
        The OCR output with surrounding whitespace stripped (may be an empty
        string), or ``None`` if the image could not be opened or OCR failed.
    """
    try:
        # Use the image as a context manager so its file handle is closed
        # promptly; this matters when looping over a whole directory.
        with Image.open(image_path) as image:
            extracted_text = pytesseract.image_to_string(image)
        return extracted_text.strip()
    except Exception as e:
        print(f"Error extracting text from {image_path}: {e}")
        return None

# Building a pipeline downloads/loads model weights, so each (task, model)
# pair is created once and reused for every image instead of once per call.
_PIPELINE_CACHE = {}


def _get_pipeline(task, model):
    """Return the cached transformers pipeline for ``task``/``model``, creating it on first use."""
    key = (task, model)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task, model=model)
    return _PIPELINE_CACHE[key]


def detect_language_and_functionality(text):
    """Classify the programming language of ``text`` and generate an explanation.

    Parameters
    ----------
    text : str
        Source code (as extracted by OCR).

    Returns
    -------
    tuple
        ``(language, explanation)``; ``(None, None)`` if either model fails.
    """
    try:
        # Language detection. The model id matches the one used by
        # analyze_code_snippet ("...-language-id").
        code_detector = _get_pipeline("text-classification", "huggingface/CodeBERTa-language-id")
        # truncation=True keeps long snippets within the model's input limit.
        language_prediction = code_detector(text, truncation=True)
        language = language_prediction[0]['label']

        # Code explanation
        code_explainer = _get_pipeline("text2text-generation", "Salesforce/codet5-large")
        explanation_prompt = f"Explain the following {language} code:\n{text}"
        explanation = code_explainer(explanation_prompt, max_length=512)[0]['generated_text']

        return language, explanation
    except Exception as e:
        print(f"Error detecting language or functionality: {e}")
        return None, None

def extract_author(text):
    """Return the author named in an ``author: NAME`` / ``author = "NAME"`` marker.

    The match is case-insensitive and confined to a single line: only spaces
    and tabs are accepted around the separator and inside the name, so the
    capture cannot run on into the code that follows the marker.

    Parameters
    ----------
    text : str
        Text to search.

    Returns
    -------
    str
        The author name with surrounding whitespace removed, or ``"Unknown"``
        when no marker with a name is found.
    """
    author_pattern = r'author[ \t]*[:=][ \t]*["\']?(\w[\w \t]*)["\']?'
    author_match = re.search(author_pattern, text, re.IGNORECASE)
    return author_match.group(1).strip() if author_match else "Unknown"

def process_images(image_dir, output_csv):
    """OCR every image in ``image_dir`` and write one label row per image.

    Each row of ``output_csv`` holds the columns ``filename``, ``language``,
    ``functionality`` and ``author``.  Images for which no text could be
    extracted are skipped.

    Parameters
    ----------
    image_dir : str
        Directory containing the code-snippet images.
    output_csv : str
        Path of the CSV file to (over)write.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['filename', 'language', 'functionality', 'author']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # os.listdir returns entries in arbitrary order; sorting makes the
        # generated CSV reproducible from run to run.
        for image_file in sorted(os.listdir(image_dir)):
            # Compare case-insensitively so files such as "IMG.PNG" are kept.
            if not image_file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(image_dir, image_file)
            print(f"Processing {image_file}...")

            # Extract text from image
            text = extract_text(image_path)
            if not text:
                continue

            # Detect language and functionality
            language, functionality = detect_language_and_functionality(text)

            # Extract author (if available)
            author = extract_author(text)

            # Write to CSV
            writer.writerow({
                'filename': image_file,
                'language': language,
                'functionality': functionality,
                'author': author
            })

# Step 2: Train a CNN model
def load_dataset(image_dir, label_file):
    """Build training/validation image generators with one-hot encoded labels.

    The ``language`` and ``functionality`` columns of ``label_file`` are
    one-hot encoded before being handed to ``flow_from_dataframe``.  With
    ``class_mode='multi_output'`` Keras passes the column values through
    unchanged and does not build ``class_indices``, so the encoding and the
    class lookup are done here.

    Parameters
    ----------
    image_dir : str
        Directory containing the images named in the ``filename`` column.
    label_file : str
        CSV file with ``filename``, ``language`` and ``functionality`` columns.

    Returns
    -------
    tuple
        ``(train_generator, validation_generator)``.  Both carry a
        ``class_indices`` attribute of the form
        ``{'language': {name: index}, 'functionality': {name: index}}``.
    """
    label_columns = ['language', 'functionality']

    # Load labels
    labels = pd.read_csv(label_file)
    # Rows without a detected language have no usable target; drop them.
    labels = labels.dropna(subset=['filename', 'language']).reset_index(drop=True)

    class_indices = {}
    for column in label_columns:
        # astype(str) ensures the labels are strings (missing functionality
        # values become the literal class 'nan').
        one_hot = pd.get_dummies(labels[column].astype(str), dtype=float)
        class_indices[column] = {name: index for index, name in enumerate(one_hot.columns)}
        # Store each row's one-hot vector as a list so the generator stacks
        # them into a (batch, num_classes) target array.
        labels[column] = pd.Series(one_hot.to_numpy().tolist(), index=labels.index)

    # Prepare an ImageDataGenerator for images
    datagen = ImageDataGenerator(
        rescale=1.0 / 255.0,
        validation_split=0.2
    )

    train_generator = datagen.flow_from_dataframe(
        dataframe=labels,
        directory=image_dir,
        x_col='filename',
        y_col=label_columns,  # Multi-output labels
        target_size=(128, 128),  # Resize images to 128x128
        batch_size=32,
        subset='training',
        class_mode='multi_output'
    )

    validation_generator = datagen.flow_from_dataframe(
        dataframe=labels,
        directory=image_dir,
        x_col='filename',
        y_col=label_columns,  # Multi-output labels
        target_size=(128, 128),
        batch_size=32,
        subset='validation',
        class_mode='multi_output'
    )

    # Expose the label vocabularies the way single-output generators do, so
    # callers can size the output layers and decode predictions.
    for generator in (train_generator, validation_generator):
        generator.class_indices = class_indices

    return train_generator, validation_generator

def build_cnn(input_shape, num_languages, num_functionalities):
    """Build a two-headed CNN that predicts language and functionality.

    A shared convolutional trunk feeds two independent classification heads.
    A ``Sequential`` model cannot express this: it would chain the second
    head onto the softmax of the first and expose a single output, so the
    functional API is used instead.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input image, e.g. ``(128, 128, 3)``.
    num_languages : int
        Number of classes for the ``language_output`` head.
    num_functionalities : int
        Number of classes for the ``functionality_output`` head.

    Returns
    -------
    tf.keras.Model
        Uncompiled model whose outputs are, in order, ``language_output`` and
        ``functionality_output``.
    """
    inputs = tf.keras.Input(shape=input_shape)

    # Shared convolutional feature extractor
    x = Conv2D(32, (3, 3), activation='relu')(inputs)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.2)(x)

    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.2)(x)

    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    features = Flatten()(x)

    # Language output
    language_branch = Dense(128, activation='relu')(features)
    language_branch = Dropout(0.5)(language_branch)
    language_output = Dense(num_languages, activation='softmax', name='language_output')(language_branch)

    # Functionality output (branches from the shared features, not from the
    # language head)
    functionality_branch = Dense(128, activation='relu')(features)
    functionality_branch = Dropout(0.5)(functionality_branch)
    functionality_output = Dense(num_functionalities, activation='softmax', name='functionality_output')(functionality_branch)

    model = tf.keras.Model(inputs=inputs, outputs=[language_output, functionality_output])

    return model

def train_cnn(image_dir, label_file, epochs=10, model_path='code_snippet_cnn.h5'):
    """Train the two-output CNN on the labelled images and save it.

    Parameters
    ----------
    image_dir : str
        Directory containing the training images.
    label_file : str
        CSV label file, passed to ``load_dataset``.
    epochs : int, optional
        Number of training epochs (default 10).
    model_path : str, optional
        File the trained model is saved to (default ``'code_snippet_cnn.h5'``).

    Returns
    -------
    The trained model.
    """
    # Load dataset
    train_generator, validation_generator = load_dataset(image_dir, label_file)

    # Model parameters
    input_shape = (128, 128, 3)  # must match the generators' target_size
    num_languages = len(train_generator.class_indices['language'])
    num_functionalities = len(train_generator.class_indices['functionality'])

    # Build and compile CNN model
    model = build_cnn(input_shape, num_languages, num_functionalities)
    model.compile(
        optimizer='adam',
        loss={
            'language_output': 'categorical_crossentropy',
            'functionality_output': 'categorical_crossentropy'
        },
        metrics=['accuracy']
    )

    # Train model
    model.fit(
        train_generator,
        epochs=epochs,
        validation_data=validation_generator
    )

    # Save model
    model.save(model_path)
    print(f"Model saved as '{model_path}'")
    return model

# Step 3: Use the trained CNN for prediction
def predict_snippet(model_path, image_path):
    """Load a trained model and predict language/functionality for one image.

    The Keras helpers are reached through ``tf.keras`` because
    ``load_model``, ``load_img`` and ``img_to_array`` are not imported by
    name anywhere in this notebook.

    Parameters
    ----------
    model_path : str
        Path of the saved model file.
    image_path : str
        Path of the image to classify.

    Returns
    -------
    tuple
        ``(language_pred, functionality_pred)`` — the first and second
        elements of ``model.predict``'s result, which are also printed.
    """
    # Load the trained model
    model = tf.keras.models.load_model(model_path)

    # Preprocess input image (same size and scaling as used for training)
    image = tf.keras.preprocessing.image.load_img(image_path, target_size=(128, 128))
    image_array = tf.keras.preprocessing.image.img_to_array(image) / 255.0  # Normalize
    image_array = image_array.reshape(1, 128, 128, 3)  # add the batch dimension

    # Predict
    predictions = model.predict(image_array)
    language_pred = predictions[0]
    functionality_pred = predictions[1]

    print("Predicted Language:", language_pred)
    print("Predicted Functionality:", functionality_pred)

    return language_pred, functionality_pred

# Main execution pipeline
if __name__ == "__main__":
    # Image directory holding the code-snippet pictures, and the CSV that the
    # labelling pass writes and the training pass reads back.
    image_dir, output_csv = "images/", "labels.csv"

    # Pass 1: OCR + label every image into the CSV dataset.
    process_images(image_dir, output_csv)

    # Pass 2: fit the CNN on that freshly generated dataset.
    train_cnn(image_dir, output_csv)


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

def train_cnn(image_dir, label_file, epochs=10, checkpoint_path='best_model.h5'):
    """Train the two-output CNN, checkpointing the epoch with the lowest validation loss.

    Parameters
    ----------
    image_dir : str
        Directory containing the training images.
    label_file : str
        CSV label file, passed to ``load_dataset``.
    epochs : int, optional
        Number of training epochs (default 10).
    checkpoint_path : str, optional
        File the best model is written to (default ``'best_model.h5'``).

    Returns
    -------
    tuple
        ``(model, history)`` — the model as left by ``fit`` and the value
        returned by ``model.fit``.
    """
    # Load dataset
    train_generator, validation_generator = load_dataset(image_dir, label_file)

    # Model parameters
    input_shape = (128, 128, 3)  # must match the generators' target_size
    num_languages = len(train_generator.class_indices['language'])
    num_functionalities = len(train_generator.class_indices['functionality'])

    # Build and compile CNN model
    model = build_cnn(input_shape, num_languages, num_functionalities)
    model.compile(
        optimizer='adam',
        loss={
            'language_output': 'categorical_crossentropy',
            'functionality_output': 'categorical_crossentropy'
        },
        metrics=['accuracy']
    )

    # Set up a checkpoint that keeps the model with the lowest validation loss
    checkpoint = ModelCheckpoint(
        filepath=checkpoint_path,  # Save model to this file
        monitor='val_loss',  # Total validation loss across both outputs
        verbose=1,
        save_best_only=True,  # Only overwrite when 'val_loss' improves
        mode='min'  # Lower 'val_loss' is better
    )

    # Train model
    history = model.fit(
        train_generator,
        epochs=epochs,
        validation_data=validation_generator,
        callbacks=[checkpoint]  # Add the checkpoint callback
    )

    print(f"Training complete. Best model saved as '{checkpoint_path}'")
    return model, history
