In [1]:
import os
import pytesseract
from PIL import Image
from transformers import pipeline
from langid import classify
import regex
import csv

# Already-constructed Hugging Face pipelines, keyed by (task, model name).
# Building a pipeline loads model weights, so it must not happen once per image.
_PIPELINE_CACHE = {}


def _get_pipeline(task, model_name):
    """Return the pipeline for (task, model_name), constructing it only on first use."""
    key = (task, model_name)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task, model=model_name)
    return _PIPELINE_CACHE[key]


def _extract_author(text):
    """Return the name that follows an ``author:`` / ``author =`` tag, or "Unknown"."""
    # Only horizontal whitespace is allowed around and inside the name so the
    # match cannot run past the end of the line into the following code.
    author_pattern = r'author[ \t]*[:=][ \t]*["\']?([\w \t]+)'
    author_match = regex.search(author_pattern, text, regex.IGNORECASE)
    if not author_match:
        return "Unknown"
    return author_match.group(1).strip() or "Unknown"


def analyze_code_snippet(image_path):
    """OCR a code screenshot and label its language, functionality and author.

    Args:
        image_path: Path of the image file to analyze.

    Returns:
        dict with the keys ``"language"``, ``"author"``, ``"confidence"`` and
        ``"functionality"``.

        * If the image cannot be opened or OCR fails, ``language``,
          ``confidence`` and ``functionality`` are ``None`` and ``author`` is
          ``"Unknown"``.
        * If language detection fails, ``language`` and ``confidence`` are ``None``.
        * If summarization fails, or OCR found no text, ``functionality`` is
          ``"Unknown"``.

    Errors in the individual steps are printed and never raised.
    """
    # Step 1: Extract text from the image using OCR
    try:
        # The context manager releases the file handle once OCR is done.
        with Image.open(image_path) as image:
            extracted_text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error extracting text from {image_path}: {e}")
        return {"language": None, "author": "Unknown", "confidence": None, "functionality": None}

    # Nothing was recognized: running the models on an empty string would only
    # produce a meaningless label and summary.
    if not extracted_text.strip():
        print(f"No text recognized in {image_path}; skipping language detection and summarization")
        return {"language": None, "author": "Unknown", "confidence": None, "functionality": "Unknown"}

    # Step 2: Identify programming language
    try:
        code_detector = _get_pipeline("text-classification", "huggingface/CodeBERTa-language-id")
        # truncation=True keeps long snippets within the model's maximum input length.
        language_prediction = code_detector(extracted_text, truncation=True)
        language = language_prediction[0]['label']
        confidence = language_prediction[0]['score']
    except Exception as e:
        print(f"Error identifying language in {image_path}: {e}")
        language = None
        confidence = None

    # Step 3: Summarize code functionality
    try:
        code_explainer = _get_pipeline("summarization", "ashwinR/CodeExplainer")
        functionality = code_explainer(extracted_text, truncation=True)[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing code in {image_path}: {e}")
        functionality = "Unknown"

    # Step 4: Search for author's name
    author = _extract_author(extracted_text)

    return {"language": language, "author": author, "confidence": confidence, "functionality": functionality}


def process_images_in_folder(folder_path, output_csv):
    """Analyze every image in a folder and record one CSV row per image.

    Args:
        folder_path: Directory whose entries are scanned; only names ending
            (case-insensitively) in .png, .jpg, .jpeg, .bmp or .tiff are used.
        output_csv: Path of the CSV file to (over)write. It always receives a
            header row, followed by one row per analyzed image.

    Progress messages are printed before and after each image.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    image_names = []
    for entry in os.listdir(folder_path):
        if entry.lower().endswith(image_suffixes):
            image_names.append(entry)

    header = ["Image Path", "Functionality", "Language", "Author", "Confidence"]
    with open(output_csv, mode='w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)

        for name in image_names:
            image_path = os.path.join(folder_path, name)
            print(f"Processing {image_path}...")
            analysis = analyze_code_snippet(image_path)

            row = [image_path, analysis["functionality"], analysis["language"], analysis["author"]]
            # Confidence is rendered as a percentage, or "N/A" when it is missing.
            if analysis["confidence"] is not None:
                row.append(f"{analysis['confidence']:.2%}")
            else:
                row.append("N/A")

            writer.writerow(row)
            print(f"Processed {image_path}")


# # Example usage
# if __name__ == "__main__":
#     folder_path = "D:\\AI-Projects\\code-classification\\data"  # Replace with your folder path
#     output_csv = "labels.csv"
#     process_images_in_folder(folder_path, output_csv)
#     print(f"Processing complete. Results saved to {output_csv}.")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

import pytesseract

# Location of the Tesseract executable used by pytesseract.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, if set (lets the notebook run
#      on other machines without editing this cell);
#   2. the standard Windows install path, but only when that file exists;
#   3. otherwise pytesseract's own setting is left untouched
#      (presumably it then looks for `tesseract` on PATH — verify on the target machine).
_DEFAULT_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get("TESSERACT_CMD")
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd
elif os.path.isfile(_DEFAULT_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _DEFAULT_TESSERACT_CMD

In [None]:

# NOTE(review): this cell is an outline whose steps 1 and 2 are empty. It uses
# `model`, `X_train`, `y_train`, `X_test` and `y_test`, none of which is defined
# in the cells visible above it, so it only runs after the later data and model
# cells have been executed. The same fit/save statements appear again in a
# later cell.

# Step 1: Load and preprocess the data


# Step 2: Build a CNN model




# Step 3: Train the model
# Trains for 10 epochs in batches of 32, using the test split as validation data.
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

# Step 4: Save the best model
# NOTE(review): model.save() runs after training has finished, so the file holds
# the model as of the final epoch; nothing here selects a "best" epoch.
model.save("best_model.h5")

# Step 5: Plot training metrics
def plot_metrics(history):
    """Show training vs. validation curves for loss and accuracy side by side.

    Args:
        history: Object whose ``history`` attribute maps the keys ``'loss'``,
            ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'`` to per-epoch
            value sequences (e.g. the value returned by ``model.fit``).
    """
    # One entry per panel: (title / y-label, training series, validation series).
    panels = (
        ('Loss', ('loss', 'Train Loss'), ('val_loss', 'Validation Loss')),
        ('Accuracy', ('accuracy', 'Train Accuracy'), ('val_accuracy', 'Validation Accuracy')),
    )

    plt.figure(figsize=(12, 4))
    for position, (heading, train_series, val_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        for key, legend_label in (train_series, val_series):
            plt.plot(history.history[key], label=legend_label)
        plt.title(heading)
        plt.xlabel('Epoch')
        plt.ylabel(heading)
        plt.legend()

    plt.show()

# Draw the loss/accuracy curves for the run stored in `history` by model.fit above.
plot_metrics(history)


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt


In [4]:
csv_path = "labels.csv"
data = pd.read_csv(csv_path)

# Encode categorical labels (Functionality and Language).
# Each column gets its own encoder: refitting a single encoder on the second
# column would discard the first column's class mapping, which is needed to
# turn predicted integer codes back into labels (inverse_transform).
functionality_encoder = LabelEncoder()
language_encoder = LabelEncoder()
data['Functionality'] = functionality_encoder.fit_transform(data['Functionality'])
data['Language'] = language_encoder.fit_transform(data['Language'])
# Backward compatibility: `encoder` previously ended up fitted on 'Language'.
encoder = language_encoder

# Features and labels
# NOTE(review): if labels.csv is the file written by process_images_in_folder,
# these five names are all of its columns, so X ends up with zero feature
# columns and the model below has nothing to learn from. Confirm where the
# input features are supposed to come from.
X = data.drop(columns=["Image Path", "Functionality", "Language", "Author", "Confidence"])
y = data["Functionality"]  # Replace with "Language" for a different target

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Show the label table as the cell output. Once the preparation cell above has
# run, 'Functionality' and 'Language' hold LabelEncoder integer codes.
data

In [None]:
# Dense classifier over the feature columns of X_train:
#   Input -> Dense(256, relu) -> Dropout(0.3) -> Dense(128, relu) -> Dropout(0.3)
#         -> Dense(<number of distinct values in y>, softmax)
# An earlier variant passed input_shape= to the first Dense layer instead of
# declaring an explicit Input layer.
keras_layers = tf.keras.layers

model = tf.keras.Sequential()
model.add(keras_layers.Input(shape=(X_train.shape[1],)))  # Explicit input layer
for units in (256, 128):
    model.add(keras_layers.Dense(units, activation='relu'))
    model.add(keras_layers.Dropout(0.3))
model.add(keras_layers.Dense(len(np.unique(y)), activation='softmax'))

In [None]:
# Configure training: Adam optimizer and accuracy as the reported metric.
# The sparse variant of categorical cross-entropy is used because the target
# `y` holds integer class codes (LabelEncoder output) rather than one-hot rows.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Step 3 + 4: Train the model and keep the best epoch on disk.
# Calling model.save() after fit() would store the final-epoch model, not the
# best one. The checkpoint callback instead rewrites best_model.h5 only when the
# validation loss improves, matching the approach used in train_cnn.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_model.h5",
    monitor="val_loss",
    save_best_only=True,
    mode="min",
)

history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
    batch_size=32,
    callbacks=[checkpoint],
)

In [None]:
def train_cnn(image_dir, label_file):
    """Train the two-output CNN, checkpointing the lowest-validation-loss model.

    Args:
        image_dir: Forwarded to ``load_dataset`` as its first argument.
        label_file: Forwarded to ``load_dataset`` as its second argument.

    Returns:
        ``(model, history)``: the model object that was trained and the value
        returned by its ``fit`` call. The checkpoint written to
        ``best_model.h5`` is separate from the returned in-memory model.
    """
    train_data, val_data = load_dataset(image_dir, label_file)

    # Size of each output head, taken from the training generator's class tables.
    language_count = len(train_data.class_indices['language'])
    functionality_count = len(train_data.class_indices['functionality'])

    # 128x128 RGB input; one categorical cross-entropy loss per named output.
    network = build_cnn((128, 128, 3), language_count, functionality_count)
    per_output_loss = {
        'language_output': 'categorical_crossentropy',
        'functionality_output': 'categorical_crossentropy',
    }
    network.compile(optimizer='adam', loss=per_output_loss, metrics=['accuracy'])

    # Rewrite best_model.h5 only when 'val_loss' reaches a new minimum.
    best_checkpoint = ModelCheckpoint(
        filepath='best_model.h5',
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='min',
    )

    fit_history = network.fit(
        train_data,
        epochs=10,
        validation_data=val_data,
        callbacks=[best_checkpoint],
    )

    print("Training complete. Best model saved as 'best_model.h5'")
    return network, fit_history