In [41]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import javalang
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

# Build the AST of a Java snippet and vectorize it (similar to the function used during training)
def generate_ast_vector(content):
    """Parse Java source text and return the vectorized form of its AST.

    Args:
        content: Java source code as a string.

    Returns:
        Whatever ``vectorize_ast`` returns for the parsed tree.

    Raises:
        Exceptions raised by ``javalang`` while tokenizing or parsing are not
        caught here and propagate to the caller.
    """
    # Materialize the token stream into a list before handing it to the parser.
    java_tokens = list(javalang.tokenizer.tokenize(content))
    tree = javalang.parser.Parser(java_tokens).parse()
    return vectorize_ast(tree)

# Vectorize an AST (simplified for this example)
def vectorize_ast(ast):
    """Flatten an AST into a space-separated string of node type names.

    Args:
        ast: Iterable yielding ``(path, node)`` pairs; ``path`` is ignored.

    Returns:
        The class name of every ``node``, in iteration order, joined by single
        spaces. An iterable that yields nothing produces the empty string.
    """
    return " ".join(type(node).__name__ for _path, node in ast)

# Load the model, the vectorizer and the stored file contents.
# NOTE(review): judging by the file names these are presumably the fitted
# clustering model, the fitted TF-IDF vectorizer and the training corpus --
# confirm against the notebook that produced them.
# joblib.load unpickles its input, so only point these paths at trusted files.
KMEANS_MODEL_PATH = r'C:\Users\Flavio Ruvalcaba\Documents\Escuela\Universidad\8Semestre\PlagiarismDetector\Modeling\kmeans_model.pkl'
TFIDF_VECTORIZER_PATH = r'C:\Users\Flavio Ruvalcaba\Documents\Escuela\Universidad\8Semestre\PlagiarismDetector\Preprocessing\tfidf_vectorizer.pkl'
ALL_FILES_CONTENT_PATH = r'C:\Users\Flavio Ruvalcaba\Documents\Escuela\Universidad\8Semestre\PlagiarismDetector\Preprocessing\all_files_content.pkl'

kmeans = joblib.load(KMEANS_MODEL_PATH)
vectorizer = joblib.load(TFIDF_VECTORIZER_PATH)
all_files_content = joblib.load(ALL_FILES_CONTENT_PATH)

# New Java code to test (this is the list of snippets you want to evaluate).
# Each entry is the full source text of one Java class with a main method;
# the order of the entries must match the order of the ground-truth labels.
new_codes = [
    """
    public class HelloWorld {
        public static void main(String[] args) {
            System.out.println("Hello, World!");
        }
    }
    """,
    """
    public class Sum {
        public static void main(String[] args) {
            int a = 5;
            int b = 10;
            int sum = a + b;
            System.out.println("Sum is: " + sum);
        }
    }
    """,
    """
    public class Product {
        public static void main(String[] args) {
            int x = 4;
            int y = 5;
            int product = x * y;
            System.out.println("Product is: " + product);
        }
    }
    """,
    """
    public class Greeting {
        public static void main(String[] args) {
            String greeting = "Hello, everyone!";
            System.out.println(greeting);
        }
    }
    """,
    """
    public class Factorial {
        public static void main(String[] args) {
            int number = 5;
            int fact = 1;
            for(int i = 1; i <= number; i++) {
                fact = fact * i;
            }
            System.out.println("Factorial is: " + fact);
        }
    }
    """,
    """
    public class Fibonacci {
        public static void main(String[] args) {
            int n = 10, t1 = 0, t2 = 1;
            for (int i = 1; i <= n; ++i) {
                System.out.print(t1 + " + ");
                int sum = t1 + t2;
                t1 = t2;
                t2 = sum;
            }
        }
    }
    """,
    """
    public class ReverseString {
        public static void main(String[] args) {
            String input = "example";
            String reversed = new StringBuilder(input).reverse().toString();
            System.out.println("Reversed string is: " + reversed);
        }
    }
    """,
    """
    public class Palindrome {
        public static void main(String[] args) {
            String input = "radar";
            boolean isPalindrome = input.equals(new StringBuilder(input).reverse().toString());
            System.out.println("Is palindrome: " + isPalindrome);
        }
    }
    """
]

# Ground-truth labels for the test set (0: not plagiarized, 1: plagiarized).
# The first 4 snippets are assumed to be original (0) and the last 4 plagiarized (1).
true_labels = [0, 0, 0, 0, 1, 1, 1, 1]

# Every snippet needs exactly one label. Raised explicitly rather than with
# `assert`, because assertions are stripped when Python runs with -O.
if len(new_codes) != len(true_labels):
    raise ValueError("La longitud de los códigos y las etiquetas verdaderas debe coincidir.")

# Predictions and their matching ground-truth labels. Only snippets that were
# processed successfully are recorded, and both lists are appended to together,
# so they always have the same length.
pred_labels = []
test_true_labels = []

# Training files per cluster, filled lazily so kmeans.labels_ is scanned once
# per distinct cluster instead of once per snippet.
cluster_files = {}

# Preprocess and evaluate each new snippet.
for i, (new_code, true_label) in enumerate(zip(new_codes, true_labels)):
    try:
        new_vector = vectorizer.transform([generate_ast_vector(new_code)])
        cluster_label = kmeans.predict(new_vector)[0]

        if cluster_label not in cluster_files:
            cluster_files[cluster_label] = [
                all_files_content[idx]
                for idx, label in enumerate(kmeans.labels_)
                if label == cluster_label
            ]
        similar_files = cluster_files[cluster_label]
    except Exception as e:
        # Deliberate best-effort: a snippet that cannot be parsed, vectorized
        # or clustered is reported and left out of the metrics.
        print(f"Error al procesar el código {i+1}: {e}")
        continue

    # Decide the predicted label (example logic, may vary): a snippet is
    # flagged as plagiarism when its cluster contains a plagiarized file.
    # NOTE(review): this looks for the literal substring "plagio" in each entry
    # of all_files_content. What those entries hold is not visible here; if the
    # marker is never present, every prediction is 0 -- verify.
    predicted_label = 1 if any("plagio" in content for content in similar_files) else 0

    pred_labels.append(predicted_label)
    test_true_labels.append(true_label)

    print(f"Code {i+1}:")
    print(f"Predicted Label: {predicted_label}, True Label: {true_label}")
    print(f"Cluster {cluster_label} contains {len(similar_files)} similar files.")
    print()

# Compute the evaluation metrics.
# When a class is never predicted (or never present) its precision/recall/F1
# is undefined. zero_division=0 scores it as 0 -- the same value scikit-learn
# falls back to by default -- without emitting UndefinedMetricWarning.
precision = precision_score(test_true_labels, pred_labels, average='macro', zero_division=0)
accuracy = accuracy_score(test_true_labels, pred_labels)
recall = recall_score(test_true_labels, pred_labels, average='macro', zero_division=0)
f1 = f1_score(test_true_labels, pred_labels, average='macro', zero_division=0)

print(f"Precision: {precision}")
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Code 1:
Predicted Label: 0, True Label: 0
Cluster 4 contains 21 similar files.

Code 2:
Predicted Label: 0, True Label: 0
Cluster 6 contains 161 similar files.

Code 3:
Predicted Label: 0, True Label: 0
Cluster 6 contains 161 similar files.

Code 4:
Predicted Label: 0, True Label: 0
Cluster 4 contains 21 similar files.

Code 5:
Predicted Label: 0, True Label: 1
Cluster 3 contains 336 similar files.

Code 6:
Predicted Label: 0, True Label: 1
Cluster 3 contains 336 similar files.

Code 7:
Predicted Label: 0, True Label: 1
Cluster 4 contains 21 similar files.

Code 8:
Predicted Label: 0, True Label: 1
Cluster 7 contains 124 similar files.

Precision: 0.25
Accuracy: 0.5
Recall: 0.5
F1 Score: 0.3333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
