In [None]:
import os
import javalang
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split

# Dataset root passed to load_dataset(), which treats every immediate
# sub-directory of this path as one case.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running on another machine.
base_dir = "C:/Users/adria/OneDrive - Instituto Tecnologico y de Estudios Superiores de Monterrey/AppAvanzadas/IR-Plag-Dataset/IR-Plag-Dataset"

## 1. Dataset loading and preprocessing
_DATASET_COLUMNS = ["case_id", "file_path", "label", "plagiarism_level"]


def _sorted_subdirs(directory):
    """Return ``(name, path)`` pairs for the sub-directories of *directory*.

    Pairs are sorted by name. An empty list is returned when *directory*
    does not exist or is not a directory.
    """
    if not os.path.isdir(directory):
        return []
    candidates = (
        (name, os.path.join(directory, name))
        for name in sorted(os.listdir(directory))
    )
    return [(name, path) for name, path in candidates if os.path.isdir(path)]


def _java_records(directory, case_id, label, **extra):
    """Return one record dict per ``.java`` entry directly inside *directory*.

    Entries are visited in sorted name order; *extra* key/values are appended
    to every record.
    """
    return [
        {
            "case_id": case_id,
            "file_path": os.path.join(directory, name),
            "label": label,
            **extra,
        }
        for name in sorted(os.listdir(directory))
        if name.endswith(".java")
    ]


def load_dataset(base_dir):
    """Index the ``.java`` files of a dataset laid out as one directory per case.

    For every sub-directory ``<case>`` of *base_dir* the following locations
    are scanned:

    * ``<case>/original/*.java``                          -> label ``"original"``
    * ``<case>/non-plagiarized/<author>/*.java``          -> label ``"non-plagiarized"``
    * ``<case>/plagiarized/<level>/<author>/*.java``      -> label ``"plagiarized"``,
      with ``plagiarism_level`` set to the ``<level>`` directory name.

    Directory listings are sorted so that the row order (and therefore any
    seeded split computed from it) does not depend on the file system's
    listing order.

    Args:
        base_dir: Path of the dataset root.

    Returns:
        A ``pandas.DataFrame`` with the columns ``case_id``, ``file_path``,
        ``label`` and ``plagiarism_level`` (missing for rows that are not
        plagiarized). The columns are present even when no file is found.

    Raises:
        OSError: If *base_dir* cannot be listed (e.g. it does not exist).
    """
    records = []

    # os.listdir is called directly here so a wrong dataset root still fails loudly.
    for case_id in sorted(os.listdir(base_dir)):
        case_path = os.path.join(base_dir, case_id)
        if not os.path.isdir(case_path):
            continue

        # Original files
        original_dir = os.path.join(case_path, "original")
        if os.path.isdir(original_dir):
            records.extend(_java_records(original_dir, case_id, "original"))

        # Non-plagiarized files, one sub-directory per author
        for _, author_dir in _sorted_subdirs(os.path.join(case_path, "non-plagiarized")):
            records.extend(_java_records(author_dir, case_id, "non-plagiarized"))

        # Plagiarized files, grouped by level and then by author
        for level, level_dir in _sorted_subdirs(os.path.join(case_path, "plagiarized")):
            for _, author_dir in _sorted_subdirs(level_dir):
                records.extend(
                    _java_records(author_dir, case_id, "plagiarized", plagiarism_level=level)
                )

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=_DATASET_COLUMNS)

# Load the dataset. `base_dir` is defined once, right after the imports;
# the second copy of the same literal that used to live here was removed so
# the two cannot drift apart.
print("Cargando dataset...")
df = load_dataset(base_dir)

## 2. Feature extraction (AST)
def extract_enhanced_ast(file_path):
    """Turn a Java source file into a space-separated string of AST tokens.

    The file is parsed with ``javalang`` and the tree is walked in the order
    the tree iterator yields nodes. Each node contributes its class name;
    additionally:

    * method declarations add ``METHOD_<name>`` and ``PARAMS_<count>``,
    * ``if`` / ``for`` / ``while`` statements add ``CONTROL_<CLASSNAME>``,
    * literals add ``LITERAL_VALUE``.

    Any exception raised while reading or parsing is reported with ``print``
    and turned into a ``None`` return value (best-effort behaviour).

    Args:
        file_path: Path of the Java file to read.

    Returns:
        The token string, or ``None`` if the file could not be processed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as source_file:
            java_source = source_file.read()

        syntax_tree = javalang.parse.parse(java_source)
        control_statements = (
            javalang.tree.IfStatement,
            javalang.tree.ForStatement,
            javalang.tree.WhileStatement,
        )

        features = []
        for _, node in syntax_tree:
            kind = type(node).__name__
            features.append(kind)

            # Extra tokens for the node kinds that carry the most signal
            if isinstance(node, javalang.tree.MethodDeclaration):
                features += [f"METHOD_{node.name}", f"PARAMS_{len(node.parameters)}"]

            if isinstance(node, control_statements):
                features.append(f"CONTROL_{kind.upper()}")

            if isinstance(node, javalang.tree.Literal):
                features.append("LITERAL_VALUE")

        return ' '.join(features)
    except Exception as err:
        print(f"Error parsing {file_path}: {str(err)}")
        return None

print("Extrayendo características AST...")
df['ast_tokens'] = df['file_path'].apply(extract_enhanced_ast)
# Drop the files that could not be parsed (extract_enhanced_ast returned None).
df = df.dropna(subset=['ast_tokens'])

## 3. Train/test split
# The split is done on case ids, so every file of a case ends up in the same
# partition.
train_cases, test_cases = train_test_split(
    df['case_id'].unique(),
    test_size=0.3,
    random_state=42
)

# reset_index(drop=True) gives each partition a fresh 0..n-1 index, so a row's
# index label coincides with its row position in the matrices returned by
# vectorizer.transform(<partition>['ast_tokens']).
train_df = df[df['case_id'].isin(train_cases)].reset_index(drop=True)
test_df = df[df['case_id'].isin(test_cases)].reset_index(drop=True)

print(f"\nDistribución del dataset:")
print(f"- Total: {len(df)} archivos")
print(f"- Train: {len(train_df)} archivos ({len(train_cases)} casos)")
print(f"- Test: {len(test_df)} archivos ({len(test_cases)} casos)")

## 4. Vectorization and modelling
# The vectorizer is fitted on the training partition only.
print("\nEntrenando vectorizador...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=2
)
vectorizer.fit(train_df['ast_tokens'])

# One TF-IDF vector per training case, built from that case's 'original' row.
# The values are read straight from the filtered rows. The previous second
# version of this dict looked rows up with train_df.loc[<positional counter>],
# which raised "KeyError: 0" because train_df's index labels were not 0..n-1.
train_originals = train_df[train_df['label'] == 'original']
train_original_vectors = {
    case_id: vectorizer.transform([tokens])
    for case_id, tokens in zip(train_originals['case_id'], train_originals['ast_tokens'])
}

## 5. Model evaluation
def evaluate_model(vectors, df, original_vectors, threshold=0.7):
    """Classify every non-original file by cosine similarity to its case's original.

    Args:
        vectors: 2-D matrix with one row per row of *df*, in the same order
            (e.g. the output of ``vectorizer.transform(df['ast_tokens'])``).
        df: DataFrame with at least the columns ``label``, ``case_id`` and
            ``file_path``; ``plagiarism_level`` is used when present.
        original_vectors: Mapping ``case_id -> 1-row matrix`` holding the
            vector of that case's original file. Rows whose case is missing
            from the mapping are skipped.
        threshold: A file is predicted ``'plagiarized'`` when its similarity
            is strictly greater than this value.

    Returns:
        A DataFrame with the columns ``case_id``, ``file_path``,
        ``true_label``, ``predicted_label``, ``similarity`` and
        ``plagiarism_level``. The columns are present even when there are
        no rows.

    Raises:
        ValueError: If *vectors* and *df* do not have the same number of rows.
    """
    if vectors.shape[0] != len(df):
        raise ValueError(
            f"vectors has {vectors.shape[0]} rows but df has {len(df)} rows"
        )

    result_columns = [
        'case_id', 'file_path', 'true_label',
        'predicted_label', 'similarity', 'plagiarism_level',
    ]
    results = []

    # iterrows() yields index *labels*, which need not be 0..n-1 (df is usually
    # a filtered frame). `vectors` is positional, so rows are paired through
    # enumerate() instead of through the label.
    for position, (_, row) in enumerate(df.iterrows()):
        if row['label'] == 'original':
            continue

        case_id = row['case_id']
        if case_id not in original_vectors:
            continue

        # Slicing keeps the row two-dimensional for cosine_similarity.
        current_vec = vectors[position:position + 1]
        original_vec = original_vectors[case_id]

        sim = cosine_similarity(current_vec, original_vec)[0][0]
        predicted = 'plagiarized' if sim > threshold else 'non-plagiarized'

        results.append({
            'case_id': case_id,
            'file_path': row['file_path'],
            'true_label': row['label'],
            'predicted_label': predicted,
            'similarity': sim,
            'plagiarism_level': row.get('plagiarism_level', None)
        })

    return pd.DataFrame(results, columns=result_columns)

# Score the training partition against the originals of its own cases.
print("\nEvaluando en training set...")
train_vectors = vectorizer.transform(train_df['ast_tokens'])
train_results = evaluate_model(train_vectors, train_df, train_original_vectors)

# Score the test partition the same way.
print("\nEvaluando en test set...")
test_vectors = vectorizer.transform(test_df['ast_tokens'])
# One reference vector per test case, taken from the first 'original' row of
# that case.
first_test_originals = test_df[test_df['label'] == 'original'].drop_duplicates(subset='case_id')
test_original_vectors = {}
for case_id, tokens in zip(first_test_originals['case_id'], first_test_originals['ast_tokens']):
    test_original_vectors[case_id] = vectorizer.transform([tokens])
test_results = evaluate_model(test_vectors, test_df, test_original_vectors)

## 6. Results visualisation
def plot_metrics(results_df, title, threshold=0.7):
    """Show a row-normalised confusion matrix and the similarity distribution.

    Args:
        results_df: DataFrame with the columns ``true_label``,
            ``predicted_label`` and ``similarity`` (as returned by
            ``evaluate_model``).
        title: Text appended to both subplot titles.
        threshold: Position of the vertical decision-threshold line drawn on
            the similarity histogram. Defaults to 0.7, the value that was
            previously hard-coded.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Confusion matrix. `labels` pins the matrix to 2x2 even when only one
    # class occurs in the data, so it always matches the two tick labels.
    cm = confusion_matrix(
        results_df['true_label'] == 'plagiarized',
        results_df['predicted_label'] == 'plagiarized',
        labels=[False, True],
        normalize='true'
    )

    plt.figure(figsize=(16, 6))

    plt.subplot(1, 2, 1)
    sns.heatmap(cm, annot=True, fmt='.2%', cmap='Blues',
                xticklabels=['Non-Plagiarized', 'Plagiarized'],
                yticklabels=['Non-Plagiarized', 'Plagiarized'])
    plt.title(f'Matriz de Confusión\n{title}')
    plt.xlabel('Predicho')
    plt.ylabel('Real')

    # Similarity distribution, one outline per true label
    plt.subplot(1, 2, 2)
    sns.histplot(
        data=results_df,
        x='similarity',
        hue='true_label',
        element='step',
        stat='density',
        common_norm=False,
        bins=20
    )
    plt.axvline(x=threshold, color='r', linestyle='--', label='Umbral')
    plt.title(f'Distribución de Similitudes\n{title}')
    plt.xlabel('Similitud Cosina')
    # NOTE(review): plt.legend() builds a legend from labelled artists only;
    # presumably this replaces the hue legend drawn by seaborn — verify.
    plt.legend()

    plt.tight_layout()
    plt.show()

# Draw the diagnostic figures for both partitions, training first.
print("\nGenerando visualizaciones...")
for partition_results, partition_title in (
    (train_results, 'Training Set'),
    (test_results, 'Test Set'),
):
    plot_metrics(partition_results, partition_title)

## 7. Metrics report
def _print_first_example(rows, kind):
    """Print similarity and path of the first row of *rows*; no-op when empty."""
    if len(rows) == 0:
        return
    example = rows.iloc[0]
    print(f"\nEjemplo falso {kind} (similitud={example['similarity']:.2f}):")
    print(f"Archivo: {example['file_path']}")


def print_metrics(results_df, set_name):
    """Print a classification report, per-level accuracy and error examples.

    Args:
        results_df: DataFrame with the columns ``true_label``,
            ``predicted_label``, ``similarity`` and ``file_path``;
            ``plagiarism_level`` is used when present.
        set_name: Name of the evaluated partition, shown in the header.

    Returns:
        None. Everything is written with ``print``.
    """
    print(f"\n{'='*50}")
    print(f"Resultados para {set_name}")
    print(f"{'='*50}")

    # Classification report. `labels` keeps both classes in the report even
    # when one of them is absent from the data; without it the two
    # target_names would not match the detected classes.
    print("\nReporte de Clasificación:")
    print(classification_report(
        results_df['true_label'] == 'plagiarized',
        results_df['predicted_label'] == 'plagiarized',
        labels=[False, True],
        target_names=['Non-Plagiarized', 'Plagiarized'],
        zero_division=0
    ))

    # Accuracy per plagiarism level (only when the column exists)
    if 'plagiarism_level' in results_df.columns:
        print("\nMétricas por Nivel de Plagio:")
        # groupby skips missing levels and iterates the groups in sorted order.
        for level, level_df in results_df.groupby('plagiarism_level'):
            accuracy = np.mean(level_df['true_label'] == level_df['predicted_label'])
            print(f"- Nivel {level}: {accuracy:.2%} accuracy")

    # False positives / false negatives
    print("\nEjemplos destacados:")
    truly_plagiarized = results_df['true_label'] == 'plagiarized'
    flagged = results_df['predicted_label'] == 'plagiarized'
    fp = results_df[(results_df['true_label'] == 'non-plagiarized') & flagged]
    fn = results_df[truly_plagiarized & (results_df['predicted_label'] == 'non-plagiarized')]

    print(f"- Falsos positivos: {len(fp)} ejemplos")
    print(f"- Falsos negativos: {len(fn)} ejemplos")

    _print_first_example(fp, 'positivo')
    _print_first_example(fn, 'negativo')

# Print the textual report for both partitions, training first.
for partition_results, partition_name in (
    (train_results, 'Training Set'),
    (test_results, 'Test Set'),
):
    print_metrics(partition_results, partition_name)

## 8. Threshold optimisation
# NOTE(review): the threshold is tuned on test_results, the same data the test
# metrics above are reported on — consider tuning it on train_results instead.
print("\nOptimizando umbral de decisión...")
precisions, recalls, thresholds = precision_recall_curve(
    test_results['true_label'] == 'plagiarized',
    test_results['similarity']
)

# F1-score for every point of the curve. Where precision + recall == 0 the
# plain formula is 0/0 = NaN, and np.argmax would then return that NaN's
# index; those points are given an F1 of 0 instead.
pr_sum = precisions + recalls
f1_scores = np.divide(
    2 * precisions * recalls,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0
)
# precisions/recalls have one more element than thresholds (the final point
# has no associated threshold), so the search is limited to [:-1].
best_idx = int(np.argmax(f1_scores[:-1]))
best_threshold = thresholds[best_idx]

plt.figure()
plt.plot(thresholds, precisions[:-1], label='Precisión')
plt.plot(thresholds, recalls[:-1], label='Recall')
plt.plot(thresholds, f1_scores[:-1], label='F1-score')
plt.axvline(x=best_threshold, color='r', linestyle='--', label=f'Mejor umbral ({best_threshold:.2f})')
plt.xlabel('Umbral de Similitud')
plt.ylabel('Puntuación')
plt.title('Optimización del Umbral de Decisión')
plt.legend()
plt.show()

print(f"\nEl mejor umbral según F1-score es: {best_threshold:.2f}")

Cargando dataset...
Extrayendo características AST...

Distribución del dataset:
- Total: 467 archivos
- Train: 274 archivos (4 casos)
- Test: 193 archivos (3 casos)

Entrenando vectorizador...


KeyError: 0