In [1]:
!pip install javalang




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: C:\Users\Flavio Ruvalcaba\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import os
import javalang
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [3]:
# Read the contents of every Java source file in a directory
def read_files_from_directory(directory):
    """Return the text of each ``.java`` file found directly inside ``directory``.

    Entries are visited in sorted filename order so the returned list (and any
    index derived from it) is the same on every run and platform;
    ``os.listdir`` alone gives no ordering guarantee.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of strings, one per ``.java`` file, ordered by filename.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    files_content = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".java"):
            continue
        full_path = os.path.join(directory, filename)
        # A sub-directory may also be named "*.java"; opening it would fail.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as file:
            files_content.append(file.read())
    return files_content

In [4]:
# Parse each Java source into an AST and turn it into a token string
def generate_ast_vectors(files_content):
    """Tokenize, parse and vectorize every Java source string.

    Args:
        files_content: Sequence of Java source strings.

    Returns:
        A ``(vectors, errors)`` tuple. ``vectors`` holds the result of
        ``vectorize_ast`` for each source that parsed successfully, in input
        order. ``errors`` holds ``(index, content)`` pairs for each source
        that raised a lexer or syntax error. Failed sources contribute
        nothing to ``vectors``, so positions in ``vectors`` do not match
        input indices once any failure occurs.
    """
    vectors, errors = [], []

    def _record_failure(kind, position, source, exc):
        # Report the problem and remember which input could not be processed.
        print(f"{kind} in file {position}: {exc}")
        errors.append((position, source))

    for idx, content in enumerate(files_content):
        try:
            token_list = list(javalang.tokenizer.tokenize(content))
            tree = javalang.parser.Parser(token_list).parse()
            vectors.append(vectorize_ast(tree))
        except javalang.tokenizer.LexerError as exc:
            _record_failure("LexerError", idx, content, exc)
        except javalang.parser.JavaSyntaxError as exc:
            _record_failure("JavaSyntaxError", idx, content, exc)
    return vectors, errors

In [5]:
# Flatten an AST into a string of node type names (simplified for the example)
def vectorize_ast(ast):
    """Return the class names of all visited nodes, separated by single spaces.

    Args:
        ast: An iterable yielding ``(path, node)`` pairs; only ``node`` is used.

    Returns:
        One string with ``type(node).__name__`` for each pair, in iteration
        order. An empty iterable gives an empty string.
    """
    return " ".join(type(node).__name__ for _path, node in ast)

In [6]:
# Main loader: gather the Java sources of every category across the dataset splits
def load_data_and_generate_vectors(base_dir, categories=('noplag', 'plagio'),
                                   phases=('train', 'test', 'validation')):
    """Load the raw Java file contents for each category.

    Despite its name, this function only reads files; it does not vectorize
    anything. For each category it reads ``<base_dir>/<phase>/<category>``
    for every phase and concatenates the results in phase order.

    Args:
        base_dir: Root directory of the dataset.
        categories: Category sub-directory names, used as keys of the result.
            Defaults to the two categories that were previously hard-coded.
        phases: Split sub-directory names to merge. Defaults to the three
            splits that were previously hard-coded.

    Returns:
        A dict mapping each category name to the list of file contents
        collected from all phases.
    """
    data = {}
    for category in categories:
        contents = []
        for phase in phases:
            path = os.path.join(base_dir, phase, category)
            contents.extend(read_files_from_directory(path))
        data[category] = contents
    return data

In [7]:
# Base path of the dataset
base_dir = 'C:/Users/Flavio Ruvalcaba/Documents/Escuela/Universidad/8Semestre/PlagiarismDetector/finalDataset/split/'

# Load the raw Java sources for each category
data = load_data_and_generate_vectors(base_dir)

# Combine both categories and convert each file into its AST node-type string
all_files_content = data['noplag'] + data['plagio']
vectors, errors = generate_ast_vectors(all_files_content)

# Files that failed to parse have no entry in `vectors`. Drop them from the
# saved contents as well, so that row i of X matches entry i of the saved list.
failed_indices = {idx for idx, _ in errors}
parsed_files_content = [
    content
    for idx, content in enumerate(all_files_content)
    if idx not in failed_indices
]
assert len(parsed_files_content) == len(vectors), "file contents and vectors are misaligned"

# Vectorize the node-type strings with TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(vectors)

# Save the vectors, the fitted vectorizer and the matching file contents
joblib.dump(X, 'data_vectors.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(parsed_files_content, 'all_files_content.pkl')

# Print a summary of the errors
print(f"Total files with errors: {len(errors)}")
for error in errors:
    print(error[0], error[1][:100])  # Show the index and the first 100 characters of the problematic content

Total files with errors: 0
