In [50]:
!pip install imblearn

Collecting imblearn
  Obtaining dependency information for imblearn from https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl.metadata
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/5a/fa/267de06c95210580f4b82b45cec1ce1e9ce1f21a01a684367db89e7da70d/imbalanced_learn-0.12.3-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
   ---------------------------------------- 0.0/258.3 kB ? eta -:--:--
   --- ----------------------------------- 20.5/258.3 kB 682.7 kB/s eta 0:00:01
   -------------- ------------------------- 92.2/258.3 kB 1.3 MB/s eta 0:00:01
   -----------------------


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: C:\Users\Flavio Ruvalcaba\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [51]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

def load_files_from_directory(directory):
    files = []
    labels = []
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            with open(os.path.join(root, filename), 'r', encoding='utf-8') as f:
                files.append(f.read())
                if 'plagio' in root:
                    labels.append(1)
                else:
                    labels.append(0)
    return files, labels

# Paths de las carpetas de datos
test_noplag_path = r"C:\Users\Flavio Ruvalcaba\Documents\Escuela\Universidad\8Semestre\PlagiarismDetector\finalDataset\split\test\noplag"
test_plagio_path = r"C:\Users\Flavio Ruvalcaba\Documents\Escuela\Universidad\8Semestre\PlagiarismDetector\finalDataset\split\test\plagio"
train_noplag_path = r"C:\Users\Flavio Ruvalcaba\Documents\Escuela\Universidad\8Semestre\PlagiarismDetector\finalDataset\split\train\noplag"
train_plagio_path = r"C:\Users\Flavio Ruvalcaba\Documents\Escuela\Universidad\8Semestre\PlagiarismDetector\finalDataset\split\train\plagio"

# Cargar datos de entrenamiento y prueba
train_files_noplag, train_labels_noplag = load_files_from_directory(train_noplag_path)
train_files_plagio, train_labels_plagio = load_files_from_directory(train_plagio_path)
test_files_noplag, test_labels_noplag = load_files_from_directory(test_noplag_path)
test_files_plagio, test_labels_plagio = load_files_from_directory(test_plagio_path)

# Combinar datos de entrenamiento y prueba
train_files = train_files_noplag + train_files_plagio
train_labels = train_labels_noplag + train_labels_plagio
test_files = test_files_noplag + test_files_plagio
test_labels = test_labels_noplag + test_labels_plagio

# Transformación TF-IDF con bigramas y eliminación de stopwords
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X_train = vectorizer.fit_transform(train_files)
X_test = vectorizer.transform(test_files)


In [52]:
# Balancear los datos de entrenamiento usando SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, train_labels)


In [53]:
# Definir el modelo
model = RandomForestClassifier(random_state=42)

# Definir los parámetros para el grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Configurar GridSearchCV para encontrar los mejores hiperparámetros
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Ajustar el grid search al conjunto de entrenamiento balanceado
grid_search.fit(X_train_balanced, y_train_balanced)

# Mejor conjunto de hiperparámetros
print(f"Mejores parámetros: {grid_search.best_params_}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Mejores parámetros: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [54]:
# Evaluar el mejor modelo encontrado por GridSearchCV
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(test_labels, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(test_labels, y_pred))

# Evaluar el modelo con validación cruzada
cv_scores = cross_val_score(best_model, X_train_balanced, y_train_balanced, cv=5, scoring='accuracy')
print(f"Validación cruzada accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


Accuracy: 0.7019
              precision    recall  f1-score   support

           0       0.62      0.39      0.48       146
           1       0.73      0.87      0.79       270

    accuracy                           0.70       416
   macro avg       0.67      0.63      0.64       416
weighted avg       0.69      0.70      0.68       416

Validación cruzada accuracy: 0.7554 ± 0.1113
