In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# The `kagglehub` name imported here is reused by the following cell,
# so this cell has to run first on a fresh kernel.
import kagglehub
# Authenticate with Kaggle before any data source is requested.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'mercor-ai-detection' competition data through kagglehub and keep
# the call's return value (presumably the local download location -- confirm
# against the kagglehub documentation).
mercor_ai_detection_path = kagglehub.competition_download('mercor-ai-detection')

print('Data source import complete.')


In [None]:
# Kaggle's default Python 3 image (kaggle/python, see
# https://github.com/kaggle/docker-python) ships with the usual analytics
# stack; load the two packages used throughout this notebook.

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Competition inputs live under the read-only "/kaggle/input" directory.
# Walk that tree and print the full path of every file found in it.

import os

input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved
# as output when a version is created with "Save & Run All".
# Temporary files can go to /kaggle/temp/, but they are not kept once the
# current session ends.

## **Start with my solution**

In [None]:
# ============================================================
#  INSTALL COMPATIBLE, PINNED DEPENDENCIES
# ============================================================
# Use the %pip magic instead of `!pip`: `!pip` runs in a subshell and may
# install into a different interpreter than the one backing this kernel,
# whereas %pip always targets the running kernel's environment.
# Every package is pinned so the notebook is reproducible on a fresh kernel.
%pip install -U \
    sentence-transformers==2.6.1 \
    transformers==4.39.3 \
    huggingface_hub==0.22.2 \
    scikit-learn==1.5.2 \
    numpy==1.26.4 \
    xgboost==2.1.1 \
    joblib==1.4.2



## Código para realizar la clasificación de texto

In [None]:
# ====================================================
# MERCOR AI TEXT DETECTION - Embeddings + XGBoost
# ====================================================
# End-to-end pipeline: encode "topic + answer" text with a SentenceTransformer,
# train an XGBoost classifier on the embeddings, report validation metrics,
# and write the test-set probabilities to submission.csv.

# ====================================================
# 1 Load the embedding model
# ====================================================

print("=== Fase 1 Star ===\n")

from sentence_transformers import SentenceTransformer
print("Paciencia Cargando modelo de embeddings...\n")
model_emb = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("\n----- El Modelo se cargado correctamente -----")

print("=== Fase 1 Finish ===\n")

# ====================================================
# 2 Import libraries (repeated here so this cell also runs on its own;
#   note for the team: remove if it causes no problems)
# ====================================================
print("=== Fase 2 Star===\n")

print("------ Por si las dudas volvere a cargar las librerias\n -------\n")
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score
from xgboost import XGBClassifier
import joblib
print("=== Fase 2 Finish ===\n")

# ====================================================
# 3 Load data
# ====================================================
print("=== Fase 3 Star ===\n")

print("------ Paciencia Inicia Carga de Datos --------\n")
# On Kaggle the competition files are mounted under /kaggle/input. When that
# directory is missing (e.g. running elsewhere), fall back to the location
# stored by the kagglehub download cell, if that cell was run.
KAGGLE_DATA_DIR = '/kaggle/input/mercor-ai-detection'
data_dir = KAGGLE_DATA_DIR
if not os.path.isdir(data_dir):
    data_dir = globals().get('mercor_ai_detection_path', KAGGLE_DATA_DIR)

train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')
sample_path = os.path.join(data_dir, 'sample_submission.csv')

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_sample = pd.read_csv(sample_path)

print("Los Datos cargados correctamente.\n")
print("Tamaño dataset entrenamiento:", df_train.shape)
print("Tamaño dataset test:", df_test.shape)

print("=== Fase 3 Finish===\n")

# ====================================================
# 4 Prepare texts and labels
# ====================================================

print("=== Fase 4 Star ===\n")


def _combine_topic_and_answer(frame):
    """Return one string per row: topic and answer joined by a space (NaN -> '')."""
    return frame['topic'].fillna('') + " " + frame['answer'].fillna('')


# Merge the relevant columns into a single text field for both splits.
df_train['text'] = _combine_topic_and_answer(df_train)
df_test['text'] = _combine_topic_and_answer(df_test)

X = df_train['text']
y = df_train['is_cheating']

# Hold out 20% for validation, stratified so both splits keep the class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("=== Fase 4 Finish ===\n")

# ====================================================
# 5 Create embeddings (SentenceTransformer)
# ====================================================
print("=== Fase 5 Star  ===\n")

print(" Generando embeddings del conjunto de entrenamiento...")
X_train_emb = model_emb.encode(X_train.tolist(), show_progress_bar=True)

print("🔹 Generando embeddings del conjunto de validación...")
X_val_emb = model_emb.encode(X_val.tolist(), show_progress_bar=True)

print("=== Fase 5 Finish===\n")

print("=== Fase 6 Star Modelo XGBooST ===\n")


# ====================================================
# 6 Train the XGBoost model
# ====================================================
# `use_label_encoder` is intentionally not passed: the pinned XGBoost 2.x
# ignores it and only emits a "Parameters ... are not used" warning.
xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

print("---- Paciencia Entrenando modelo XGBoost... ----")
xgb.fit(X_train_emb, y_train)

print("=== Fase 6 Finish Backgrond===\n")

# ====================================================
# 7 Evaluate the model
# ====================================================
print("=== Fase 7 Star Evaluacion de Modelo ===\n")
print("\n ----- Evaluando modelo... --------")
X_val_pred = xgb.predict(X_val_emb)
# Column 1 of predict_proba is the probability of the positive class.
X_val_prob = xgb.predict_proba(X_val_emb)[:, 1]

roc = roc_auc_score(y_val, X_val_prob)
f1 = f1_score(y_val, X_val_pred)
acc = accuracy_score(y_val, X_val_pred)

print("Resultados de Validación\n")
print(classification_report(y_val, X_val_pred))
print(f"ROC AUC: {roc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {acc:.4f}")

print("=== Fase 7 Finish Evaluacion de Modelo ===\n")

# ====================================================
# 8 Generate predictions for the test set
# ====================================================

print("=== Fase 8 Star Embeddings ===\n")

print("\n------ Generando embeddings del conjunto de prueba... -------")
X_test_emb = model_emb.encode(df_test['text'].tolist(), show_progress_bar=True)
test_pred = xgb.predict_proba(X_test_emb)[:, 1]

# Build the submission file. The prediction column is named after the single
# non-id column of the sample submission so the header matches what the
# competition expects; if that cannot be determined, keep the name 'label'.
target_cols = [col for col in df_sample.columns if col != 'id']
target_col = target_cols[0] if len(target_cols) == 1 else 'label'

submission = pd.DataFrame({
    'id': df_test['id'],
    target_col: test_pred
})

submission.to_csv('submission.csv', index=False)
print("\n Archivo 'submission.csv' generado correctamente.\n")
print("== Fase 8 Finish  Embeddings  ===\n")




=== Fase 1 Star ===/n
Paciencia Cargando modelo de embeddings.../n
/n----- El Modelo se cargado correctamente -----
=== Fase 1 Finish ===/n
=== Fase 2 Star===/n
------ Por si las dudas volvere a cargar las librerias/n -------/n
=== Fase 2 Finish ===/n
=== Fase 3 Star ===/n
------ Paciencia Inicia Carga de Datos --------/n
Los Datos cargados correctamente./n
Tamaño dataset entrenamiento: (269, 4)
Tamaño dataset test: (264, 3)
=== Fase 3 Finish===/n
=== Fase 4 Star ===/n
=== Fase 4 Finish ===/n
=== Fase 5 Star  ===/n
 Generando embeddings del conjunto de entrenamiento...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

🔹 Generando embeddings del conjunto de validación...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

=== Fase 5 Finish===/n
=== Fase 6 Star Modelo XGBooST ===/n
---- Paciencia Entrenando modelo XGBoost... ----


Parameters: { "use_label_encoder" } are not used.



=== Fase 6 Finish Backgrond===/n
=== Fase 7 Star Evaluacion de Modelo ===/n

 ----- Evaluando modelo... --------
Resultados de Validación/n
              precision    recall  f1-score   support

           0       0.72      0.75      0.73        24
           1       0.79      0.77      0.78        30

    accuracy                           0.76        54
   macro avg       0.76      0.76      0.76        54
weighted avg       0.76      0.76      0.76        54

ROC AUC: 0.8722
F1 Score: 0.7797
Accuracy: 0.7593
=== Fase 7 Finish Evaluacion de Modelo ===/n
=== Fase 8 Star Embeddings ===/n

------ Generando embeddings del conjunto de prueba... -------


Batches:   0%|          | 0/9 [00:00<?, ?it/s]


 Archivo 'submission.csv' generado correctamente./n
== Fase 8 Finish  Embeddings  ===/n


## Generación del archivo .csv para comparar

In [None]:
# Display command: list the current directory in long format with
# human-readable sizes.
!ls -lh

total 8.0K
-rw-r--r-- 1 root root 7.2K Oct 22 04:07 submission.csv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Download the file: the FileLink object is the cell's last expression, so it
# is shown as the cell output for 'submission.csv'.
from IPython.display import FileLink
FileLink('submission.csv')
