In [12]:
import pandas as pd

# Read the dataset CSV; the 'Text' column is the model input and the
# 'IsToxic' column is the classification target.
df = pd.read_csv('youtoxic_english_1000.csv')
X, y = df['Text'], df['IsToxic']

In [13]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def bert_encode(texts, batch_size=32):
    """Encode texts into fixed-size vectors taken from BERT's first token.

    Each text is tokenized (special tokens added, padded/truncated to 128
    tokens) and passed through the module-level ``model`` with gradient
    tracking disabled. The last-layer hidden state at sequence position 0
    is kept as the feature vector of the text.

    Texts are processed in mini-batches so that peak memory depends on
    ``batch_size`` instead of on the total number of texts.

    Args:
        texts: Iterable of strings to encode.
        batch_size: Number of texts sent through the model per forward
            pass. Must be a positive integer.

    Returns:
        A 2-D NumPy array with one row per input text (zero rows when
        ``texts`` is empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize so that any iterable can be sliced into batches.
    texts = list(texts)
    if not texts:
        # Nothing to encode: return an empty matrix with the model's width.
        return torch.empty((0, model.config.hidden_size)).numpy()

    cls_vectors = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**encoded)
        # Keep only the hidden state at position 0 of every sequence.
        cls_vectors.append(outputs.last_hidden_state[:, 0, :])

    return torch.cat(cls_vectors, dim=0).numpy()

# Turn the text column into a matrix of feature vectors.
X_encoded = bert_encode(texts=X.to_list())

In [14]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Base learners. The seeded estimators share one random_state and the two
# tree-based ensembles share the same number of estimators.
_seed_kwargs = {'random_state': 42}
_tree_kwargs = {'n_estimators': 100, **_seed_kwargs}

rf = RandomForestClassifier(**_tree_kwargs)
gb = GradientBoostingClassifier(**_tree_kwargs)
# probability=True makes the SVC expose class probabilities.
svm = SVC(probability=True, **_seed_kwargs)
nb = GaussianNB()

modelos = [rf, gb, svm, nb]

In [15]:
from sklearn.ensemble import VotingClassifier

# Soft voting: the ensemble combines the predicted class probabilities of
# the named base learners.
named_estimators = [('rf', rf), ('gb', gb), ('svm', svm), ('nb', nb)]
ensemble = VotingClassifier(voting='soft', estimators=named_estimators)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the encoded samples for evaluation (fixed seed so the
# split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

ensemble.fit(X_train, y_train)

# Predictions on the training data
y_train_pred = ensemble.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Predictions on the test data (computed once and reused below)
y_test_pred = ensemble.predict(X_test)
y_pred = y_test_pred  # alias kept so code referring to `y_pred` still works
test_accuracy = accuracy_score(y_test, y_test_pred)

# Overfitting estimate: gap between training and test accuracy
overfitting = train_accuracy - test_accuracy

print("Precisión en entrenamiento:", train_accuracy)
print("Precisión en prueba:", test_accuracy)
print("Overfitting:", overfitting)


print("\nInforme de clasificación en prueba:")
print(classification_report(y_test, y_test_pred))


Precisión en entrenamiento: 0.89875
Precisión en prueba: 0.76
Overfitting: 0.13875000000000004

Informe de clasificación en prueba:
              precision    recall  f1-score   support

       False       0.74      0.75      0.74        93
        True       0.78      0.77      0.77       107

    accuracy                           0.76       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.76      0.76       200



In [19]:
import joblib

# Persist the ensemble to disk
joblib.dump(value=ensemble, filename='ensemble_model.joblib')

['ensemble_model.joblib']