In [None]:
import pandas as pd
import joblib

# Feature matrices persisted with joblib (presumably TF-IDF, judging by the
# file names). joblib.load unpickles its input, so only load trusted files.
X_train, X_test = (
    joblib.load(pkl_path)
    for pkl_path in ('X_train_tfidf.pkl', 'X_test_tfidf.pkl')
)

# Load the labels: the 'IsToxic' column of each label CSV.
y_train, y_test = (
    pd.read_csv(csv_path)['IsToxic']
    for csv_path in ('y_train.csv', 'y_test.csv')
)


## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline classifiers, all trained and scored on the same features.
# random_state is pinned on the estimators whose fitting involves randomness so
# that re-running the notebook reproduces the same reports.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Linear SVC': LinearSVC(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Fit each model on the training split and print its per-class metrics on the
# test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\n=== {name} ===")
    print(classification_report(y_test, preds))



=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.69      0.85      0.76       108
           1       0.76      0.54      0.63        92

    accuracy                           0.71       200
   macro avg       0.72      0.70      0.70       200
weighted avg       0.72      0.71      0.70       200


=== Naive Bayes ===
              precision    recall  f1-score   support

           0       0.66      0.81      0.73       108
           1       0.70      0.50      0.58        92

    accuracy                           0.67       200
   macro avg       0.68      0.66      0.65       200
weighted avg       0.68      0.67      0.66       200


=== Linear SVC ===
              precision    recall  f1-score   support

           0       0.70      0.77      0.73       108
           1       0.70      0.62      0.66        92

    accuracy                           0.70       200
   macro avg       0.70      0.69      0.69       200
weig

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report
import numpy as np

# Class-weighted logistic regression, evaluated with 5-fold cross-validation
# on the training data only.
model = LogisticRegression(class_weight='balanced', max_iter=1000)

# === 1. Per-fold accuracy ===
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())

# === 2. Out-of-fold predictions, used to build the classification report ===
y_train_pred = cross_val_predict(estimator=model, X=X_train, y=y_train, cv=5)
report = classification_report(y_train, y_train_pred)
print("\nClassification Report (Train):")
print(report)


Cross-Validation Accuracy Scores: [0.725   0.70625 0.70625 0.69375 0.725  ]
Mean Accuracy: 0.71125
Standard Deviation: 0.012119199643540812

Classification Report (Train):
              precision    recall  f1-score   support

           0       0.71      0.77      0.74       430
           1       0.71      0.64      0.67       370

    accuracy                           0.71       800
   macro avg       0.71      0.71      0.71       800
weighted avg       0.71      0.71      0.71       800



In [None]:
# The 'text' column of the train and test CSVs, i.e. the un-vectorized input
# that the TF-IDF pipelines are fitted on.
X_train_text, X_test_text = (
    pd.read_csv(csv_path)['text']
    for csv_path in ('train_data.csv', 'test_data.csv')
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Pipeline chaining vectorization and the classifier, so the TF-IDF vocabulary
# is re-fitted inside every cross-validation fold.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # random_state pins the data shuffling done by the 'liblinear' and 'saga'
    # solvers searched below, making the search reproducible.
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

# Grid-search parameters
param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],       # unigrams, or unigrams + bigrams
    'tfidf__min_df': [2, 5],                     # keep terms appearing in at least 2 or 5 documents
    'tfidf__max_df': [0.9, 0.95],                # drop terms that are too frequent
    'tfidf__max_features': [3000, 5000],         # cap the vocabulary size

    'clf__C': [0.001, 0.01, 0.1, 1, 10],
    'clf__solver': ['liblinear', 'saga'],
    'clf__penalty': ['l1', 'l2'],
    'clf__class_weight': [None, 'balanced']
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=2)

# Run the search on the raw text; the pipeline performs the vectorization.
grid.fit(X_train_text, y_train)

# Results
print("Mejores parámetros:", grid.best_params_)
print("Mejor F1-score (validación cruzada):", grid.best_score_)


Fitting 5 folds for each of 640 candidates, totalling 3200 fits
Mejores parámetros: {'clf__C': 0.001, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__max_df': 0.9, 'tfidf__max_features': 3000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Mejor F1-score (validación cruzada): 0.7037745222767379


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Evaluate the tuned logistic-regression pipeline on the held-out test set.
# The grid search fitted in the previous cell (`grid`) uses exactly the same
# pipeline and parameter grid, so it is reused here instead of repeating the
# 3200 fits.
from sklearn.metrics import classification_report

print("Mejores parámetros:", grid.best_params_)
print("Mejor F1-score (validación cruzada):", grid.best_score_)

# best_estimator_ is the pipeline refitted on the whole training set with the
# best parameter combination, so it accepts raw text directly.
best_model = grid.best_estimator_
y_test_pred = best_model.predict(X_test_text)
print(classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 640 candidates, totalling 3200 fits
Mejores parámetros: {'clf__C': 0.001, 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__max_df': 0.9, 'tfidf__max_features': 3000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Mejor F1-score (validación cruzada): 0.7037745222767379
              precision    recall  f1-score   support

           0       0.81      0.61      0.70       108
           1       0.65      0.84      0.73        92

    accuracy                           0.71       200
   macro avg       0.73      0.72      0.71       200
weighted avg       0.74      0.71      0.71       200



In [None]:
# Persist the tuned logistic-regression pipeline (TF-IDF vectorizer + classifier).
# `best_model` is used rather than `gbm`: `gbm` is only defined later, in the
# GBM section, and is not the logistic-regression model this file name promises.
joblib.dump(best_model, 'log_regr.pkl')

['log_regr.pkl']

## GBM

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Baseline gradient-boosting classifier trained on the pre-computed features.
# random_state is fixed so repeated runs give the same model and report.
gbm = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.68      0.84      0.76       108
           1       0.75      0.54      0.63        92

    accuracy                           0.70       200
   macro avg       0.72      0.69      0.69       200
weighted avg       0.71      0.70      0.70       200



In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for gradient boosting on the pre-computed features
# (2 x 2 x 2 = 8 candidates).
param_grid_gbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05],
    'max_depth': [3, 5]
}

# 5-fold search scored by F1. The estimator is seeded so that candidate scores,
# and therefore the selected parameters, are reproducible between runs.
grid_gbm = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid_gbm,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_gbm.fit(X_train, y_train)

print("Mejores parámetros GBM:", grid_gbm.best_params_)
print("Mejor F1 (GBM):", grid_gbm.best_score_)

Mejores parámetros GBM: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Mejor F1 (GBM): 0.6292761602917867


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# TF-IDF + gradient-boosting pipeline tuned end-to-end on the raw text, so the
# vectorizer settings are searched together with the classifier settings.
# NOTE: this rebinds `pipe`, `param_grid`, `grid`, `best_model` and
# `y_test_pred`, which earlier cells used for the logistic-regression search.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Seeded so the search results are reproducible between runs.
    ('clf', GradientBoostingClassifier(random_state=42))
])

param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2)],   # unigrams, or unigrams + bigrams
    'tfidf__min_df': [2, 5],                 # minimum document frequency of a term
    'tfidf__max_df': [0.9, 0.95],            # drop terms that are too frequent
    'tfidf__max_features': [3000, 5000],     # cap the vocabulary size

    'clf__n_estimators': [100, 200],
    'clf__learning_rate': [0.1, 0.05],
    'clf__max_depth': [3, 5]
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=2)
grid.fit(X_train_text, y_train)

print("Mejores parámetros:", grid.best_params_)
print("Mejor F1-score (validación cruzada):", grid.best_score_)

# Score the refitted best pipeline on the held-out test text.
best_model = grid.best_estimator_
y_test_pred = best_model.predict(X_test_text)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 128 candidates, totalling 640 fits
Mejores parámetros: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 200, 'tfidf__max_df': 0.9, 'tfidf__max_features': 5000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Mejor F1-score (validación cruzada): 0.6596075488325731
              precision    recall  f1-score   support

           0       0.68      0.80      0.73       108
           1       0.70      0.55      0.62        92

    accuracy                           0.69       200
   macro avg       0.69      0.68      0.68       200
weighted avg       0.69      0.69      0.68       200



In [None]:
# Persist the `gbm` estimator to disk; the call is left as the last expression
# so the cell displays the list of written file names.
# NOTE(review): this saves `gbm`, not the grid-searched `best_model` or
# `grid_gbm.best_estimator_` — confirm that this is the intended artifact.
gbm_path = 'modelo_gbm.pkl'
joblib.dump(gbm, gbm_path)

['modelo_gbm.pkl']