In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix

Train + test: load both CSV files, pool them, and re-split the pooled data.

In [None]:
# Seed NumPy's global RNG so NumPy-driven randomness further down is repeatable.
np.random.seed(42)
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Pool both files into a single frame. ignore_index=True gives every row a
# unique 0..n-1 label; without it each frame keeps its own index, labels
# repeat, and any later label-based lookup or alignment becomes ambiguous.
data = pd.concat([train, test], ignore_index=True)
# Replace missing texts with empty strings so the encoder never receives NaN.
data['text'] = data['text'].fillna('')

# Hold out 15% of the pooled rows, keeping the label proportions in both parts.
# NOTE(review): pooling train.csv with test.csv and re-splitting discards the
# files' original split -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.15, random_state=42, stratify = data['label'])

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pre-trained checkpoint used to embed every document.
# NOTE(review): this is "facebook/bart-base", not a sentence-transformers
# checkpoint such as 'all-MiniLM-L6-v2' -- confirm the choice is intended.
EMBEDDING_CHECKPOINT = "facebook/bart-base"
model = SentenceTransformer(EMBEDDING_CHECKPOINT)


In [None]:
# Get document vectors.
# `encode` accepts a whole list of texts and embeds it in batches, returning
# one vector per input in input order. Calling it once per collection avoids
# the overhead of one forward pass per document.
train_vectors = model.encode(X_train.tolist())
test_vectors = model.encode(X_test.tolist())

# Embeddings and labels for the complete pooled dataset (used for final refits).
data_vectors = model.encode(data['text'].tolist())
data_y = data['label']

## SVM

In [None]:
from sklearn import svm

# Every kernel is searched over the same candidate values of C.
svm_c_values = [1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9]

# One sub-grid per kernel; only the polynomial kernel also varies `degree`.
param_grid = [
    {'C': svm_c_values, 'kernel': [kernel_name]}
    for kernel_name in ('linear', 'rbf', 'sigmoid')
]
param_grid.append({'C': svm_c_values, 'kernel': ['poly'], 'degree': [1, 2, 3, 4]})

# Shuffled, stratified 10-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# probability=True so the fitted SVC can produce class probabilities.
clf = svm.SVC(probability=True)

# Exhaustive search over the grid, ranked by macro-averaged F1, on all cores.
grid_search_svm = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_svm.fit(train_vectors, y_train)

# Report the winning configuration and its cross-validated score.
print("Best parameters: ", grid_search_svm.best_params_)
print("Best score: ", grid_search_svm.best_score_)

In [None]:
# Label the held-out embeddings with the SVM grid search fitted above.
predicted = grid_search_svm.predict(test_vectors)

In [None]:
# Score the current `predicted` labels against the held-out ground truth.
balanced_acc = balanced_accuracy_score(y_test, predicted)
print("Balanced accuracy: ", balanced_acc)

# Per-class precision / recall / F1 table.
report = classification_report(y_test, predicted)
print(report)

In [None]:
# Confusion matrix of the current `predicted` labels on the held-out split.
cm = confusion_matrix(y_test, predicted)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu"  )
# scikit-learn puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
ax.set(xlabel="Predicted label", ylabel="True label", title="SVM confusion matrix")
plt.show()

In [None]:
# Re-run the SVM grid search on the embeddings of the whole pooled dataset.
# NOTE(review): `data_vectors` also covers the held-out rows, so the object
# must not be evaluated on `test_vectors` after this cell -- confirm intent.
grid_search_svm.fit(data_vectors, data_y)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

# Search space: a narrow band of small C values, both penalties supported by
# the liblinear solver, and balanced class weights.
param_grid_lr = {
    'C': [0.0006, 0.0007, 0.0008, 0.0009, 0.0010, 0.0011, 0.0012],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': ['balanced'],
}

# Shuffled, stratified 10-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Build the GridSearchCV object (macro-F1 scoring, all cores) and fit it.
grid_search_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_lr.fit(train_vectors, y_train)

# Print the best parameters and the best cross-validated score.
print("Logistic Regression: Mejores parámetros: ", grid_search_lr.best_params_)
print("Logistic Regression: Mejor puntuación: ", grid_search_lr.best_score_)

In [None]:
# Label the held-out embeddings with the logistic-regression grid search.
predicted = grid_search_lr.predict(test_vectors)

In [None]:
# Balanced accuracy of the current `predicted` labels on the held-out split.
balanced_acc = balanced_accuracy_score(y_test, predicted)
print("Balanced accuracy: ", balanced_acc)

# Per-class precision / recall / F1 table.
report = classification_report(y_test, predicted)
print(report)

In [None]:
# Confusion matrix of the current `predicted` labels on the held-out split.
cm = confusion_matrix(y_test, predicted)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu")
# Rows are true labels and columns are predicted labels (scikit-learn's
# convention); label the axes so the figure stands alone.
ax.set(xlabel="Predicted label", ylabel="True label", title="Logistic Regression confusion matrix")
plt.show()

In [None]:
# Re-run the logistic-regression grid search on the whole pooled dataset.
# NOTE(review): `data_vectors` also covers the held-out rows, so the object
# must not be evaluated on `test_vectors` after this cell -- confirm intent.
grid_search_lr.fit(data_vectors, data_y)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space: forest size 100..190 in steps of 10, depth 10..14, and the
# minimum number of samples required to split a node from 2 to 7.
param_grid_rf = {
    'n_estimators': list(range(100, 200, 10)),
    'max_depth': list(range(10, 15)),
    'min_samples_split': list(range(2, 8)),
}

# Base model, seeded for repeatable forests.
rf = RandomForestClassifier(random_state=42)

# Shuffled, stratified 10-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Build the GridSearchCV object (macro-F1 scoring, all cores) and fit it.
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(train_vectors, y_train)

# Report the winning configuration and its cross-validated score.
print("Random Forest: Mejores parámetros: ", grid_search_rf.best_params_)
print("Random Forest: Mejor puntuación: ", grid_search_rf.best_score_)

In [None]:
# Label the held-out embeddings with the random-forest grid search.
predicted = grid_search_rf.predict(test_vectors)

In [None]:
# Balanced accuracy of the current `predicted` labels on the held-out split.
balanced_acc = balanced_accuracy_score(y_test, predicted)
print("Balanced accuracy: ", balanced_acc)

# Per-class precision / recall / F1 table.
report = classification_report(y_test, predicted)
print(report)

In [None]:
# Confusion matrix of the current `predicted` labels on the held-out split.
cm = confusion_matrix(y_test, predicted)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu")
# Rows are true labels and columns are predicted labels (scikit-learn's
# convention); label the axes so the figure stands alone.
ax.set(xlabel="Predicted label", ylabel="True label", title="Random Forest confusion matrix")
plt.show()

In [None]:
# Re-run the random-forest grid search on the whole pooled dataset.
# NOTE(review): `data_vectors` also covers the held-out rows, so the object
# must not be evaluated on `test_vectors` after this cell -- confirm intent.
grid_search_rf.fit(data_vectors, data_y)

## Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs combined by the ensemble. Each member is a whole
# GridSearchCV object; VotingClassifier fits clones of its members, so every
# ensemble fit re-runs all three searches on the data it is given.
voting_members = [
    ('svm', grid_search_svm),
    ('lr', grid_search_lr),
    ('rf', grid_search_rf),
]

# Soft voting averages the members' predicted class probabilities.
ensemble = VotingClassifier(estimators=voting_members, voting='soft')

ensemble.fit(train_vectors, y_train)

In [None]:
# Label the held-out embeddings with the fitted ensemble.
predicted = ensemble.predict(test_vectors)

# Balanced accuracy first, then the per-class precision / recall / F1 table.
balanced_acc = balanced_accuracy_score(y_test, predicted)
print("Balanced accuracy: ", balanced_acc)
report = classification_report(y_test, predicted)
print(report)

In [None]:
# Confusion matrix of the current `predicted` labels on the held-out split.
cm = confusion_matrix(y_test, predicted)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu")
# Rows are true labels and columns are predicted labels (scikit-learn's
# convention); label the axes so the figure stands alone.
ax.set(xlabel="Predicted label", ylabel="True label", title="Ensemble confusion matrix")
plt.show()

In [None]:
# Refit the ensemble on the embeddings of the whole pooled dataset before it
# is saved. Its members are GridSearchCV objects, so this repeats every search.
# NOTE(review): `data_vectors` also covers the held-out rows, so the ensemble
# must not be evaluated on `test_vectors` after this cell.
ensemble.fit(data_vectors, data_y)

In [None]:
import os

# joblib is not imported anywhere else in the notebook; without this import
# the cell fails with NameError on a fresh kernel.
import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs("modelos", exist_ok=True)

# Persist the fitted ensemble to disk.
joblib.dump(ensemble, "modelos/_transformers_ensemble_model")