In [None]:
# Quietly (-q) install or upgrade transformers, wandb, datasets and huggingface_hub
# in the kernel's environment.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
%pip install --upgrade transformers wandb datasets huggingface_hub -q

In [None]:
import pandas as pd
import numpy as np
#import tensorflow as tf
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import classification_report
import wandb
from datetime import datetime
from huggingface_hub import login
from google.colab import userdata

In [None]:
# Authenticate against the Hugging Face Hub with the token stored in the
# Colab secret named 'HUGGINGFACE_TOKEN' (the token itself never appears here).
login(
    token=userdata.get('HUGGINGFACE_TOKEN'),
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Parquet shard of each split of the PrevenIA/spanish-suicide-intent dataset,
# keyed by split name.
splits = {
    'train': 'data/train-00000-of-00001-7b34565378f02992.parquet',
    'val': 'data/val-00000-of-00001-d7338c59b5e5031f.parquet',
    'test': 'data/test-00000-of-00001-c830a979da438bff.parquet',
}

# Only the training shard is read in this cell, straight from its hf:// URL.
train_path = "hf://datasets/PrevenIA/spanish-suicide-intent/" + splits["train"]
df_train = pd.read_parquet(train_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Loading and processing the dataset

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136136 entries, 0 to 136135
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Text               136136 non-null  object
 1   Label              136136 non-null  int64 
 2   dataset            136136 non-null  object
 3   __index_level_0__  136136 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 4.2+ MB


### Statistics about text length

In [None]:
# Checkpoint whose tokenizer is used below to measure text lengths in tokens.
# NOTE(review): 'bert-base-uncased' is an English checkpoint, while the dataset is
# named "spanish-suicide-intent" — confirm this choice is intentional, or consider
# a Spanish/multilingual checkpoint.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize a text and measure its length in tokens
def get_text_length(text, tokenizer):
    """Return how many token ids `tokenizer` produces for `text`.

    The text is encoded without truncation and with special tokens added, so
    the count reflects the full, untruncated sequence.

    Args:
        text: The string to tokenize.
        tokenizer: Callable accepting ``truncation`` and ``add_special_tokens``
            keyword arguments and returning a mapping with an ``'input_ids'`` entry.

    Returns:
        The number of entries in ``'input_ids'``.
    """
    encoding = tokenizer(text, add_special_tokens=True, truncation=False)
    return len(encoding['input_ids'])

# Token count of every training text, stored next to the data as 'token_length'.
df_train['token_length'] = [get_text_length(text, tokenizer) for text in df_train['Text']]

# Descriptive statistics of the 'token_length' column.
length_stats = df_train['token_length'].describe()

# Header line followed by the statistics, each on its own line.
print("Estadísticas descriptivas de la longitud de los tokens:", length_stats, sep="\n")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors


Estadísticas descriptivas de la longitud de los tokens:
count    136136.000000
mean         65.920161
std         109.846440
min           2.000000
25%          31.000000
50%          50.000000
75%          79.000000
max        5697.000000
Name: token_length, dtype: float64


In [None]:
# 95th percentile of the token lengths, i.e. the length that 95% of the
# texts do not exceed.
percentile_95 = np.percentile(
    a=df_train['token_length'],
    q=95,
)
print(f"El 95% de los textos tienen una longitud menor o igual a: {percentile_95} tokens")

El 95% de los textos tienen una longitud menor o igual a: 111.0 tokens


In [None]:
# Split the training frame by class: rows labelled 1 and rows labelled 0.
df_label_1 = df_train.loc[df_train['Label'] == 1]
df_label_0 = df_train.loc[df_train['Label'] == 0]

# Draw 50,000 random rows from each class (same seed for both draws) so the
# resulting dataset is balanced.
sampled_label_1, sampled_label_0 = (
    subset.sample(n=50000, random_state=42)
    for subset in (df_label_1, df_label_0)
)

# Stack the two samples, positives first ...
sampled_df = pd.concat([sampled_label_1, sampled_label_0])

# ... then shuffle all rows and renumber the index from 0.
df = sampled_df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

# Cell output: number of rows per label.
df.Label.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,50000
1,50000


## Tokenizing and embedding the text

In [None]:
# Rename the 'Text' column to lowercase 'text' (rebinding `df` rather than
# mutating the frame in place).
df = df.rename(columns={'Text': 'text'})

In [None]:
# Load the BERT tokenizer and the TensorFlow BERT model from the same checkpoint.
# NOTE(review): 'bert-base-uncased' is an English checkpoint; the texts come from a
# dataset named "spanish-suicide-intent" — confirm this is the intended encoder.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertModel.from_pretrained(model_name)

# Tokenize the texts (adjust 'max_length' if needed)
def tokenize_texts(texts, tokenizer, max_length=128):
    """Encode `texts` with `tokenizer`, requesting TensorFlow tensors.

    Padding and truncation are both enabled, and sequences are limited to
    `max_length` tokens.

    Args:
        texts: The text(s) handed to the tokenizer unchanged.
        tokenizer: Callable accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        Whatever `tokenizer` returns for these options.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **encode_options)

# Process the texts in batches
def get_batches(data, batch_size):
    """Lazily yield consecutive slices of `data` of at most `batch_size` items.

    Args:
        data: Any object supporting ``len()`` and slicing (list, str, ...).
        batch_size: Number of items per slice; the last slice may be shorter.

    Yields:
        ``data[start:start + batch_size]`` for start = 0, batch_size, 2 * batch_size, ...
    """
    starts = range(0, len(data), batch_size)
    yield from (data[start:start + batch_size] for start in starts)

# Compute the [CLS] embeddings in batches
def compute_embeddings(texts, batch_size=32, max_length=128):
    """Return one [CLS] embedding per text, computed batch by batch.

    Uses the module-level ``tokenizer`` and ``bert_model`` together with the
    ``tokenize_texts`` and ``get_batches`` helpers.

    Args:
        texts: Sequence of strings; must support ``len()`` and slicing.
        batch_size: Number of texts sent through the model at once (must be > 0).
        max_length: Maximum token length passed to the tokenizer.

    Returns:
        A 2-D NumPy array with one row per input text, in input order.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if len(texts) == 0:
        # np.vstack fails on an empty list, so return an empty matrix with the
        # model's hidden width instead (float32 assumed to match the model
        # output dtype — TODO confirm).
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    # The batch generator has no len(); give tqdm the batch count (ceil division)
    # so the progress bar can show a total and an ETA.
    n_batches = (len(texts) + batch_size - 1) // batch_size

    embeddings_list = []

    for batch in tqdm(get_batches(texts, batch_size), total=n_batches):
        # Tokenize the current batch of texts
        inputs = tokenize_texts(batch, tokenizer, max_length=max_length)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Run the model and keep the per-token hidden states of its last layer
        embeddings = bert_model(input_ids, attention_mask=attention_mask)['last_hidden_state']

        # Keep only the vector at position 0 ([CLS]) of every sequence
        cls_embeddings = embeddings[:, 0, :].numpy()

        embeddings_list.append(cls_embeddings)

    # Stack the per-batch matrices into a single (n_texts, hidden) array
    return np.vstack(embeddings_list)

# Raw texts as a plain Python list, in DataFrame row order.
texts = df['text'].tolist()

# Batch size and maximum token length used for the embedding pass.
batch_size = 64
max_length = 128

# Compute the embeddings for all texts
X = compute_embeddings(texts, batch_size=batch_size, max_length=max_length)

# Convert the array to nested Python lists so each row fits in one DataFrame cell.
embeddings_as_list = X.tolist()

df['embeddings'] = embeddings_as_list

# Save the complete DataFrame to a CSV file
# NOTE(review): CSV stores each embedding as the text form of a list, which must be
# parsed again on load — a typed format such as parquet may be a better fit.
df.to_csv('dataset_with_embeddings.csv', index=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

### Split the embeddings dataset

In [None]:
# Fit the label encoder on the 'Label' column and encode that same column as the target.
label_encoder = LabelEncoder().fit(df['Label'])
y = label_encoder.transform(df['Label'])

# Hold out 2% of the samples as the test set, stratified by label ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.02,
    stratify=y,
    random_state=42,
)

# ... then take 10% of the remainder as the validation set, again stratified.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1,
    stratify=y_temp,
    random_state=42,
)

In [None]:
# Cell output: number of samples in the held-out test split.
len(y_test)

2000

# Building the model

In [None]:
import wandb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime

# Timestamp (day-month-year_hour_minute) used to give the W&B run a distinct name.
today = datetime.today()
run_date = today.strftime("%d-%m-%Y_%H_%M")

# Log into W&B with the API key stored in the Colab secret named 'wandb'.
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

# Start the W&B run that every later wandb.log call in this notebook reports to.
run = wandb.init(
    project='XGBoost on suicide intent-spanish',
    job_type="training",
    name=f"experiment_xgboost_{run_date}",
    anonymous="allow"
)

# Base XGBoost binary classifier. `use_label_encoder` is no longer passed:
# XGBoost reported it as an unused parameter on every single fit.
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Hyper-parameter search space (4 x 4 x 1 = 16 combinations).
param_grid = {
    'n_estimators': [50, 100, 200, 300],  # Number of trees
    'max_depth': [5, 7, 15, 30],          # Maximum tree depth
    'learning_rate': [0.1],               # Learning rate
}

# Randomized search: 10 sampled combinations, each scored by accuracy with
# 5-fold cross-validation. `random_state` is fixed so the sampled candidates
# (and therefore the reported best parameters) are reproducible.
randomized_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                       scoring='accuracy', cv=5, n_jobs=1, verbose=1, n_iter=10,
                                       random_state=42)

# Run the search exactly once; a second identical fit would only repeat all 50 fits.
randomized_search.fit(X_train, y_train)

# Log the best parameters to WandB
wandb.log({"best_params": randomized_search.best_params_})

# Print the best parameters
print("Best Parameters:", randomized_search.best_params_)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfelipeandres29[0m ([33mfelipeandres29-universidad-eafit[0m). Use [1m`wandb login --relogin`[0m to force relogin


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1}


# Evaluating the model after hyperparameter tuning

In [None]:
# Predict the validation split with the fitted search object.
y_pred = randomized_search.predict(X_val)

# Validation metrics: overall accuracy plus the per-class report as a dictionary.
accuracy = accuracy_score(y_val, y_pred)
class_report = classification_report(y_val, y_pred, output_dict=True)

# Send each metric to W&B in its own log call, accuracy first.
for payload in ({"val_accuracy": accuracy}, {"val_classification_report": class_report}):
    wandb.log(payload)

# Human-readable version of the same report.
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      4900
           1       0.80      0.80      0.80      4900

    accuracy                           0.80      9800
   macro avg       0.80      0.80      0.80      9800
weighted avg       0.80      0.80      0.80      9800



In [None]:
# Build a fresh XGBoost classifier from the best hyper-parameters found by the
# search and train it on X_temp / y_temp (everything except the test split).
best_params = randomized_search.best_params_
xgb_model = xgb.XGBClassifier(**best_params).fit(X_temp, y_temp)

# Predict the held-out test split.
y_pred = xgb_model.predict(X_test)

# Test metrics: accuracy, the report as a dictionary (for W&B) and as text (for display).
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
test_report_text = classification_report(y_test, y_pred)

print(test_report_text)

# Log both metrics to W&B, one call per metric.
wandb.log({"test_Accuracy": accuracy})
wandb.log({"test_classification_report": class_report})

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1000
           1       0.80      0.81      0.80      1000

    accuracy                           0.80      2000
   macro avg       0.80      0.80      0.80      2000
weighted avg       0.80      0.80      0.80      2000



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test predictions (rows: true label, columns: predicted label).
conf_matrix = confusion_matrix(y_test, y_pred)

# Position of each named outcome inside the 2x2 matrix.
cell_positions = {
    "True Negative": (0, 0),
    "False Positive": (0, 1),
    "False Negative": (1, 0),
    "True Positive": (1, 1),
}

# Nested dictionary form of the matrix, as logged to W&B.
conf_matrix_dict = {
    "Confusion Matrix": {
        outcome: conf_matrix[position] for outcome, position in cell_positions.items()
    }
}
wandb.log(conf_matrix_dict)

# Also show the raw matrix in the notebook.
print("Confusion Matrix:", conf_matrix, sep="\n")

# Close the W&B run.
run.finish()

Confusion Matrix:
[[800 200]
 [194 806]]


VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test_Accuracy,▁
val_accuracy,▁

0,1
test_Accuracy,0.803
val_accuracy,0.80347
