# Modelos de clasificación para texto
Bastián González-Bustamante \
Universidad Diego Portales \
Noviembre 2024

In [1]:
## Dependencies
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
## Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
## Load dataset from GitHub URL
url = "https://raw.githubusercontent.com/training-datalab/gold-standard-toxicity/refs/heads/main/data/tidy/goldstd_protests.csv"
df = pd.read_csv(url)

df.head()

Unnamed: 0,id_obs,coder_1,coder_2,consensus,sec_create_1,sec_create_2,sec_review_1,sec_review_2,possibly_sensitive,lang,...,THREAT,date,tox_60,tox_70,tox_80,tox_90,insult_60,insult_70,insult_80,insult_90
0,101238,0,0,1.0,46,28,17,8,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
1,119343,0,0,1.0,8,6,0,2,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
2,122343,0,0,1.0,8,6,1,0,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
3,131878,0,0,1.0,4,52,0,1,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
4,132171,0,0,1.0,6,15,0,1,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0


In [4]:
## Preprocessing function
def preprocess_text(text):
    ## Lowercase
    text = text.lower()
    ## Remove punctuation and non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    ## Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [5]:
## Apply preprocessing to the text data
df['text'] = df['text'].apply(preprocess_text)

In [6]:
## Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['coder_1'], test_size=0.2, random_state=86)

## Vectorize text data
## vectorizer = TfidfVectorizer(max_features=5000)

## Vectorize text data in Spanish
spanish_stop_words = stopwords.words('spanish')
vectorizer = TfidfVectorizer(max_features=10000, stop_words=spanish_stop_words, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [7]:
## Define function to train model and evaluate metrics
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

In [8]:
## NB
nb_model = MultinomialNB()

In [9]:
## Evaluate Naive Bayes
nb_metrics = evaluate_model(nb_model, X_train_vec, y_train, X_test_vec, y_test)
print("Naive Bayes Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*nb_metrics))

Naive Bayes Metrics - Accuracy: 0.7300, Precision: 0.7083, Recall: 0.8947, F1 Score: 0.7907


In [18]:
## Hyperparameter tuning
nb_model_ft = MultinomialNB(alpha=0.5)
## For Naive Bayes, MultinomialNB has limited tunable parameters, but adjusting the smoothing parameter (alpha) can sometimes improve performance:
## alpha: Additive smoothing parameter. A value closer to 0 assumes word frequency distributions are more reliable, but alpha=1.0 is generally a good default.

In [19]:
## Evaluate Naive Bayes
nb_metrics_ft = evaluate_model(nb_model_ft, X_train_vec, y_train, X_test_vec, y_test)
print("Naive Bayes Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*nb_metrics_ft))

Naive Bayes Metrics - Accuracy: 0.7250, Precision: 0.7092, Recall: 0.8772, F1 Score: 0.7843


In [20]:
# SVM
svm_model = SVC(kernel='linear', random_state=86)

In [21]:
# Evaluate SVM
svm_metrics = evaluate_model(svm_model, X_train_vec, y_train, X_test_vec, y_test)
print("SVM Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*svm_metrics))

SVM Metrics - Accuracy: 0.7450, Precision: 0.7890, Recall: 0.7544, F1 Score: 0.7713


In [22]:
## Hyperparameter tuning
svm_model_ft = SVC(kernel='linear', C=1.0, random_state=86)
## For text data, linear kernels often perform well, but you might want to adjust C to control regularization.
## C: Regularization parameter. A smaller C increases regularization strength, which can help avoid overfitting but might reduce accuracy.
## kernel: The kernel type (linear is generally suitable for text data).

In [23]:
# Evaluate SVM
svm_metrics_ft = evaluate_model(svm_model_ft, X_train_vec, y_train, X_test_vec, y_test)
print("SVM Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*svm_metrics_ft))

SVM Metrics - Accuracy: 0.7450, Precision: 0.7890, Recall: 0.7544, F1 Score: 0.7713


In [24]:
## Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=86)

In [25]:
# Evaluate Random Forest
rf_metrics = evaluate_model(rf_model, X_train_vec, y_train, X_test_vec, y_test)
print("Random Forest Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*rf_metrics))

Random Forest Metrics - Accuracy: 0.7500, Precision: 0.8636, Recall: 0.6667, F1 Score: 0.7525


In [26]:
## Hyperparameter tuning
rf_model_ft = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, min_samples_leaf=2, random_state=86)
## For Random Forest, you can tune the number of trees, maximum depth, and minimum samples per leaf to optimize its performance:
## n_estimators: Number of trees in the forest. More trees can improve accuracy but increase computation time (e.g., 100, 200).
## max_depth: Maximum depth of each tree. Limiting depth can prevent overfitting (e.g., max_depth=10).
## min_samples_split: Minimum samples required to split an internal node. A higher value can reduce overfitting.
## min_samples_leaf: Minimum samples required at a leaf node.

In [27]:
# Evaluate Random Forest
rf_metrics_ft = evaluate_model(rf_model_ft, X_train_vec, y_train, X_test_vec, y_test)
print("Random Forest Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*rf_metrics_ft))

Random Forest Metrics - Accuracy: 0.6850, Precision: 0.6889, Recall: 0.8158, F1 Score: 0.7470


In [28]:
## XGBoost
xgb_model = XGBClassifier(eval_metric='logloss', random_state=86)

In [29]:
# Evaluate XGBoost
xgb_metrics = evaluate_model(xgb_model, X_train_vec, y_train, X_test_vec, y_test)
print("XGBoost Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*xgb_metrics))

XGBoost Metrics - Accuracy: 0.7200, Precision: 0.8152, Recall: 0.6579, F1 Score: 0.7282


In [30]:
## Hyperparameter tuning
xgb_model_ft = XGBClassifier(eval_metric='logloss', learning_rate=0.1, n_estimators=500, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=86)
## XGBoost has several parameters that can greatly influence its performance. The learning_rate and n_estimators are especially important.
## learning_rate: Controls the step size of each update. Lower values improve generalization (e.g., 0.1, 0.01).
## n_estimators: Number of boosting rounds. More rounds usually improve performance but increase training time.
## max_depth: Maximum depth of each tree, controlling complexity.
## subsample: Fraction of samples used for each tree. Reducing this can help prevent overfitting.
## colsample_bytree: Fraction of features used for each tree.

In [31]:
# Evaluate XGBoost
xgb_metrics_ft = evaluate_model(xgb_model_ft, X_train_vec, y_train, X_test_vec, y_test)
print("XGBoost Metrics - Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}".format(*xgb_metrics_ft))

XGBoost Metrics - Accuracy: 0.7200, Precision: 0.8021, Recall: 0.6754, F1 Score: 0.7333
