# Demostración Ridge-Lasso
## Data Mining - Doctorado UDP 2024
**Bastián González-Bustamante** \
Noviembre 2024

In [1]:
## Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
import re
import string
from nltk.corpus import stopwords
import nltk

In [2]:
## Download the NLTK stopwords corpus used later by stopwords.words('spanish').
## When the corpus is already installed, nltk reports that it is up-to-date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
## Load dataset from GitHub URL
## The URL points at the `main` branch rather than a pinned commit, so the
## contents of the CSV can change between runs.
url = "https://raw.githubusercontent.com/training-datalab/gold-standard-toxicity/refs/heads/main/data/tidy/goldstd_protests.csv"
df = pd.read_csv(url)

## Preview the first rows of the raw frame
df.head()

Unnamed: 0,id_obs,coder_1,coder_2,consensus,sec_create_1,sec_create_2,sec_review_1,sec_review_2,possibly_sensitive,lang,...,THREAT,date,tox_60,tox_70,tox_80,tox_90,insult_60,insult_70,insult_80,insult_90
0,101238,0,0,1.0,46,28,17,8,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
1,119343,0,0,1.0,8,6,0,2,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
2,122343,0,0,1.0,8,6,1,0,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
3,131878,0,0,1.0,4,52,0,1,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0
4,132171,0,0,1.0,6,15,0,1,False,es,...,,2020-08-17,0,0,0,0,0,0,0,0


In [4]:
## Filter and select columns
## .copy() makes the two-column frame independent of the raw frame, so the
## column assignments that follow do not trigger pandas' SettingWithCopyWarning.
df = df[['coder_1', 'text']].copy()

## Map coder_1 (0/1) to string labels; any value missing from the dict becomes NaN
df['coder_1'] = df['coder_1'].map({0: 'NONTOXIC', 1: 'TOXIC'})

## Text cleaning function
## Translation table that deletes ASCII punctuation plus the Spanish inverted
## marks (which string.punctuation does not contain); built once, not per call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + '¿¡')

## Stop-word sets already loaded from NLTK, keyed by language name.
_STOP_WORDS_CACHE = {}

def clean_text(text, stop_words=None, language='spanish'):
    """Lowercase a document, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. NaN or None from a missing
        cell) is treated as an empty document.
    stop_words : container of str, optional
        Words to remove; anything supporting ``in`` works. When omitted, the
        NLTK list for ``language`` is used and cached after the first call.
    language : str, default 'spanish'
        NLTK stop-word language used when ``stop_words`` is not given.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    ## Missing values arrive as float NaN / None and have no .lower()
    if not isinstance(text, str):
        return ''

    ## Load the default stop words only once per language
    if stop_words is None:
        if language not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
        stop_words = _STOP_WORDS_CACHE[language]

    ## Lowercase, then remove punctuation
    text = text.lower().translate(_PUNCT_TABLE)

    ## Remove stopwords
    return ' '.join(word for word in text.split() if word not in stop_words)

In [5]:
## Normalise every document with clean_text
df['text'] = df['text'].map(clean_text)

## Integer-encode the string labels: 'NONTOXIC' becomes 0 and 'TOXIC' becomes 1
le = LabelEncoder()
df['coder_1'] = le.fit_transform(df['coder_1'])

## Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['coder_1'],
    test_size=0.2,
    random_state=42,
)

In [6]:
## TF-IDF Vectorizer in Spanish
spanish_stop_words = stopwords.words('spanish')

## Settings shared by every vectorizer: at most 1000 features, Spanish stop
## words removed, unigrams and bigrams. Adjust max_features to your data size.
tfidf_params = dict(max_features=1000, stop_words=spanish_stop_words, ngram_range=(1, 2))
tfidf = TfidfVectorizer(**tfidf_params)

## Define models (standard values)

## Larger alphas (e.g., 10) will force the coefficients to shrink more, potentially simplifying the model.
## Alpha values between 0.1 and 10 are reasonable
ridge = Ridge(alpha=1.0)

## Larger alphas (e.g., 10 or higher) shrink more coefficients to exactly zero, leading to simpler models
## Start between 0.1 and 1.0
## NOTE(review): Lasso's alpha is not on the same scale as Ridge's; if the fitted
## model has no non-zero coefficients, try much smaller values (e.g. 1e-4 to 1e-2).
lasso = Lasso(alpha=0.1)

## Ridge
## Pipeline keeps the step objects it is given (it does not copy them), so this
## pipeline gets its own vectorizer. Sharing one instance would let each
## pipeline's fit overwrite the vocabulary the other one relies on.
ridge_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(**tfidf_params)),
    ('ridge', ridge)
])

## LASSO
## Uses the notebook-level `tfidf` object, so `tfidf` itself is fitted once
## this pipeline has been trained.
lasso_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('lasso', lasso)
])

In [7]:
## Train Ridge: fits the TF-IDF step and then the Ridge regressor on the training split
ridge_pipeline.fit(X_train, y_train)

In [8]:
## Train LASSO: fits the TF-IDF step and then the Lasso regressor on the training split
lasso_pipeline.fit(X_train, y_train)

In [9]:
## Predictions on the held-out test split.
## Both models are regressors, so these are real-valued scores, not 0/1 class labels.
ridge_pred = ridge_pipeline.predict(X_test)
lasso_pred = lasso_pipeline.predict(X_test)

In [10]:
## Evaluation metrics
def evaluate(y_true, y_pred):
    """Score regression predictions against the true targets.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mse, mae, r2)``: mean squared error, mean absolute error and
        coefficient of determination, in that order.
    """
    ## Order here defines the order of the returned tuple
    metric_fns = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y_true, y_pred) for metric in metric_fns)

In [11]:
## Get metrics for Ridge on the test split and print them on a single line
ridge_mse, ridge_mae, ridge_r2 = evaluate(y_test, ridge_pred)
print("Ridge Regression - MSE:", ridge_mse, "MAE:", ridge_mae, "R2:", ridge_r2)

Ridge Regression - MSE: 0.16603507083336652 MAE: 0.3567994413547472 R2: 0.33559395424823324


In [12]:
## Get metrics for LASSO on the test split and print them on a single line
## NOTE(review): an R2 at or just below zero means the model does no better than
## predicting the mean of y_test; if that is what is printed, check whether
## alpha has shrunk every coefficient to zero.
lasso_mse, lasso_mae, lasso_r2 = evaluate(y_test, lasso_pred)
print("Lasso Regression - MSE:", lasso_mse, "MAE:", lasso_mae, "R2:", lasso_r2)

Lasso Regression - MSE: 0.25079999999999997 MAE: 0.49920000000000003 R2: -0.0036014405762301305
