# Logistic Regression

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm
2025-05-07 10:13:42.615541: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'train.csv', 'validation': 'dev.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/ibm-research/argument_quality_ranking_30k/" + splits["train"])

In [49]:
#df = df[df["num_responses"] > 1]
PADDING_TOPIC = 15
PADDING_ARG = 100

REGRESSION = True
if REGRESSION:
    df["label_binary"] = df["MACE-P"]
else:
    df["label_binary"] = df["MACE-P"].apply(lambda x: 1 if x > 0.5 else 0)

y = df["label_binary"]
X = []
for i in range(len(df)):
    topic = df["topic"][i] + " <PAD>" * (PADDING_TOPIC - len(df["topic"][i].split()))
    arg = df["argument"][i] + " <PAD>" * (PADDING_ARG - len(df["argument"][i].split()))
    X.append(topic + " " + arg)
X = pd.Series(X)

## Tokenize Prompts
- Bag of words
- TF-IDF
- Transformer-Based Sentence Embedding

Choose the type of tokenizer you want

In [50]:
TOKENIZER_TYPE = 'BOW'  # Choose from 'BOW', 'TF-IDF', 'TRANSFORMER'

# BAG OF WORDS
if TOKENIZER_TYPE == 'BOW':
    tokenizer = CountVectorizer()
    X = tokenizer.fit_transform(X)

# TF-IDF
elif TOKENIZER_TYPE == 'TF-IDF':
    tokenizer = TfidfVectorizer()
    X = tokenizer.fit_transform(X)

# TRANSFORMER-BASED SENTENCE EMBEDDING
elif TOKENIZER_TYPE == 'TRANSFORMER':
    tokenizer = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")  # Lightweight and fast
    X = tokenizer.encode(X.tolist())

else:
    raise ValueError("Invalid TOKENIZER_TYPE. Choose from 'BOW', 'TF-IDF', 'TRANSFORMER'")

## Train Model

In [52]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

if REGRESSION:
    from sklearn.linear_model import LinearRegression

    # Train a regression model
    reg = LinearRegression()
    reg.fit(X_train, y_train)
else:
    # Define the model hyperparameters
    params = {
        "solver": "lbfgs",
        "max_iter": 500,
        "multi_class": "auto",
        "random_state": 8888,
    }

    # Train the model
    lr = LogisticRegression(**params)
    lr.fit(X_train, y_train)

## Evaluate Model

In [None]:
if REGRESSION:
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    # Predict on the test set
    y_pred = reg.predict(X_test)

    # Calculate regression metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Mean Absolute Error (MAE): {mae}")
else:
    # Predict on the test set
    y_pred = lr.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

Mean Squared Error (MSE): 0.24086942722015747
Mean Absolute Error (MAE): 0.3825032522391897
R-squared (R2): -0.7788747705609178


## Make prediction

- Change the prompt as you want to evaluate it

In [40]:
def prepare_data(topic, arg):
    topic = topic + " <PAD>" * (PADDING_TOPIC - len(topic.split()))
    arg = arg + " <PAD>" * (PADDING_ARG - len(arg.split()))
    return topic + " " + arg
PROMPT = prepare_data("Assisted suicide should be a criminal offence","`people reach their limit when it comes to their quality of life and should be able to end their suffering. this can be done with little or no suffering by assistance and the person is able to say good bye.")
if TOKENIZER_TYPE == 'BOW' or TOKENIZER_TYPE == 'TF-IDF':
    prompt_vector = tokenizer.transform([PROMPT])
elif TOKENIZER_TYPE == 'TRANSFORMER':
    prompt_vector = tokenizer.encode([PROMPT])
else:
    raise ValueError("Invalid TOKENIZER_TYPE. Choose from 'BOW', 'TF-IDF', 'TRANSFORMER'")

print(f"\nPrompt: {PROMPT}")
prediction = lr.predict(prompt_vector)
print(f"Prediction: {prediction[0]}")


Prompt: Assisted suicide should be a criminal offence <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> `people reach their limit when it comes to their quality of life and should be able to end their suffering. this can be done with little or no suffering by assistance and the person is able to say good bye. <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Prediction: 1
