# Logistic Regression

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm
2025-04-30 16:52:31.691737: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data Preprocessing
Choose what you want to predict by setting `LABEL_TYPE` below:
- `KIND`: predict whether the prompt is human-written or AI-generated
- `GRADE`: predict the quality of the prompt (high quality means an average rating of at least 4)

In [3]:
# Target to model; must be either "KIND" or "GRADE".
LABEL_TYPE = "KIND"

# Read the ranked-prompts parquet file straight from the Hugging Face hub.
df = pd.read_parquet(
    path="hf://datasets/data-is-better-together/10k_prompts_ranked/data/train-00000-of-00001.parquet"
)


In [5]:
#df = df[df["num_responses"] > 1]

# Keep only rows with an agreement ratio above 0.4. The explicit .copy() makes
# the result an independent frame, so adding the label column below neither
# raises SettingWithCopyWarning nor risks writing to a temporary view.
df = df[df["agreement_ratio"] > 0.4].copy()

# Build the binary target with vectorized comparisons (no per-row lambda).
if LABEL_TYPE == 'KIND':
    # 1 when the prompt's kind is "human", 0 otherwise
    df["label_binary"] = (df["kind"] == "human").astype(int)
elif LABEL_TYPE == 'GRADE':
    # 1 when the average rating is at least 4, 0 otherwise (missing ratings -> 0)
    df["label_binary"] = (df["avg_rating"] >= 4).astype(int)
else:
    raise ValueError("Invalid LABEL_TYPE. Choose from 'KIND', 'GRADE'")

# Raw prompt text is the model input; the binary label is the target.
X = df["prompt"]
y = df["label_binary"]

## Tokenize Prompts
- Bag of words
- TF-IDF
- Transformer-Based Sentence Embedding

Choose the vectorization method by setting `TOKENIZER_TYPE` below.

In [6]:
TOKENIZER_TYPE = 'BOW'  # Choose from 'BOW', 'TF-IDF', 'TRANSFORMER'

# Always start from the raw text column rather than from `X`: this cell
# overwrites `X` with the vectorized features, so reading `X` as input would
# make a second run of the cell fail on already-vectorized data.
prompts = df["prompt"]

# NOTE: the vectorizer is fitted on every prompt, i.e. before the train/test
# split performed further down, so vocabulary / IDF statistics also reflect
# prompts that later end up in the test set.

# BAG OF WORDS
if TOKENIZER_TYPE == 'BOW':
    tokenizer = CountVectorizer()
    X = tokenizer.fit_transform(prompts)

# TF-IDF
elif TOKENIZER_TYPE == 'TF-IDF':
    tokenizer = TfidfVectorizer()
    X = tokenizer.fit_transform(prompts)

# TRANSFORMER-BASED SENTENCE EMBEDDING
elif TOKENIZER_TYPE == 'TRANSFORMER':
    tokenizer = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")  # Lightweight and fast
    X = tokenizer.encode(prompts.tolist())

else:
    raise ValueError("Invalid TOKENIZER_TYPE. Choose from 'BOW', 'TF-IDF', 'TRANSFORMER'")

## Train Model

In [9]:
# Split the data into training and test sets (80% / 20%, fixed seed so the
# split is reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Define the model hyperparameters.
# `multi_class` is deliberately not set: "auto" is already the default, and
# passing the argument explicitly is deprecated in recent scikit-learn
# releases, so omitting it keeps behaviour identical while avoiding the
# deprecation warning.
params = {
    "solver": "lbfgs",
    "max_iter": 500,
    "random_state": 8888,
}

# Train the model
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

## Evaluate Model

In [10]:
# Score the held-out split with the fitted classifier
y_pred = lr.predict(X_test)

# Precision, recall and F1 all use the same "weighted" averaging mode
averaging = {"average": "weighted"}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **averaging)
recall = recall_score(y_test, y_pred, **averaging)
f1 = f1_score(y_test, y_pred, **averaging)

# Report every metric, one per line, in a fixed order
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")

Accuracy: 0.8422636965683323
Precision: 0.8442012905625464
Recall: 0.8422636965683323
F1 Score: 0.8418065585085609


## Make prediction

- Edit `PROMPT` below to classify a prompt of your own.

In [11]:
PROMPT = """I love bananas, can you make a recipe out of it ?"""

# Reject unknown tokenizer settings up front
if TOKENIZER_TYPE not in ('BOW', 'TF-IDF', 'TRANSFORMER'):
    raise ValueError("Invalid TOKENIZER_TYPE. Choose from 'BOW', 'TF-IDF', 'TRANSFORMER'")

# Sentence-transformer models embed via .encode(); the scikit-learn
# vectorizers reuse their fitted state via .transform()
prompt_vector = (
    tokenizer.encode([PROMPT])
    if TOKENIZER_TYPE == 'TRANSFORMER'
    else tokenizer.transform([PROMPT])
)

print(f"\nPrompt: {PROMPT}")
# predict() returns one label per input row; only one prompt was passed
prediction = lr.predict(prompt_vector)
print(f"Prediction: {prediction[0]}")


Prompt: I love bananas, can you make a recipe out of it ?
Prediction: 1
