# Dataset Autoclassification

Description: This notebook performs sentiment analysis on a list of texts using three pre-trained neural-network-based models. The three sets of predictions are meant to be compared: samples on which all three models agree should be kept to train another pipeline.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm

# Batched sentiment prediction function
def predict_sentiment(texts, batch_size=32, *, tokenizer=None, model=None, device=None):
    """Predict a 1-5 sentiment label for each text, running the model in batches.

    Args:
        texts: The strings to classify. Any iterable of strings is accepted; it
            is materialised into a list so it can be sliced into batches. A
            single string is treated as one text.
        batch_size: Number of texts sent through the model per forward pass.
            Must be positive.
        tokenizer: Callable that encodes a batch of texts into a mapping of
            tensors. Defaults to the notebook-level ``tokenizer``.
        model: Callable classifier whose output exposes ``.logits``. Defaults
            to the notebook-level ``model``.
        device: Device the encoded tensors are moved to before the forward
            pass. Defaults to the notebook-level ``device``.

    Returns:
        A list with one integer per input text, in input order. Each value is
        the argmax class index plus one.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        NameError: If ``tokenizer``, ``model`` or ``device`` is neither passed
            in nor defined at notebook level.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently return no labels.
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    # Fall back to the notebook-level objects so calls that rely on the globals
    # set up by the model-loading cells keep working unchanged.
    components = {"tokenizer": tokenizer, "model": model, "device": device}
    notebook_scope = globals()
    for name in list(components):
        if components[name] is None:
            if name not in notebook_scope:
                raise NameError(
                    f"name '{name}' is not defined; pass {name}=... or define it before calling"
                )
            components[name] = notebook_scope[name]
    tokenizer = components["tokenizer"]
    model = components["model"]
    device = components["device"]

    # Slicing must yield plain lists of strings for the tokenizer, whatever
    # container the caller handed in.
    texts = [texts] if isinstance(texts, str) else list(texts)

    all_preds = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Predicting Sentiment"):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
        # Model outputs labels in 0–4 range, but we want 1–5 to match the dataset
        preds = (torch.argmax(outputs.logits, dim=-1) + 1).tolist()
        all_preds.extend(preds)
    return all_preds


ModuleNotFoundError: No module named 'transformers'

In [None]:
# Label the dataset with the tabularisai multilingual sentiment model.

# Source and destination JSONL files.
# NOTE(review): these are absolute /content paths, while the other labelling
# cells use ./ paths - confirm this is intended.
input_path = "/content/merged_output.jsonl"
output_path = "/content/labelled_output_tabularisai.jsonl"

# Run on the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the pre-trained tokenizer/classifier pair and move the model to `device`.
# predict_sentiment reads `tokenizer`, `model` and `device` from the notebook
# namespace, so these names must stay as they are.
model_name = "tabularisai/multilingual-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Read the line-delimited JSON input and attach a predicted label to each row
df = pd.read_json(input_path, lines=True)
df = df.assign(label=predict_sentiment(df["text"].tolist()))

# Write the labelled records as line-delimited JSON without ASCII-escaping the text
df.to_json(output_path, orient="records", lines=True, force_ascii=False)

print(f"\nLabeled dataset saved to {output_path}")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm

# Label the dataset with LiYuan's Amazon review sentiment model.

# Source and destination JSONL files
input_path = "./merged_output.jsonl"
output_path = "./labelled_output_liyuan.jsonl"

# Prefer CUDA when it is available; fall back to the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the checkpoint. predict_sentiment picks up `tokenizer`, `model` and
# `device` from the notebook namespace, so these names must not change.
model_name = "LiYuan/amazon-review-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Read the line-delimited JSON input, then add the predicted label column
df = pd.read_json(input_path, lines=True)
df = df.assign(label=predict_sentiment(df["text"].tolist()))

# Persist the labelled rows as line-delimited JSON without ASCII-escaping the text
df.to_json(output_path, orient="records", lines=True, force_ascii=False)

print(f"\nLabeled dataset saved to {output_path}")


tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

Predicting Sentiment:   0%|          | 0/250 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Predicting Sentiment:  66%|██████▌   | 165/250 [02:20<01:17,  1.10it/s]

In [None]:
# Label the dataset with the DataMonke BERT review-sentiment model.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single source of truth for the checkpoint id, so the tokenizer and the model
# can never be loaded from different checkpoints (same pattern as the other
# labelling cells).
model_name = "DataMonke/bert-base-uncased-finetuned-review-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Sample sentence; it is not used by the batch labelling below
text = "This book was good, but boring."

# Use GPU if available. `torch` and `pd` are not imported in this cell; they
# come from an earlier cell, which must have been run first.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load input JSONL file
input_path = "./merged_output.jsonl"
df = pd.read_json(input_path, lines=True)

# Predict and add label column (predict_sentiment reads the `tokenizer`,
# `model` and `device` set above from the notebook namespace)
df["label"] = predict_sentiment(df["text"].tolist())

# Save to new JSONL file
output_path = "./labelled_output_datamonke.jsonl"
df.to_json(output_path, orient="records", lines=True, force_ascii=False)

print(f"\nLabeled dataset saved to {output_path}")


Predicted sentiment: 2
