In [14]:
# ================================================
# 1. Install dependencies (run once if needed)
# ================================================
# %pip install transformers datasets scikit-learn pandas torch

In [15]:
# ================================================
# 2. Import libraries
# ================================================
import pandas as pd
import numpy as np
import torch
import random
import re
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    TrainerCallback
)
from datasets import Dataset

In [16]:
# Seed every random number generator used in this notebook (torch, numpy,
# stdlib random) with the same value so repeated runs are comparable.
seed = 42
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(seed)

In [17]:
# ================================================
# 3. Load the data
# ================================================
# Read the labelled training tweets and the unlabelled test tweets from the
# working directory, then print the first rows of each as a sanity check.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

for frame in (train_df, test_df):
    print(frame.head())

# ================================================
# 4. Define light preprocessing (safe for GPT-2)
# ================================================
def clean_text_gpt2(text):
    """Lightly normalise one tweet before it is placed in a GPT-2 prompt.

    Removes URLs (anything starting with ``http`` or ``www``), removes
    ``@mentions``, and collapses every run of whitespace to a single space.
    Case, punctuation, cashtags and hashtags are left untouched.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN, which is how pandas
            represents an empty CSV cell) are treated as an empty tweet.

    Returns:
        The cleaned text as a ``str`` (possibly empty).
    """
    # Without this guard str(nan) / str(None) would turn a missing tweet into
    # the literal words "nan" / "None" and feed them to the model as content.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean the raw tweet text of both frames in place.
train_df['text'] = train_df['text'].map(clean_text_gpt2)
test_df['text'] = test_df['text'].map(clean_text_gpt2)

# ================================================
# 5. Convert labels to text prompts
# ================================================
# Numeric class id -> the word the language model is expected to produce.
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Every example is wrapped in the same template: "Tweet: <text> Sentiment:"
PROMPT_PREFIX, PROMPT_SUFFIX = "Tweet: ", " Sentiment:"
for frame in (train_df, test_df):
    frame['input_text'] = PROMPT_PREFIX + frame['text'] + PROMPT_SUFFIX

# Only the training frame has labels, so only it gets a target column.
train_df['target_text'] = train_df['label'].map(label_map)

# ================================================
# 6. Train-validation split
# ================================================
# 80/20 split, stratified on the label column, with a fixed random state.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['label'],
    random_state=42,
)

# ================================================
# 7. Load tokenizer
# ================================================
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 workaround for padding: reuse the end-of-sequence token as pad token.
tokenizer.pad_token = tokenizer.eos_token

# ================================================
# 8. Tokenization function
# ================================================
def tokenize_function(examples, max_length=128):
    """Tokenize prompt + sentiment word as ONE causal-LM training sequence.

    A causal LM learns to continue a sequence, so the answer has to be part of
    ``input_ids``. Each example becomes::

        <prompt tokens> <" " + target tokens> <eos> <pad ...>

    Tokenizing the target into a separate ``labels`` list does not work here:
    ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds ``labels`` from
    ``input_ids`` at batch time, so a target that is not inside ``input_ids``
    is never trained on.

    Args:
        examples: A mapping with ``'input_text'`` and ``'target_text'``. Both
            may be lists (``Dataset.map(..., batched=True)``) or single strings.
        max_length: Fixed length every sequence is truncated / padded to.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of ``max_length`` ints per example (or a single such list when a
        single example was passed). ``labels`` is ``-100`` on prompt and padding
        positions so that a label-preserving collator would compute the loss on
        the sentiment word only.
    """
    prompts, targets = examples['input_text'], examples['target_text']
    unbatched = isinstance(prompts, str)
    if unbatched:
        prompts, targets = [prompts], [targets]

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    batch = {"input_ids": [], "attention_mask": [], "labels": []}

    for prompt, target in zip(prompts, targets):
        # Leading space: the answer is the word that follows "Sentiment:".
        target_ids = tokenizer(" " + str(target))["input_ids"] + [eos_id]
        # Truncate the prompt, never the target, so the answer always survives.
        prompt_budget = max(max_length - len(target_ids), 0)
        prompt_ids = tokenizer(prompt)["input_ids"][:prompt_budget]

        sequence = (prompt_ids + target_ids)[:max_length]
        n_real = len(sequence)
        n_pad = max_length - n_real

        labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]

        batch["input_ids"].append(sequence + [pad_id] * n_pad)
        batch["attention_mask"].append([1] * n_real + [0] * n_pad)
        batch["labels"].append(labels + [-100] * n_pad)

    if unbatched:
        return {key: rows[0] for key, rows in batch.items()}
    return batch

# Apply to train and val
def _to_tokenized_dataset(frame):
    """Wrap the prompt/target columns of `frame` in a Dataset and tokenize it in batches."""
    text_columns = ['input_text', 'target_text']
    return Dataset.from_pandas(frame[text_columns]).map(tokenize_function, batched=True)

# Apply to train and val
train_dataset = _to_tokenized_dataset(train_data)
val_dataset = _to_tokenized_dataset(val_data)

# ================================================
# 9. Load model (GPT-2 Causal LM)
# ================================================
model = AutoModelForCausalLM.from_pretrained(model_name)
# Keep the embedding matrix in sync with the tokenizer's vocabulary size and
# tell the model which id is used for padding.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)
model.config.pad_token_id = tokenizer.pad_token_id

# ================================================
# 10. Define Data Collator for LM
# ================================================
# mlm=False selects the causal (next-token) objective rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

                                                text  label
0  $BYND - JPMorgan reels in expectations on Beyo...      0
1  $CCL $RCL - Nomura points to bookings weakness...      0
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...      0
3  $ESS: BTIG Research cuts to Neutral https://t....      0
4  $FNKO - Funko slides after Piper Jaffray PT cu...      0
   id                                               text
0   0  ETF assets to surge tenfold in 10 years to $50...
1   1  Here’s What Hedge Funds Think Evolution Petrol...
2   2  $PVH - Phillips-Van Heusen Q3 2020 Earnings Pr...
3   3  China is in the process of waiving retaliatory...
4   4  Highlight: “When growth is scarce, investors s...


Map:   0%|          | 0/7634 [00:00<?, ? examples/s]



Map:   0%|          | 0/1909 [00:00<?, ? examples/s]



In [None]:
# ================================================
# 11. Define Trainer and Training Arguments
# ================================================
# Hyperparameters grouped by concern, then merged into one TrainingArguments.
optimization_kwargs = dict(
    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_steps=100,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Evaluate and checkpoint once per epoch; keep the checkpoint with the lowest
# validation loss and reload it when training finishes.
checkpoint_kwargs = dict(
    output_dir="./results_gpt2_causal",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Log every 10 steps to a local directory; no external experiment tracker.
logging_kwargs = dict(
    logging_dir="./logs_gpt2_causal",
    logging_steps=10,
    report_to="none",
)

training_args = TrainingArguments(
    **checkpoint_kwargs,
    **optimization_kwargs,
    **logging_kwargs,
)

# Simple logger callback (optional, tracks losses)
class LossLogger(TrainerCallback):
    """Trainer callback that collects the validation loss of every evaluation."""

    def __init__(self):
        # One entry per evaluation whose metrics contained "eval_loss",
        # in the order the evaluations happened.
        self.eval_losses = []

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record ``metrics["eval_loss"]`` if the evaluation reported one.

        ``metrics`` defaults to ``None`` and the loss key can be missing (for
        example when evaluation is run with a different metric prefix), so the
        value is looked up defensively instead of raising and aborting the run.
        """
        eval_loss = (metrics or {}).get("eval_loss")
        if eval_loss is not None:
            self.eval_losses.append(eval_loss)

# Callback instance is kept in a variable so its eval_losses can be inspected
# after training.
logger = LossLogger()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    # `tokenizer=` is deprecated in Trainer.__init__ and emits a FutureWarning;
    # `processing_class=` is its replacement (requires a transformers release
    # that already has it, which is the case wherever that warning is shown).
    processing_class=tokenizer,
    # Stop when eval_loss has not improved for 2 consecutive evaluations.
    callbacks=[logger, EarlyStoppingCallback(early_stopping_patience=2)]
)

# ================================================
# 12. Train model
# ================================================
trainer.train()

  trainer = Trainer(


In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Both .to() and .eval() return the module itself. Assigning the result keeps
# the cell from printing the entire layer tree as its output, and .eval()
# switches off dropout so the generation that follows is deterministic.
model = model.to(device).eval()

In [None]:
# ================================================
# 13. Inference on test set
# ================================================
# Prepare test set for generation:
test_inputs = test_df['input_text'].tolist()

# Number of prompts generated per forward pass; keeps memory bounded instead
# of pushing the whole test set through the model at once.
GEN_BATCH_SIZE = 32

model.eval()  # no dropout while generating

# Decoder-only models must be LEFT-padded for batched generation: with right
# padding the model would continue after the pad tokens, and slicing off
# "everything after the prompt" would not return the real continuation.
original_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"

predictions = []
try:
    for start in range(0, len(test_inputs), GEN_BATCH_SIZE):
        batch_texts = test_inputs[start:start + GEN_BATCH_SIZE]
        # Inputs have to live on the same device as the model weights.
        test_encodings = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        ).to(model.device)

        # Generate predictions (greedy decoding, a handful of tokens is enough
        # for a one-word answer)
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=test_encodings['input_ids'],
                attention_mask=test_encodings['attention_mask'],
                max_new_tokens=5,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Decode only the newly generated tokens; with left padding every row's
        # prompt ends at the same column.
        prompt_len = test_encodings['input_ids'].shape[1]
        predictions.extend(
            tokenizer.batch_decode(generated_ids[:, prompt_len:], skip_special_tokens=True)
        )
finally:
    # Leave the tokenizer the way it was found, even if generation fails.
    tokenizer.padding_side = original_padding_side

# Map back to numeric labels
reverse_label_map = {'negative': 0, 'neutral': 1, 'positive': 2}


def _first_label_word(generated_text):
    """Return the first generated word, lower-cased and stripped of non-word
    characters, or '' when nothing (or only whitespace) was generated."""
    words = generated_text.strip().split()
    if not words:
        return ''
    return re.sub(r'\W+', '', words[0].lower())


# Apply simple cleaning to predictions
predictions_clean = [_first_label_word(pred) for pred in predictions]
pred_labels = [reverse_label_map.get(pred, 1) for pred in predictions_clean]  # default to neutral if unknown

assert len(pred_labels) == len(test_df), "expected exactly one prediction per test row"


# ================================================
# 14. Save submission file
# ================================================
test_df['label'] = pred_labels
test_df[['id','label']].to_csv("pred_decoder_causal.csv", index=False)
print("Prediction file saved as pred_decoder_causal.csv")