In [5]:
# Import
import pandas as pd
import re
import json
from sklearn.model_selection import train_test_split

# Load data
# Location of the raw CSV, relative to the notebook's working directory.
RAW_CSV_PATH = "../data/data.csv"
df = pd.read_csv(RAW_CSV_PATH)



In [7]:
# Report the (rows, columns) of the frame as loaded, then show its first rows.
raw_shape = df.shape
print("Raw shape: ", raw_shape)
first_rows = df.head()
print(first_rows)

Raw shape:  (5842, 2)
                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral


In [None]:
# Data cleaning and sanity checks

# For safety I'll keep only allowed labels
allowed_labels = ["positive", "negative", "neutral"]

# Build the cleaned frame in one chain so that every step returns a new
# object. Assigning a column onto the result of a row filter (as dropna
# produces when it actually removes rows) can raise SettingWithCopyWarning;
# .assign works on its own copy and avoids that.
df = (
    df[["Sentence", "Sentiment"]]
    # Drop rows with missing values
    .dropna(subset=["Sentence", "Sentiment"])
    # Normalize sentiment labels (strip spaces, lowercase)
    .assign(
        Sentiment=lambda frame: frame["Sentiment"].astype(str).str.strip().str.lower()
    )
    # Keep only rows whose normalized label is one of the allowed values
    .loc[lambda frame: frame["Sentiment"].isin(allowed_labels)]
    .copy()
)

print("\nClass distribution after cleaning:")
print(df["Sentiment"].value_counts())


Class distribution after cleaning:
Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64


In [10]:
# Text cleaning function

# A URL is an explicit http:// or https:// scheme (any letter case) followed
# by everything up to the next whitespace. Requiring "://" keeps ordinary
# tokens that merely start with "http" (e.g. "https", "httpd") in the text.
_URL_RE = re.compile(r"https?://\S+", re.IGNORECASE)

# Any run of spaces, tabs or newlines.
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Return ``text`` with URLs removed and whitespace normalized.

    Args:
        text: The value to clean. It is converted with ``str()`` first, so
            non-string values are accepted and cleaned as their string form.

    Returns:
        The text with every ``http://`` / ``https://`` URL deleted, each run
        of whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    text = str(text)

    # Remove URLs
    text = _URL_RE.sub("", text)

    # Collapse whitespace after URL removal so the gap a URL leaves behind
    # does not survive as a double space.
    return _WHITESPACE_RE.sub(" ", text).strip()


# Clean every sentence; the column is coerced to str before the function runs.
sentences_as_str = df["Sentence"].astype(str)
df["Sentence"] = sentences_as_str.apply(clean_text)
    

In [11]:
# Show the first five rows after text cleaning.
df.head(n=5)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [15]:
# Train/validation split

# Hold out 10% of the cleaned rows for validation. Stratifying on the label
# column keeps the class proportions the same in both splits, and the fixed
# random_state makes the split reproducible from run to run.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["Sentiment"],
)

print("\nTrain size:", len(train_df))
print("Validaton size:", len(val_df))


Train size: 5257
Validaton size: 585


In [23]:
# Convert to instruction style format for Llama 3.2 QLoRA with Unsloth
def format_row(row):
    """Build one instruction/input/output record from a labelled row.

    ``row`` is indexed with the keys "Sentence" and "Sentiment". The sentence
    is passed through unchanged as the input; the label is stripped and
    re-cased so that only its first letter is upper case.
    """
    sentence = row["Sentence"]
    label = row["Sentiment"].strip().lower().capitalize()

    # The fixed prompt is assembled from its three sentences.
    prompt = "".join(
        [
            "You are a financial sentiment analyst. ",
            "Given the financial text, reply with exactly one word: ",
            "Positive, Negative, or Neutral.",
        ]
    )

    return dict(instruction=prompt, input=sentence, output=label)


def _format_frame(frame):
    """Return format_row's result for every row of ``frame``, in row order."""
    formatted = []
    for _, frame_row in frame.iterrows():
        formatted.append(format_row(frame_row))
    return formatted


train_data = _format_frame(train_df)
val_data = _format_frame(val_df)

# Pretty-print the first training record as a sanity check of the format.
print("\nExample formatted entry:\n")
example_json = json.dumps(train_data[0], indent=4, ensure_ascii=False)
print(example_json)



Example formatted entry:

{
    "instruction": "You are a financial sentiment analyst. Given the financial text, reply with exactly one word: Positive, Negative, or Neutral.",
    "input": "Equity indexes working on a positive engulfing for March. $SML $RUT $IWC $SMH $SOX Small-caps / semis with a fakeout before the b/out.",
    "output": "Positive"
}


In [24]:
# Save to JSONL files

def save_jsonl(path, data):
    """Write each item of ``data`` to ``path`` as one JSON document per line.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced. Non-ASCII characters are written as-is rather than
    as escape sequences.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )


# Write the training split first, then the validation split.
for _out_path, _records in (
    ("../data/train_dataset.jsonl", train_data),
    ("../data/val_dataset.jsonl", val_data),
):
    save_jsonl(_out_path, _records)

# One confirmation line per written file, under a header.
print("\nSaved files:", "  train_dataset.jsonl", "  val_dataset.jsonl", sep="\n")



Saved files:
  train_dataset.jsonl
  val_dataset.jsonl
