<a href="https://colab.research.google.com/github/ADPsmackskeys/stock-market-predictive-analysis/blob/main/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/ADPsmackskeys/stock-market-predictive-analysis.git

Cloning into 'stock-market-predictive-analysis'...
remote: Enumerating objects: 23675, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 23675 (delta 30), reused 45 (delta 21), pack-reused 23611 (from 2)[K
Receiving objects: 100% (23675/23675), 1.37 GiB | 5.46 MiB/s, done.
Resolving deltas: 100% (1140/1140), done.
Updating files: 100% (4262/4262), done.


In [None]:
from huggingface_hub import notebook_login
notebook_login()
# !hf auth login

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# --- 1. Install dependencies ---
!pip install transformers datasets torch scikit-learn

# --- 2. Imports ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import torch
import numpy as np
from torch.nn import CrossEntropyLoss




In [None]:
# --- 3. Load your CSV ---
csv_path = "/content/stock-market-predictive-analysis/data/news_sentiment/labeled_news.csv"  # upload to Colab
df = pd.read_csv(csv_path)

# Keep only needed columns
df = df[["News", "Label"]].dropna()

# Encode labels to integers
le = LabelEncoder()
df["Label_ID"] = le.fit_transform(df["Label"])  # e.g., Positive=2, Neutral=1, Negative=0
num_labels = len(le.classes_)

# Split into train, validation, test (stratified)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Label_ID"])
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df["Label_ID"])

# Rename the label column to 'labels' in the dataframes
train_df = train_df.rename(columns={"Label_ID": "labels"})
val_df = val_df.rename(columns={"Label_ID": "labels"})
test_df = test_df.rename(columns={"Label_ID": "labels"})


# --- 4. Convert to Hugging Face Dataset ---
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)
test_dataset  = Dataset.from_pandas(test_df)

# --- 5. Load tokenizer ---
model_name = "ProsusAI/finbert" # Use the correct model name string
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["News"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset   = val_dataset.map(tokenize, batched=True)
test_dataset  = test_dataset.map(tokenize, batched=True)

# Set format for PyTorch
columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=columns)
val_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)

# --- 6. Load model ---
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# --- 7. Compute class weights for imbalance ---
label_counts = train_df["labels"].value_counts().sort_index().values # Use 'labels' column from the renamed dataframe
total = label_counts.sum()
class_weights = [total/count for count in label_counts]  # inverse frequency
class_weights_tensor = torch.tensor(class_weights).float()
print(f"Class weights: {class_weights}")

# --- 8. Define custom Trainer with custom loss ---
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# --- 9. Training arguments with early stopping ---
# --- 9. Training arguments with early stopping ---
training_args = TrainingArguments(
    output_dir="/content/stock-market-predictive-analysis/models/unimodal",
    num_train_epochs=15,  # Increase epochs, EarlyStopping will handle the rest
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,          # <-- CHANGE: Increased learning rate
    logging_dir="/content/stock-market-predictive-analysis/models/unimodal/logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    warmup_steps=500,            # <-- ADD: Warmup steps
    weight_decay=0.01,           # <-- ADD: Weight decay
)
# --- 10. Metrics ---

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Calculate metrics
    accuracy = accuracy_score(labels, preds)
    # Use 'weighted' average to account for class imbalance
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# --- 11. Trainer with EarlyStopping ---
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]  # stop if val doesn't improve 2 epochs
)

# --- 12. Train ---
trainer.train()

# --- 13. Evaluate on test set ---
results = trainer.evaluate(test_dataset)
print("\nTest set accuracy:", results["eval_accuracy"])

# --- 14. Save fine-tuned model ---
trainer.save_model("/content/stock-market-predictive-analysis/models/unimodal")
tokenizer.save_pretrained("/content/stock-market-predictive-analysis/models/unimodal")

print("Fine-tuned model and tokenizer saved in /content/stock-market-predictive-analysis/models/unimodal")

Map:   0%|          | 0/680 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/190 [00:00<?, ? examples/s]

Class weights: [np.float64(1.8888888888888888), np.float64(13.076923076923077), np.float64(2.537313432835821)]


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,2.0346,1.816595,0.526316,0.508882,0.50984,0.526316
2,1.2726,1.318357,0.618421,0.57807,0.576128,0.618421
3,1.1381,1.095714,0.552632,0.557627,0.569512,0.552632
4,0.9342,1.312695,0.552632,0.478955,0.550239,0.552632
5,0.6632,1.462111,0.578947,0.566263,0.573652,0.578947
6,0.4976,1.76209,0.552632,0.5459,0.562608,0.552632


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Test set accuracy: 0.5
Fine-tuned model and tokenizer saved in /content/stock-market-predictive-analysis/models/unimodal


In [None]:
# --- 3. Load Data and CONVERT to Classification (with Manual Bins) ---
csv_path = "/content/stock-market-predictive-analysis/data/news_sentiment/labeled_news.csv"  # Upload the file from Step 1
df = pd.read_csv(csv_path)

# Keep necessary columns and drop any rows with missing scores
df_model = df[["News", "Trajectory_Score"]].dropna()

# --- THIS IS THE KEY CHANGE ---
# Define manual bins for Negative, Neutral, and Positive scores
bins = [-np.inf, -0.01, 0.01, np.inf]
labels = ["Negative", "Neutral", "Positive"]

# Use pd.cut to apply these manual bins
df_model['Label'] = pd.cut(df_model['Trajectory_Score'], bins=bins, labels=labels)

# --- Back to Classification Setup ---
# Drop any rows that couldn't be binned (shouldn't happen with this setup)
df_model.dropna(subset=['Label'], inplace=True)

# Encode the new string labels to integers (0, 1, 2)
le = LabelEncoder()
df_model["labels"] = le.fit_transform(df_model["Label"])
num_labels = len(le.classes_)

# Check the distribution (it will likely be imbalanced now, which is OK)
print("New Label Distribution (from Manual Bins):")
print(df_model['Label'].value_counts())

# Split data (stratified)
train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42, stratify=df_model["labels"])
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df["labels"])

print(f"Data loaded: {len(train_df)} train, {len(val_df)} val, {len(test_df)} test samples")

# --- 4. Convert to Hugging Face Dataset ---
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)
test_dataset  = Dataset.from_pandas(test_df)

# --- 5. Load tokenizer ---
model_name = "Vansh180/FinBERT-India-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["News"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset   = val_dataset.map(tokenize, batched=True)
test_dataset  = test_dataset.map(tokenize, batched=True)

# Set format for PyTorch
columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=columns)
val_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)

# --- 6. Load model for CLASSIFICATION ---
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels # num_labels is 3
    # We don't need 'ignore_mismatched_sizes' because FinBERT is already a 3-label classifier
)

# --- 7. Compute class weights (Good practice) ---
label_counts = train_df["labels"].value_counts().sort_index().values
total = label_counts.sum()
class_weights = [total/count for count in label_counts]
class_weights_tensor = torch.tensor(class_weights).float()
print(f"Class weights: {class_weights}")

# --- 8. Define custom Trainer with custom loss ---
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# --- 9. Define Metrics for Classification ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
        'precision': precision_score(labels, preds, average='weighted'),
        'recall': recall_score(labels, preds, average='weighted')
    }

# --- 10. Training arguments with early stopping ---
training_args = TrainingArguments(
    output_dir="/content/stock-market-predictive-analysis/models/unimodal/finbert_classification_model",
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    logging_dir="/content/stock-market-predictive-analysis/models/unimodal/logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",       # We want to maximize F1-score
    greater_is_better=True,           # Higher F1 is better
    warmup_steps=500,
    weight_decay=0.01,
)

# --- 11. Initialize Trainer ---
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

# --- 12. Train ---
print("--- Starting Model Training ---")
trainer.train()

# --- 13. Evaluate on test set ---
print("\n--- Final Evaluation on Test Set ---")
results = trainer.evaluate(test_dataset)
print(results)
print(f"\nTest Set F1-Score: {results['eval_f1']:.4f}")

# --- 14. Save fine-tuned model ---
trainer.save_model("/content/stock-market-predictive-analysis/models/unimodal/finbert_classification_model_final")
tokenizer.save_pretrained("/content/stock-market-predictive-analysis/models/unimodal/finbert_classification_model_final")

print("\nFine-tuned classification model and tokenizer saved.")

New Label Distribution (from Manual Bins):
Label
Negative    477
Positive    346
Neutral     121
Name: count, dtype: int64
Data loaded: 679 train, 76 val, 189 test samples


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Map:   0%|          | 0/679 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Class weights: [np.float64(1.9795918367346939), np.float64(7.804597701149425), np.float64(2.7269076305220885)]
--- Starting Model Training ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.6927,1.655709,0.315789,0.347982,0.388748,0.315789
2,1.273,1.249187,0.25,0.276038,0.378655,0.25
3,1.1129,1.420684,0.315789,0.31379,0.327646,0.315789
4,1.1532,1.571351,0.315789,0.317879,0.321247,0.315789
5,0.7497,1.625413,0.302632,0.313212,0.33981,0.302632



--- Final Evaluation on Test Set ---


{'eval_loss': 1.3906968832015991, 'eval_accuracy': 0.455026455026455, 'eval_f1': 0.46386509300858575, 'eval_precision': 0.49290184560511097, 'eval_recall': 0.455026455026455, 'eval_runtime': 1.3602, 'eval_samples_per_second': 138.953, 'eval_steps_per_second': 8.822, 'epoch': 5.0}

Test Set F1-Score: 0.4639

Fine-tuned classification model and tokenizer saved.


In [None]:
# --- 1. Install dependencies ---
!pip install transformers datasets torch scikit-learn

# --- 2. Imports ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, pipeline
import torch
import numpy as np
from torch.nn import CrossEntropyLoss
from tqdm.auto import tqdm

# ==============================================================================
# --- Part 1: Create 'Teacher' Labels with a Reliable Model ---
# ==============================================================================
print("--- Part 1: Generating Teacher Labels ---")

# Load your raw news data
csv_path = "/content/stock-market-predictive-analysis/data/news_sentiment/labeled_news.csv" # The file with your "News" column
df = pd.read_csv(csv_path)
df = df[["News"]].dropna().copy()

# Load the "teacher" model (a reliable sentiment classifier)
teacher_model_name = "ProsusAI/finbert"
sentiment_pipeline = pipeline("sentiment-analysis", model=teacher_model_name)

# Predict sentiment for each news headline
# This might take a few minutes
teacher_labels = []
for news_text in tqdm(df['News'], desc="Labeling with Teacher Model"):
    # The pipeline returns a list, we take the first result
    result = sentiment_pipeline(news_text)
    teacher_labels.append(result[0]['label'])

# Add these clean labels to our dataframe
df['teacher_label_str'] = teacher_labels

print("\nLabeling Complete. Sample of Teacher Labels:")
print(df.head())
print("\nTeacher Label Distribution:")
print(df['teacher_label_str'].value_counts())

# ==============================================================================
# --- Part 2: Train Your Indian Model (The 'Student') ---
# ==============================================================================
print("\n--- Part 2: Training the Student Model ---")

# --- Data Preparation ---
# Encode the teacher's string labels to integers
le = LabelEncoder()
df["labels"] = le.fit_transform(df["teacher_label_str"])
num_labels = len(le.classes_)

# Split data (stratified)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["labels"])
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df["labels"])

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)
test_dataset  = Dataset.from_pandas(test_df)

# --- Tokenizer & Model (Loading the 'Student') ---
student_model_name = "Vansh180/FinBERT-India-v1"
tokenizer = AutoTokenizer.from_pretrained(student_model_name)
model = AutoModelForSequenceClassification.from_pretrained(student_model_name, num_labels=num_labels)

def tokenize(batch):
    return tokenizer(batch["News"], padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset   = val_dataset.map(tokenize, batched=True)
test_dataset  = test_dataset.map(tokenize, batched=True)

columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=columns)
val_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)

# --- Standard Classification Training Setup ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {'f1': f1_score(labels, preds, average='weighted')}

training_args = TrainingArguments(
    output_dir="/content/student_model_test",
    num_train_epochs=5, # 5 epochs is enough for this test
    per_device_train_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# --- Train the 'Student' model ---
trainer.train()

# --- Final Evaluation ---
print("\n--- Final Evaluation of Student Model on Test Set ---")
results = trainer.evaluate(test_dataset)
print(f"\nFinal F1-Score: {results['eval_f1']:.4f}")

# Detailed report
print("\nClassification Report:")
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(test_dataset['labels'], preds, target_names=le.classes_))

--- Part 1: Generating Teacher Labels ---


Device set to use cuda:0


Labeling with Teacher Model:   0%|          | 0/944 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Labeling Complete. Sample of Teacher Labels:
                                                News teacher_label_str
0  After receiving the necessary regulatory appro...          positive
1  The company has received a notification of awa...          positive
2  The IT Department conducted a search at the of...           neutral
3  Rajeev Kumar Sinha has resigned as Global Chie...          negative
4  The company has changed its financial year fro...           neutral

Teacher Label Distribution:
teacher_label_str
positive    444
neutral     369
negative    131
Name: count, dtype: int64

--- Part 2: Training the Student Model ---


Map:   0%|          | 0/679 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,F1
1,No log,0.425435,0.810383
2,No log,0.431659,0.853118
3,No log,0.630603,0.826608
4,No log,0.330919,0.908698
5,No log,0.431646,0.86876



--- Final Evaluation of Student Model on Test Set ---



Final F1-Score: 0.8623

Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.92      0.89        26
     neutral       0.85      0.86      0.86        74
    positive       0.87      0.84      0.86        89

    accuracy                           0.86       189
   macro avg       0.86      0.88      0.87       189
weighted avg       0.86      0.86      0.86       189



In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import os

# ==============================================================================
# --- 1. CONFIGURE THE MODEL PATH ---
# ==============================================================================
# Please verify this path one last time in the file browser.
MODEL_PATH = "/content/student_model_test/checkpoint-425"

# ==============================================================================
# --- 2. LOAD COMPONENTS MANUALLY (More Robust Method) ---
# ==============================================================================
print("--- Starting Model Test ---")

# This print statement helps us see if there are hidden characters in your path
print(f"Verifying path: {repr(MODEL_PATH)}")

# Check if the path exists before we proceed
if not os.path.exists(MODEL_PATH):
    print(f"❌ FATAL ERROR: The directory '{MODEL_PATH}' does not exist.")
else:
    try:
        # Step A: Load the tokenizer from the path
        print("\nStep A: Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        print("✅ Tokenizer loaded successfully.")

        # Step B: Load the model from the path
        print("\nStep B: Loading model...")
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
        print("✅ Model loaded successfully.")

        # Step C: Create the pipeline using the loaded objects
        print("\nStep C: Creating pipeline...")
        sentiment_analyzer = pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=tokenizer
        )
        print("✅ Pipeline created successfully!")

        # --- Test the pipeline ---
        print("\n--- Testing on a sample headline ---")
        headline = "Infosys reports strong Q3 results, beating analyst expectations"
        prediction = sentiment_analyzer(headline)
        label = prediction[0]['label'].capitalize()
        score = prediction[0]['score']
        print(f"'{headline}'\n  -> Sentiment: {label} (Confidence: {score:.2%})")

    except Exception as e:
        print(f"❌ An error occurred. Please see details below:")
        print(e)

--- Starting Model Test ---
Verifying path: '/content/student_model_test/checkpoint-425'

Step A: Loading tokenizer...
❌ An error occurred. Please see details below:
stat: path should be string, bytes, os.PathLike or integer, not NoneType
