## Price prediction

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
import random
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# 0. Fix random seeds
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)

# 1. Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(1), :]

# 2. Transformer Model (7 prices + 3 sentiment probs)
class StockGenWithSentimentProbs(nn.Module):
    def __init__(self, input_dim=10):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, 64)
        self.positional_encoding = PositionalEncoding(d_model=64)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=64, nhead=4, dim_feedforward=128, dropout=0.3, activation='gelu'),
            num_layers=4
        )
        self.layer_norm = nn.LayerNorm(64)
        self.fc = nn.Linear(64, 1)

    def forward(self, x):
        x = self.input_proj(x)
        x = self.positional_encoding(x)
        x = x.permute(1, 0, 2)  # (seq_len, batch, dim)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # (batch, seq_len, dim)
        x = self.layer_norm(x)
        x = x.mean(dim=1)
        return self.fc(x).squeeze(-1)

# 3. Load and preprocess data
csv_path = "sentiment_data_news_2yr.csv"
df = pd.read_csv(csv_path)

# Global normalization of 'Close' prices
scaler = StandardScaler()
df['Close'] = scaler.fit_transform(df[['Close']])

# 4. Create lag features
for lag in range(1, 8):
    df[f'Close_t-{lag}'] = df['Close'].shift(lag)

df.dropna(inplace=True)

# 5. Define features
feature_columns = [f"Close_t-{i}" for i in range(1, 8)] + [
    "FinBERT_neutral", "FinBERT_positive", "FinBERT_negative"
]

# 6. Windowed sequence preparation (Predict the change, not the absolute price)
window_size = 5
features, targets = [], []

for i in range(len(df) - window_size):
    window_df = df.iloc[i:i+window_size]
    next_close = df.iloc[i+window_size]['Close']
    change_in_price = next_close - df.iloc[i+window_size-1]['Close']

    seq_features = [[row[col] for col in feature_columns] for _, row in window_df.iterrows()]
    features.append(seq_features)
    targets.append(change_in_price)

X = torch.tensor(np.array(features), dtype=torch.float32)  # shape: (N, 5, 10)
y = torch.tensor(targets, dtype=torch.float32)

# 7. Create DataLoaders
dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# 8. Model Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = StockGenWithSentimentProbs().to(device)
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3)

# Use Learning Rate Scheduler to adjust learning rate during training
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for price_seq, labels in train_loader:
        price_seq, labels = price_seq.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(price_seq)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_seq, val_labels in val_loader:
            val_seq, val_labels = val_seq.to(device), val_labels.to(device)
            val_outputs = model(val_seq)
            loss = criterion(val_outputs, val_labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

    # Update the learning rate
    scheduler.step()

# 9. Save model and global scaler
torch.save(model.state_dict(), "stock_model_with_probs.pth")
joblib.dump(scaler, "stock_global_scaler.pkl")
print("✅ Model and global scaler saved.")


Using device: cuda




Epoch 1/50 - Train Loss: 0.0120 - Val Loss: 0.0054
Epoch 2/50 - Train Loss: 0.0042 - Val Loss: 0.0051
Epoch 3/50 - Train Loss: 0.0038 - Val Loss: 0.0051
Epoch 4/50 - Train Loss: 0.0037 - Val Loss: 0.0052
Epoch 5/50 - Train Loss: 0.0036 - Val Loss: 0.0051
Epoch 6/50 - Train Loss: 0.0036 - Val Loss: 0.0050
Epoch 7/50 - Train Loss: 0.0035 - Val Loss: 0.0051
Epoch 8/50 - Train Loss: 0.0035 - Val Loss: 0.0050
Epoch 9/50 - Train Loss: 0.0036 - Val Loss: 0.0069
Epoch 10/50 - Train Loss: 0.0035 - Val Loss: 0.0053
Epoch 11/50 - Train Loss: 0.0034 - Val Loss: 0.0051
Epoch 12/50 - Train Loss: 0.0034 - Val Loss: 0.0050
Epoch 13/50 - Train Loss: 0.0034 - Val Loss: 0.0050
Epoch 14/50 - Train Loss: 0.0034 - Val Loss: 0.0050
Epoch 15/50 - Train Loss: 0.0034 - Val Loss: 0.0051
Epoch 16/50 - Train Loss: 0.0034 - Val Loss: 0.0050
Epoch 17/50 - Train Loss: 0.0034 - Val Loss: 0.0051
Epoch 18/50 - Train Loss: 0.0034 - Val Loss: 0.0051
Epoch 19/50 - Train Loss: 0.0034 - Val Loss: 0.0051
Epoch 20/50 - Train L

In [None]:
import torch
import pandas as pd
import joblib
import torch.nn as nn
import numpy as np

# 1. Define model components
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(1), :]

class StockGenWithSentimentProbs(nn.Module):
    def __init__(self, input_dim=10, num_layers=4):  # Allow num_layers to be set as a parameter
        super().__init__()
        self.input_proj = nn.Linear(input_dim, 64)
        self.positional_encoding = PositionalEncoding(d_model=64)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=64, nhead=4, dim_feedforward=128, dropout=0.1, activation='gelu'),
            num_layers=num_layers  # Set number of layers dynamically
        )
        self.layer_norm = nn.LayerNorm(64)
        self.fc = nn.Linear(64, 1)

    def forward(self, x):
        x = self.input_proj(x)
        x = self.positional_encoding(x)
        x = x.permute(1, 0, 2)  # (seq_len, batch, dim)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)
        x = self.layer_norm(x)
        x = x.mean(dim=1)
        return self.fc(x).squeeze(-1)

# 2. Load CSV and original unscaled Close prices
test_data = pd.read_csv('sentiment_data_news_ctsh.csv')
ticker = test_data['Ticker'].iloc[0]
original_close = test_data['Close'].copy().values

# 3. Load global scaler
scaler = joblib.load("stock_global_scaler.pkl")

# 4. Define input feature columns
feature_columns = [f"Close_t-{i}" for i in range(1, 8)] + [
    "FinBERT_neutral", "FinBERT_positive", "FinBERT_negative"
]

# 5. Apply global scaling and create lag features
test_data["Close"] = scaler.transform(test_data[["Close"]])

for lag in range(1, 8):
    test_data[f'Close_t-{lag}'] = test_data['Close'].shift(lag)

test_data.dropna(inplace=True)
window_size = 5
input_sequences = []
targets = []

for i in range(len(test_data) - window_size):
    window = test_data.iloc[i:i+window_size]
    next_price = original_close[i + window_size]
    seq = [[row[col] for col in feature_columns] for _, row in window.iterrows()]
    input_sequences.append(seq)
    targets.append(next_price)

X = torch.tensor(np.array(input_sequences), dtype=torch.float32)

# 6. Load trained model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StockGenWithSentimentProbs(input_dim=10, num_layers=4).to(device)  # Use 4 layers
model.load_state_dict(torch.load("stock_model_with_probs.pth", map_location=device))
model.eval()

# 7. Predict delta_Close
with torch.no_grad():
    predicted_changes = model(X.to(device)).cpu().numpy()

# 8. Compute predicted Close using Close_t-1
predictions = []
for i in range(len(predicted_changes)):
    close_t_minus_1 = test_data.iloc[i + window_size - 1]["Close_t-1"]
    close_t_minus_1_unscaled = scaler.inverse_transform([[close_t_minus_1]])[0][0]
    predicted_price = close_t_minus_1_unscaled + predicted_changes[i]
    predictions.append(predicted_price)

predictions = np.array(predictions)
targets = np.array(targets)

# 9. Evaluation
mae = np.mean(np.abs(predictions - targets))
r2 = 1 - np.sum((targets - predictions) ** 2) / np.sum((targets - np.mean(targets)) ** 2)
accuracy = 1 - np.mean(np.abs((targets - predictions) / targets))

print(f"✅ MAE: {mae:.4f}")
print(f"✅ R-squared: {r2:.4f}")
print(f"✅ Accuracy: {accuracy * 100:.2f}%")

# 10. Sample predictions
print("\n🔍 Sample Predictions:")
for i in range(10):
    print(f"Actual: {targets[i]:.2f} | Predicted: {predictions[i]:.2f}")

# 11. Re-read original file to preserve original columns
original_df = pd.read_csv("sentiment_data_news_ctsh.csv")

# Align and add Predicted_Close column
start_idx = len(original_df) - len(predictions)
original_df = original_df.iloc[start_idx:].copy()
original_df['Predicted_Close'] = predictions

# 12. Add Predicted_Close to original_df and save to CSV
original_df['Predicted_Close'] = predictions

# Save to new CSV without trend classification
original_df.to_csv("sentiment_data_news_ctsh_predictions.csv", index=False)
print("\n📁 Saved as 'sentiment_data_news_ctsh_predictions.csv' with Predicted_Close column only.")

✅ MAE: 4.3061
✅ R-squared: -0.0123
✅ Accuracy: 94.85%

🔍 Sample Predictions:
Actual: 88.24 | Predicted: 87.74
Actual: 88.24 | Predicted: 85.92
Actual: 88.24 | Predicted: 85.80
Actual: 88.24 | Predicted: 86.91
Actual: 87.74 | Predicted: 84.36
Actual: 87.74 | Predicted: 84.36
Actual: 85.92 | Predicted: 84.36
Actual: 85.80 | Predicted: 84.36
Actual: 86.91 | Predicted: 84.36
Actual: 84.36 | Predicted: 73.60

📁 Saved as 'sentiment_data_news_ctsh_predictions.csv' with Predicted_Close column only.


  model.load_state_dict(torch.load("stock_model_with_probs.pth", map_location=device))


## Trend

In [None]:
import pandas as pd

# Load the existing predictions file
df = pd.read_csv("sentiment_data_news_ctsh_predictions.csv")

# Drop first row with NaN (from shift)
df = df.dropna().reset_index(drop=True)

# Set threshold percentage (e.g., 0.005 for 0.5%)
threshold_percentage = 0  # No threshold margin

# Containers for trend labels
actual_trends = []
predicted_trends = []

# Rule-based classification
for i in range(len(df)):
    close_t_minus_1 = df.loc[i, 'Close_t-1']
    actual_close = df.loc[i, 'Close']
    predicted_close = df.loc[i, 'Predicted_Close']

    delta_actual = actual_close - close_t_minus_1
    delta_predicted = predicted_close - close_t_minus_1
    threshold = threshold_percentage * close_t_minus_1

    # Actual trend
    if delta_actual > threshold:
        actual_trend = "Uptrend"
    elif delta_actual < -threshold:
        actual_trend = "Downtrend"
    else:
        actual_trend = "Sideways"

    # Predicted trend
    if delta_predicted > threshold:
        predicted_trend = "Uptrend"
    elif delta_predicted < -threshold:
        predicted_trend = "Downtrend"
    else:
        predicted_trend = "Sideways"

    actual_trends.append(actual_trend)
    predicted_trends.append(predicted_trend)

# Add the results
df['Trend'] = actual_trends
df['Predicted_Trend'] = predicted_trends

# Save back to same file
df.to_csv("sentiment_data_news_ctsh_predictions.csv", index=False)
print("✅ Trends updated in 'sentiment_data_news_ctsh_predictions.csv'")


✅ Trends updated in 'sentiment_data_news_ctsh_predictions.csv'


## Confidence Score

In [1]:
# Adding trend to the dataset
import pandas as pd

# Load the dataset
df = pd.read_csv("sentiment_data_news_2yr.csv")

# Apply rule-based trend classification using existing Close_t-1
def classify_trend(row):
    if row['Close'] > row['Close_t-1']:
        return "Uptrend"
    elif row['Close'] < row['Close_t-1']:
        return "Downtrend"
    else:
        return "Sideways"

df['Trend'] = df.apply(classify_trend, axis=1)

# Save it back to the same file
df.to_csv("sentiment_data_news_2yr.csv", index=False)
print("✅ Trend column added using existing 'Close_t-1'.")


✅ Trend column added using existing 'Close_t-1'.


In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv("sentiment_data_news_ctsh_predictions.csv")

# Confidence score based on how many past prices agree with the current trend
confidence_scores = []

# Iterate through rows
for i in range(len(df)):
    current_close = df.loc[i, "Close"]
    current_trend = df.loc[i, "Trend"]
    
    match_count = 0
    valid_days = 0

    # Loop over t-1 to t-7
    for t in range(1, 8):
        col_name = f"Close_t-{t}"
        if col_name in df.columns and pd.notna(df.loc[i, col_name]):
            past_close = df.loc[i, col_name]
            price_delta = current_close - past_close
            valid_days += 1

            if current_trend == "Uptrend" and price_delta > 0:
                match_count += 1
            elif current_trend == "Downtrend" and price_delta < 0:
                match_count += 1
            elif current_trend == "Sideways" and abs(price_delta) < 0.01:
                match_count += 1

    if valid_days > 0:
        confidence = round((match_count / valid_days) * 100, 2)
    else:
        confidence = None

    confidence_scores.append(confidence)

# Add confidence to DataFrame
df["Confidence"] = confidence_scores

# Drop any rows where confidence couldn't be computed
df = df.dropna(subset=["Confidence"]).reset_index(drop=True)

# Save
df.to_csv("sentiment_data_news_ctsh_predictions.csv", index=False)
print("✅ Confidence column (based on historical trend agreement) added.")


✅ Confidence column (based on historical trend agreement) added.


In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv("sentiment_data_news_2yr.csv")

# Round confidence values to standardize (optional, if needed)
# df["Confidence"] = df["Confidence"].round(2)

# Get unique values and counts
value_counts = df["Confidence"].value_counts().sort_index()

# Print each unique confidence value and its count
for confidence, count in value_counts.items():
    print(f"Confidence: {confidence} → {count} rows")


Confidence: 14.29 → 4511 rows
Confidence: 28.57 → 3295 rows
Confidence: 42.86 → 2922 rows
Confidence: 57.14 → 3191 rows
Confidence: 71.43 → 3502 rows
Confidence: 85.71 → 4127 rows
Confidence: 100.0 → 17887 rows


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
import pandas as pd
import gc

# Check if CUDA is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
df = pd.read_csv("sentiment_data_news_2yr.csv")

# Optional: use a subset for faster prototyping
# df = df.sample(1000, random_state=42).reset_index(drop=True)

# Prepare text and labels
input_texts = []
labels = []

for _, row in df.iterrows():
    trend = row["Trend"]
    close_t_1_to_7 = [row[f"Close_t-{t}"] for t in range(1, 8)]
    close = row["Close"]
    confidence = row["Confidence"]

    input_text = (
        f"As a financial expert, you are tasked with predicting the confidence level of a stock trend continuing.\n"
        f"Trend: {trend}\n"
        f"Past 7-Day Closing Prices: {close_t_1_to_7}\n"
        f"Current Close: {close}\n"
        f"Please output the confidence score."
    )
    
    input_texts.append(input_text)
    labels.append(str(confidence))  # Use string so tokenizer can handle it

# Tokenizer and model (using CUDA if available)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small").to(device)

# Tokenize input and labels
encodings = tokenizer(input_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
label_encodings = tokenizer(labels, padding=True, truncation=True, max_length=16, return_tensors="pt")

# Custom Dataset
class ConfidenceDataset(Dataset):
    def __init__(self, input_encodings, label_encodings):
        self.input_ids = input_encodings["input_ids"]
        self.attention_mask = input_encodings["attention_mask"]
        self.labels = label_encodings["input_ids"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
        }

dataset = ConfidenceDataset(encodings, label_encodings)

# Hyperparameters
epochs = 3
batch_size = 4  # You can increase this if you have enough GPU memory
lr = 5e-5

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# PEFT configuration for LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Initialize PEFT model
model = get_peft_model(base_model, lora_config).to(device)

# Optimizer setup
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1} | Avg Loss: {avg_loss:.4f}")
    gc.collect()

# Save the fine-tuned model
model.save_pretrained("./finetuned_lora_model")
tokenizer.save_pretrained("./finetuned_lora_model")
print("✅ Model fine-tuned with LoRA/PEFT and saved to './finetuned_lora_model'")


Using device: cuda




Epoch 1 | Avg Loss: 1.7930
Epoch 2 | Avg Loss: 0.9936
Epoch 3 | Avg Loss: 0.9355
✅ Model fine-tuned with LoRA/PEFT and saved to './finetuned_lora_model'


In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel, PeftConfig

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and base model
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Load PEFT configuration and model
peft_model = PeftModel.from_pretrained(base_model, "./finetuned_lora_model_confidence")
peft_model = peft_model.to(device)
peft_model.eval()

# Load your new inference dataset
df_infer = pd.read_csv("sentiment_data_news_ctsh_predictions.csv")

# Generate inputs (same format as training)
input_texts = []
for _, row in df_infer.iterrows():
    trend = row["Trend"]
    close_t_1_to_7 = [row[f"Close_t-{t}"] for t in range(1, 8)]
    close = row["Close"]

    input_text = (
        f"As a financial expert, you are tasked with predicting the confidence level of a stock trend continuing.\n"
        f"Trend: {trend}\n"
        f"Past 7-Day Closing Prices: {close_t_1_to_7}\n"
        f"Current Close: {close}\n"
        f"Please output the confidence score."
    )
    input_texts.append(input_text)

# Tokenize inputs
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

# Perform inference
with torch.no_grad():
    generated_ids = peft_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=16,
        temperature=1,    
        top_p=0.9,          
        do_sample=True     
    )

    preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Add predictions to the dataframe
df_infer["Predicted_Confidence"] = preds

# Optional: save results
df_infer.to_csv("sentiment_data_news_ctsh_predictions.csv", index=False)
print("✅ Inference complete. Predictions saved to 'sentiment_data_news_ctsh_predictions.csv'")

Using device: cuda




✅ Inference complete. Predictions saved to 'sentiment_data_news_ctsh_predictions.csv'


## Volatility

In [5]:
import pandas as pd

# Load dataset
df = pd.read_csv("sentiment_data_news_ctsh_predictions.csv")

# Compute volatility: max(close_t-1 to t-7) - min(close_t-1 to t-7)
def compute_volatility(row):
    try:
        close_prices = [row[f"Close_t-{t}"] for t in range(1, 8)]
        return max(close_prices) - min(close_prices)
    except:
        return 0.0  # Default to 0 if any error

# Apply to dataframe
df["Volatility"] = df.apply(compute_volatility, axis=1)

# Save updated dataset
df.to_csv("sentiment_data_news_ctsh_predictions.csv", index=False)
print("✅ Volatility column added and saved to 'sentiment_data_news_ctsh_predictions.csv'")


✅ Volatility column added and saved to 'sentiment_data_news_ctsh_predictions.csv'


## Recommendation

In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("sentiment_data_news_ctsh_predictions.csv")

# Define the rule-based recommendation logic for short-term (1-day) prediction
def short_term_recommendation(actual_trend, close_prices):
    recent_trend = close_prices[0] - close_prices[-1]  # Momentum from t-7 to t-1
    # Rule 1: Uptrend and increasing momentum → Buy
    if actual_trend == "Uptrend" and recent_trend > 0:
        return "Buy"

    # Rule 2: Downtrend and decreasing momentum → Sell
    if actual_trend == "Downtrend" and recent_trend < 0:
        return "Sell"

    # Rule 3: Sideways trend or low momentum → Hold
    return "Hold"

# Apply to DataFrame
def generate_recommendations(df):
    recs = []
    for _, row in df.iterrows():
        try:
            close_prices = [row[f"Close_t-{t}"] for t in range(1, 8)]
            actual_trend = row["Trend"]  # Use the actual trend for recommendation
            recs.append(short_term_recommendation(actual_trend, close_prices))
        except Exception as e:
            recs.append("Hold")
    return recs

# Apply the logic
df["Recommendation"] = generate_recommendations(df)

# Save updated DataFrame
df.to_csv("sentiment_data_news_ctsh_predictions.csv", index=False)
print("✅ Updated short-term recommendations saved to 'sentiment_data_news_ctsh_predictions.csv'")


✅ Updated short-term recommendations saved to 'sentiment_data_news_ctsh_predictions.csv'


In [3]:
import pandas as pd

# Load the updated DataFrame
df = pd.read_csv("sentiment_data_news_ctsh_predictions.csv")

# Get counts of each recommendation
recommendation_counts = df["Recommendation"].value_counts()

# Print the results
print("📊 Recommendation Counts:")
for label, count in recommendation_counts.items():
    print(f"{label}: {count} rows")


📊 Recommendation Counts:
Hold: 17 rows
Buy: 2 rows


In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import Dataset, DataLoader
import gc

# Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load and preprocess data
df = pd.read_csv("sentiment_data_news_2yr.csv")

# Create input-output pairs
input_texts = []
labels = []

for _, row in df.iterrows():
    trend = row["Trend"]
    confidence = row["Confidence"]
    volatility = row["Volatility"]
    close_t_1_to_7 = [row[f"Close_t-{t}"] for t in range(1, 8)]
    close = row["Close"]
    recommendation = row["Recommendation"]

    input_text = (
        f"You are a financial assistant. Based on the data, make a one-day stock recommendation.\n"
        f"Trend: {trend}\n"
        f"Confidence: {confidence:.2f}\n"
        f"Volatility: {volatility:.2f}\n"
        f"Past 7-Day Closing Prices: {close_t_1_to_7}\n"
        f"Current Close: {close}\n"
        f"Recommendation (Buy, Sell, Hold):"
    )

    input_texts.append(input_text)
    labels.append(recommendation)

# Load tokenizer and base model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small").to(device)

# Tokenize
input_encodings = tokenizer(input_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
label_encodings = tokenizer(labels, padding=True, truncation=True, max_length=8, return_tensors="pt")

# Dataset
class RecommendationDataset(Dataset):
    def __init__(self, input_encodings, label_encodings):
        self.input_ids = input_encodings["input_ids"]
        self.attention_mask = input_encodings["attention_mask"]
        self.labels = label_encodings["input_ids"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

dataset = RecommendationDataset(input_encodings, label_encodings)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# LoRA Configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    task_type=TaskType.SEQ_2_SEQ_LM,
    lora_dropout=0.1
)

# Apply LoRA to the model
model = get_peft_model(base_model, peft_config).to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training Loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1} | Avg Loss: {avg_loss:.4f}")
    gc.collect()

# Save fine-tuned model
model.save_pretrained("./finetuned_recommendation_model")
tokenizer.save_pretrained("./finetuned_recommendation_model")
print("✅ Model fine-tuned and saved to './finetuned_recommendation_model'")

Using device: cuda




Epoch 1 | Avg Loss: 0.5093
Epoch 2 | Avg Loss: 0.2489
Epoch 3 | Avg Loss: 0.2362
✅ Model fine-tuned and saved to './finetuned_recommendation_model'


In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the saved fine-tuned model and tokenizer
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
peft_model = PeftModel.from_pretrained(base_model, "./finetuned_recommendation_model")
peft_model = peft_model.to(device)
peft_model.eval()

# Load inference dataset
df_infer = pd.read_csv("sentiment_data_news_ctsh_predictions.csv")

# Generate input prompts
input_texts = []
for _, row in df_infer.iterrows():
    trend = row["Predicted_Trend"]
    confidence = row["Predicted_Confidence"]
    volatility = row["Volatility"]
    close_t_1_to_7 = [row[f"Close_t-{t}"] for t in range(1, 8)]
    close = row["Predicted_Close"]

    input_text = (
        f"You are a financial assistant. Based on the data, make a one-day stock recommendation.\n"
        f"Trend: {trend}\n"
        f"Confidence: {confidence:.2f}\n"
        f"Volatility: {volatility:.2f}\n"
        f"Past 7-Day Closing Prices: {close_t_1_to_7}\n"
        f"Current Close: {close}\n"
        f"Recommendation (Buy, Sell, Hold):"
    )
    input_texts.append(input_text)

# Tokenize inputs
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Run inference
with torch.no_grad():
    outputs = peft_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=8
    )

# Decode predictions
predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Add predictions to DataFrame
df_infer["Predicted_Recommendation"] = predictions

# Save results to CSV
df_infer.to_csv("sentiment_data_news_ctsh_predictions.csv", index=False)
print("✅ Inference complete. Predictions added to 'Predicted_Recommendation' column.")


Using device: cuda




✅ Inference complete. Predictions added to 'Predicted_Recommendation' column.
