In [1]:
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# =============================
# 1. Imports
# =============================
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# =============================
# 2. Load your data
# =============================
# Expected CSV columns: Date, Label, Top1..Top25, plus the numeric target
# column used further down. Point DATA_PATH at your own file.
DATA_PATH = "Data.csv"
df = pd.read_csv(DATA_PATH, parse_dates=["Date"])

# The train/test split later in this notebook is positional (shuffle=False),
# so rows must be in chronological order for it to be a real time-based split.
# A stable sort keeps the original order of rows that share a date.
df = df.sort_values("Date", kind="stable").reset_index(drop=True)

# =============================
# 3. Combine Top1-Top25 into single daily text
# =============================
# Prefer selecting the headline columns by name; fall back to the original
# positional layout (columns 2..26) if the file uses different names.
headline_cols = [f"Top{i}" for i in range(1, 26)]
if not set(headline_cols).issubset(df.columns):
    headline_cols = list(df.columns[2:27])

# Skip missing headlines instead of stringifying them: `astype(str)` would turn
# NaN into the literal token "nan", which then pollutes both the sentiment
# score and the TF-IDF vocabulary.
df["Combined_Headlines"] = df[headline_cols].apply(
    lambda row: " ".join(str(headline) for headline in row if pd.notna(headline)),
    axis=1,
)

# =============================
# 4. Sentiment Scoring (Lexicon-Based: VADER)
# =============================
# One compound polarity score per day, computed on the concatenated headlines.
# NOTE(review): VADER is tuned for short texts; scoring 25 joined headlines at
# once may push the compound score towards the extremes. Consider averaging
# per-headline scores instead - confirm against the data before changing.
analyser = SentimentIntensityAnalyzer()
df["SentimentScore"] = df["Combined_Headlines"].map(
    lambda text: analyser.polarity_scores(text)["compound"]
)

# =============================
# 5. TF-IDF Vectorization
# =============================
# Tunable settings for the modelling stage.
MAX_TFIDF_FEATURES = 5000
TEST_SIZE = 0.2
SEED = 42
TARGET_COL = "Sentiment"  # replace with your numeric outcome, e.g. "MarketReturn"

# Number of rows in the chronological training prefix. It is derived with the
# same train_test_split call used in section 7, so both always agree on where
# the train/test boundary lies.
n_train = len(
    train_test_split(np.arange(len(df)), test_size=TEST_SIZE, shuffle=False)[0]
)

# Learn the vocabulary and IDF weights from the training rows only. Fitting on
# every row would leak test-period text statistics into the features and make
# the evaluation optimistic. All rows are then transformed with that fit.
tfidf = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
tfidf.fit(df["Combined_Headlines"].iloc[:n_train])
X_text = tfidf.transform(df["Combined_Headlines"])

# =============================
# 6. Feature Engineering
# =============================
# Numeric features added next to the TF-IDF columns: the daily sentiment score
# and its trailing 3-day mean. The window only looks backwards, so it uses no
# future information. min_periods=1 averages whatever history exists for the
# first two rows instead of replacing them with an artificial 0.
df["Sentiment_Rolling3"] = df["SentimentScore"].rolling(3, min_periods=1).mean()
X_features = np.hstack(
    [X_text.toarray(), df[["SentimentScore", "Sentiment_Rolling3"]].to_numpy()]
)

# Target variable: fail early with a readable message if the column is absent.
if TARGET_COL not in df.columns:
    raise KeyError(
        f"Target column {TARGET_COL!r} not found in the data; "
        f"available columns: {list(df.columns)}"
    )
y = df[TARGET_COL]

# =============================
# 7. Train/Test Split (Time-Aware)
# =============================
# shuffle=False keeps row order: the first rows train, the last rows test.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=TEST_SIZE, shuffle=False
)
# Guard against the vectorizer having been fitted on a different boundary.
assert len(X_train) == n_train, "TF-IDF fit rows and training rows disagree"

# =============================
# 8. Regression Models
# =============================
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

# Random Forest Regression (fixed seed for reproducibility; n_jobs=-1 only
# parallelises tree building across cores)
rf = RandomForestRegressor(n_estimators=200, random_state=SEED, n_jobs=-1)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# =============================
# 9. Evaluation
# =============================
def evaluate_model(y_true, y_pred, model_name):
    """Print and return the error metrics of one model's predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label used as the prefix of the printed line.

    Returns:
        A ``(mse, r2)`` tuple: the mean squared error and the R^2 score as
        computed by ``mean_squared_error`` and ``r2_score``.
    """
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # "\u00b2" is the superscript-two character. Writing it as an escape keeps
    # the label from being garbled (it previously printed as "RÂ²") if the
    # file is ever re-saved with the wrong encoding.
    print(f"{model_name} - MSE: {mse:.4f}, R\u00b2: {r2:.4f}")
    return mse, r2

# Score both models on the held-out rows.
mse_lr, r2_lr = evaluate_model(y_test, pred_lr, "Linear Regression")
mse_rf, r2_rf = evaluate_model(y_test, pred_rf, "Random Forest")

# =============================
# 10. Visualization
# =============================
# The test rows are the tail of df (the split is positional, not shuffled), so
# slicing from len(X_train) lines each prediction up with its source row.
results = df.iloc[len(X_train):].copy()
results["Pred_LR"] = pred_lr
results["Pred_RF"] = pred_rf

# Plot against the actual dates, with labelled axes, so the figure can be read
# on its own rather than against a bare positional index.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(results["Date"], y_test.values, label="Actual")
ax.plot(results["Date"], results["Pred_LR"], label="Linear Regression")
ax.plot(results["Date"], results["Pred_RF"], label="Random Forest")
ax.set(
    title="Actual vs Predicted Sentiment / Target",
    xlabel="Date",
    ylabel="Target value",
)
ax.legend()
fig.autofmt_xdate()  # tilt the date tick labels so they do not overlap
plt.show()

# =============================
# 11. Export Results
# =============================
# Write the test-period rows together with both models' predictions.
results.to_csv("sentiment_forecast_results.csv", index=False)

print("Pipeline complete. Results saved to 'sentiment_forecast_results.csv'")


ModuleNotFoundError: No module named 'pandas'