In [1]:
!pip install textblob
import sys
print(sys.executable)


C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\python.exe



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# =============================
# 1. Imports
# =============================
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from textblob import TextBlob  # simpler sentiment scoring without extra installation

# =============================
# 2. Load data
# =============================
# Replace with your CSV file.
# The rolling window below and the later shuffle=False split are only
# meaningful on chronologically ordered rows, so order by Date explicitly.
# A stable sort keeps the file order for rows sharing the same date.
df = (
    pd.read_csv("your_data.csv", parse_dates=["Date"])
    .sort_values("Date", kind="stable")
    .reset_index(drop=True)
)

# =============================
# 3. Fill missing headline values
# =============================
# The headline columns are selected by position (columns 2..26). Resolve their
# labels once so the fill and the join below are guaranteed to use the same set.
# NOTE(review): assumes the 25 headline columns sit at positions 2-26 — confirm
# against the actual CSV layout.
headline_cols = df.columns[2:27]
df[headline_cols] = df[headline_cols].fillna("")

# =============================
# 4. Combine Top1–Top25 into single daily text
# =============================
df["Combined_Headlines"] = df[headline_cols].astype(str).apply(" ".join, axis=1)

# =============================
# 5. Lexicon-based sentiment scoring using TextBlob
# =============================
df["SentimentScore"] = df["Combined_Headlines"].apply(
    lambda text: TextBlob(text).sentiment.polarity
)

# Optional: rolling sentiment (3-day window).
# min_periods=1 averages over the rows that exist so far for the first two
# rows, instead of substituting an artificial neutral score of 0.
df["Sentiment_Rolling3"] = df["SentimentScore"].rolling(3, min_periods=1).mean()

# =============================
# 6. Target variable and outlier mask (Z-score method)
# =============================
y = df["Sentiment"]  # numeric outcome

# Keep rows whose target lies within 3 standard deviations of the mean.
# nan_policy="omit" stops a single missing target from turning every z-score
# into NaN (which would empty the dataset). Rows with a NaN target still get a
# NaN z-score, fail the comparison below, and are dropped.
z_scores = stats.zscore(y.to_numpy(dtype=float), nan_policy="omit")
mask = np.abs(z_scores) < 3

text_filtered = df.loc[mask, "Combined_Headlines"]
numeric_filtered = df.loc[mask, ["SentimentScore", "Sentiment_Rolling3"]].to_numpy()
y_filtered = y[mask]  # keeps df's index labels, so rows stay traceable

print("Removed outliers. Dataset size after filtering:", len(y_filtered))

# =============================
# 7. Train/Test Split (time-aware)
# =============================
# shuffle=False keeps row order: the first 80% of rows train, the last 20% test.
# Splitting BEFORE vectorization is deliberate — see step 8.
text_train, text_test, num_train, num_test, y_train, y_test = train_test_split(
    text_filtered, numeric_filtered, y_filtered, test_size=0.2, shuffle=False
)

# =============================
# 8. TF-IDF Vectorization (fitted on the training rows only)
# =============================
# Fitting the vectorizer on all rows would let the test period's vocabulary and
# document frequencies leak into the training features. Learn them from the
# training text, then apply the same transform to the test text.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_train = tfidf.fit_transform(text_train)
tfidf_test = tfidf.transform(text_test)

# =============================
# 9. Combine TF-IDF + numeric features
# =============================
# Dense arrays are kept so that len(X_train) and positional slicing keep working.
X_train = np.hstack([tfidf_train.toarray(), num_train])
X_test = np.hstack([tfidf_test.toarray(), num_test])

# =============================
# 10. Regression Models
# =============================
# Both estimators are trained on the same training split and then asked for
# predictions on the held-out split. `fit` returns the estimator itself, so
# construction and fitting are chained.

# Linear Regression
lr = LinearRegression().fit(X_train, y_train)
pred_lr = lr.predict(X_test)

# Random Forest Regression (fixed seed so repeated runs give the same forest)
rf = RandomForestRegressor(random_state=42, n_estimators=200).fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# =============================
# 11. Evaluation
# =============================
def evaluate_model(y_true, y_pred, model_name):
    """Print and return the MSE and R² of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned element-for-element with ``y_true``.
        model_name: Label placed at the start of the printed summary line.

    Returns:
        A ``(mse, r2)`` tuple as computed by ``mean_squared_error`` and
        ``r2_score``.
    """
    mse, r2 = (metric(y_true, y_pred) for metric in (mean_squared_error, r2_score))
    print(f"{model_name} - MSE: {mse:.4f}, R²: {r2:.4f}")
    return mse, r2

# Score each model on the held-out split; every call prints one summary line.
lr_scores = evaluate_model(y_test, pred_lr, "Linear Regression")
rf_scores = evaluate_model(y_test, pred_rf, "Random Forest")
mse_lr, r2_lr = lr_scores
mse_rf, r2_rf = rf_scores

# =============================
# 12. Visualization
# =============================
# Overlay the held-out targets and both models' predictions on a single axes;
# the x-axis is simply the position of each sample within the test split.
fig, ax = plt.subplots(figsize=(12, 6))
for series, series_label in (
    (y_test.values, "Actual"),
    (pred_lr, "Linear Regression"),
    (pred_rf, "Random Forest"),
):
    ax.plot(series, label=series_label)
ax.legend()
ax.set_title("Actual vs Predicted Sentiment / Target")
plt.show()

# =============================
# 13. Export Results
# =============================
# Select the test rows by the index labels carried on y_test rather than by a
# positional slice of df. Outlier filtering removes rows from the modelling
# data but not from df, so df.iloc[len(X_train):] can contain more rows than
# there are predictions (assignment then fails) and does not correspond to the
# rows that were actually predicted.
results = df.loc[y_test.index].copy()
results["Pred_LR"] = pred_lr
results["Pred_RF"] = pred_rf
results.to_csv("sentiment_forecast_results.csv", index=False)

print("Pipeline complete. Results saved to 'sentiment_forecast_results.csv'")


ModuleNotFoundError: No module named 'textblob'