In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# === Load stock data ===
# Folder holding one CSV per ticker. The YFINANCE_DATA_DIR environment variable
# overrides the original hard-coded location so the notebook can run on another
# machine without editing this cell; when unset, the original path is used.
data_folder = os.environ.get(
    'YFINANCE_DATA_DIR',
    'C:\\Users\\Amenzz\\Downloads\\Compressed\\data1\\yfinance_data',
)
# os.listdir() returns entries in arbitrary order; sort so the row order of the
# combined frame is reproducible between runs.
files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))

# 'Date' is validated together with the price columns: it is parsed and used as
# the sort key below, so a file without it must be skipped rather than crash.
required_cols = {'Date', 'Open', 'High', 'Low', 'Close', 'Volume'}
stock_data = {}

for file in files:
    # The text before the first underscore of the file name is the stock label.
    stock_name = file.split('_')[0]
    df = pd.read_csv(os.path.join(data_folder, file))

    missing_cols = required_cols.difference(df.columns)
    if missing_cols:
        # Report skipped files instead of dropping them silently.
        print(f"Skipping {file}: missing columns {sorted(missing_cols)}")
        continue

    # Normalize to midnight so dates can be matched against day-level news dates.
    df['Date'] = pd.to_datetime(df['Date']).dt.normalize()
    df = df.sort_values('Date')
    df['stock'] = stock_name
    stock_data[stock_name] = df

# === Combine all stock data into one DataFrame ===
if not stock_data:
    # pd.concat() on an empty collection fails with an unhelpful message;
    # raise the same exception type with the actual cause.
    raise ValueError(f"No usable stock CSV files found in {data_folder!r}")
all_stock_df = pd.concat(stock_data.values(), ignore_index=True)

# === Example: Load news data ===
# Placeholder headlines; swap in the real news loader here. Every record
# carries the fields: headline, url, publisher, date, stock.
news_columns = ("headline", "url", "publisher", "date", "stock")
news_rows = [
    ("Apple launches new iPhone", "", "TechCrunch", "2024-06-01", "AAPL"),
    ("Apple beats earnings", "", "CNBC", "2024-06-01", "AAPL"),
    ("Apple under investigation", "", "Reuters", "2024-06-02", "AAPL"),
    ("Tesla expands factory", "", "CNN", "2024-06-01", "TSLA"),
]
news_data = [dict(zip(news_columns, row)) for row in news_rows]

# Parse the date strings and truncate them to midnight in one step.
news_df = pd.DataFrame(news_data).assign(
    date=lambda frame: pd.to_datetime(frame['date']).dt.normalize()
)

# === Aggregate news by date and stock ===
# One row per (stock, date) pair with the number of headlines on that day.
daily_news_count = (
    news_df
    .groupby(['stock', 'date'])
    .size()
    .rename('news_count')
    .reset_index()
)

# === Merge stock and news data ===
# Left join keeps every price row; days without a matching news row get a
# news_count of 0. The per-stock percentage change of Close is then added,
# with the first (undefined) value of each stock set to 0.
merged_df = (
    all_stock_df
    .merge(
        daily_news_count,
        how='left',
        left_on=['stock', 'Date'],
        right_on=['stock', 'date'],
    )
    .assign(news_count=lambda frame: frame['news_count'].fillna(0))
    # === Calculate price change ===
    .assign(
        price_change=lambda frame: frame.groupby('stock')['Close'].pct_change().fillna(0)
    )
)

# === Visualize correlation ===
# price_change comes from pct_change(), i.e. it is a fraction (0.01 == 1%).
# Scale a plotting copy by 100 so the plotted values agree with the "(%)"
# axis label; merged_df itself is left untouched.
plot_df = merged_df.assign(price_change_pct=merged_df['price_change'] * 100)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=plot_df, x='news_count', y='price_change_pct', hue='stock', ax=ax)
ax.set_title('News Count vs Stock Price Movement')
ax.set_xlabel('Daily News Count')
ax.set_ylabel('Daily Price Change (%)')
ax.grid(True)
fig.tight_layout()
plt.show()

# === Show correlation matrix ===
# Pairwise correlation between the daily headline count and the daily price
# change, printed under a heading.
correlation_matrix = merged_df[['news_count', 'price_change']].corr()
print("\n📊 Correlation Matrix:", correlation_matrix, sep="\n")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import os

# === Load stock data ===
# Folder holding one CSV per ticker. The YFINANCE_DATA_DIR environment variable
# overrides the original hard-coded location so the notebook can run on another
# machine without editing this cell; when unset, the original path is used.
data_folder = os.environ.get(
    'YFINANCE_DATA_DIR',
    'C:\\Users\\Amenzz\\Downloads\\Compressed\\data1\\yfinance_data',
)
# os.listdir() returns entries in arbitrary order; sort so the row order of the
# combined frame is reproducible between runs.
files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))

# 'Date' is validated together with the price columns: it is parsed and used as
# the sort key below, so a file without it must be skipped rather than crash.
required_cols = {'Date', 'Open', 'High', 'Low', 'Close', 'Volume'}
stock_data = {}

for file in files:
    # The text before the first underscore of the file name is the stock label.
    stock_name = file.split('_')[0]
    df = pd.read_csv(os.path.join(data_folder, file))

    missing_cols = required_cols.difference(df.columns)
    if missing_cols:
        # Report skipped files instead of dropping them silently.
        print(f"Skipping {file}: missing columns {sorted(missing_cols)}")
        continue

    # Normalize to midnight so dates can be matched against day-level news dates.
    df['Date'] = pd.to_datetime(df['Date']).dt.normalize()
    df = df.sort_values('Date')
    df['stock'] = stock_name
    # Fractional day-over-day change of Close; the first row has no previous
    # value, so its NaN is replaced by 0.
    df['daily_return'] = df['Close'].pct_change().fillna(0)
    stock_data[stock_name] = df

# === Combine all stock data into one DataFrame ===
if not stock_data:
    # pd.concat() on an empty collection fails with an unhelpful message;
    # raise the same exception type with the actual cause.
    raise ValueError(f"No usable stock CSV files found in {data_folder!r}")
all_stock_df = pd.concat(stock_data.values(), ignore_index=True)

# === Load or simulate news data ===
# Hand-written sample headlines, one record per headline.
news_data = [
    dict(headline="Apple launches new iPhone", date="2024-06-01", stock="AAPL"),
    dict(headline="Apple beats earnings expectations", date="2024-06-01", stock="AAPL"),
    dict(headline="Apple under investigation", date="2024-06-02", stock="AAPL"),
    dict(headline="Tesla expands factory", date="2024-06-01", stock="TSLA"),
    dict(headline="Tesla faces production delays", date="2024-06-02", stock="TSLA"),
]
news_df = pd.DataFrame.from_records(news_data)
# Convert the date strings to timestamps truncated to midnight.
news_df['date'] = news_df['date'].pipe(pd.to_datetime).dt.normalize()

# === Sentiment Analysis ===
def get_sentiment(text):
    """Return the TextBlob polarity score of a headline.

    Args:
        text: Headline text to score.

    Returns:
        ``TextBlob(text).sentiment.polarity`` for string input, or
        ``float('nan')`` when ``text`` is not a string (for example a missing
        headline read in as NaN/None), so one bad row does not abort scoring
        of the whole column.
    """
    # TextBlob rejects non-string input; treat such rows as "no score" instead
    # of letting the exception propagate out of a column-wide .apply().
    if not isinstance(text, str):
        return float('nan')
    return TextBlob(text).sentiment.polarity

# Score every headline: one polarity value per news row.
news_df['sentiment'] = news_df['headline'].map(get_sentiment)

# === Aggregate daily average sentiment per stock ===
# Mean score per (stock, date); the columns are renamed so the date key is
# called 'Date' and the mean is called 'avg_sentiment'.
daily_sentiment = (
    news_df
    .groupby(['stock', 'date'], as_index=False)['sentiment']
    .mean()
    .rename(columns={'date': 'Date', 'sentiment': 'avg_sentiment'})
)

# === Merge stock returns with sentiment ===
# Left join keeps every price row; rows without a sentiment value for that
# stock and date are given an average sentiment of 0.
# NOTE(review): zero-filling makes "no news" indistinguishable from "neutral
# news" in later statistics — confirm this is intended.
merged_df = (
    all_stock_df
    .merge(daily_sentiment, how='left', on=['stock', 'Date'])
    .assign(avg_sentiment=lambda frame: frame['avg_sentiment'].fillna(0))
)

# === Correlation Analysis ===
# One pass per stock: print the correlation between average sentiment and
# daily return, then draw a scatter plot of the two.
# groupby(sort=False) yields the stocks in order of first appearance without
# re-filtering the whole frame with a boolean mask for every stock.
for stock, stock_df in merged_df.groupby('stock', sort=False):
    # NOTE(review): the result is NaN when either column is constant for this
    # stock (e.g. no sentiment values other than the fill value) — confirm
    # that printing "nan" is acceptable here.
    corr = stock_df['avg_sentiment'].corr(stock_df['daily_return'])
    print(f"📈 Correlation for {stock}: {corr:.4f}")

    # === Plot ===
    # daily_return is a fraction (0.01 == 1%); scale a plotting copy by 100 so
    # the plotted values agree with the "(%)" axis label.
    plot_df = stock_df.assign(daily_return_pct=stock_df['daily_return'] * 100)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=plot_df, x='avg_sentiment', y='daily_return_pct', ax=ax)
    # Dashed reference lines through the origin split the plot into quadrants.
    ax.axvline(0, color='gray', linestyle='--')
    ax.axhline(0, color='gray', linestyle='--')
    ax.set_title(f'{stock} — Daily Return vs. News Sentiment')
    ax.set_xlabel('Average Daily Sentiment')
    ax.set_ylabel('Daily Return (%)')
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    # Release the figure so open figures do not accumulate across iterations.
    plt.close(fig)
