# In [1]:
# Import necessary libraries
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

# Function to load stock data
def load_data(file_path):
    """
    Read a stock-price CSV into a DataFrame indexed by its 'Date' column.

    :param file_path: Path (or file-like object) of the CSV; it must have a
                      'Date' column, which is parsed as dates
    :return: DataFrame whose index is the parsed 'Date' column
    """
    # 'Date' is parsed and promoted to the index in a single read.
    return pd.read_csv(file_path, index_col='Date', parse_dates=['Date'])

# Function to load news sentiment data
def load_news(file_path):
    """
    Read a news-sentiment CSV into a DataFrame indexed by its 'Date' column.

    :param file_path: Path (or file-like object) of the CSV; it must have a
                      'Date' column, which is parsed as dates
    :return: DataFrame whose index is the parsed 'Date' column
    """
    read_options = {'parse_dates': ['Date'], 'index_col': 'Date'}
    news_frame = pd.read_csv(file_path, **read_options)
    return news_frame

# Calculate daily returns for the stock data
def calculate_daily_returns(stock_df):
    """
    Calculate daily returns as the percentage change in the closing price.

    The result is a new DataFrame; the DataFrame passed in is left untouched,
    so calling this function has no side effects on the caller's data.

    :param stock_df: DataFrame containing stock data with a 'Close' column
    :return: Copy of stock_df with an added 'Daily_Return' column, expressed
             in percent. The first row is NaN because it has no previous
             close to compare against.
    """
    # pct_change() yields a fraction; scale by 100 to report percent.
    daily_return = stock_df['Close'].pct_change() * 100
    # assign() builds a new frame instead of writing into the caller's object.
    return stock_df.assign(Daily_Return=daily_return)

# Aggregate sentiment scores by date, calculating the mean sentiment for each day
def aggregate_sentiment(news_df):
    """
    Average the 'Sentiment' scores of all rows that fall on the same calendar day.

    The day keys are taken from the index and handed to ``groupby`` directly
    rather than being written into a 'Date' column first. This leaves the
    caller's DataFrame unmodified and avoids pandas' "both an index level and
    a column label" ValueError, which is raised when the index is itself
    named 'Date' (as it is for frames read with ``index_col='Date'``).

    :param news_df: DataFrame with a DatetimeIndex and a 'Sentiment' column
    :return: Series named 'Sentiment' with the mean score for each day,
             indexed by ``datetime.date`` values under the index name 'Date'
    """
    # Strip the time-of-day so every row of one day shares the same key.
    day_keys = news_df.index.date
    daily_sentiment = news_df.groupby(day_keys)['Sentiment'].mean()
    # Grouping by a bare array leaves the result index unnamed; label it.
    daily_sentiment.index.name = 'Date'
    return daily_sentiment

# Merge stock data and daily sentiment data based on the date
def merge_data(stock_df, daily_sentiment):
    """
    Merge stock data and daily sentiment data based on the calendar date.

    Only dates present in both inputs are kept (inner join). The merge is
    done on a copy of stock_df, so the caller's DataFrame is not modified.

    If the stock index is named 'Date' (as it is for frames read with
    ``index_col='Date'``), adding a 'Date' key column would make the label
    ambiguous and ``pd.merge`` would raise a ValueError. The index name is
    therefore cleared on the working copy; the index values are unchanged.

    :param stock_df: DataFrame containing stock data, indexed by a DatetimeIndex
    :param daily_sentiment: Named Series of daily sentiment scores, indexed by
                            ``datetime.date`` values
    :return: DataFrame with the stock columns, a 'Date' column of
             ``datetime.date`` keys and the sentiment column; rows keep the
             index values of the matching stock rows
    """
    left = stock_df.copy()
    if left.index.name == 'Date':
        # Index.rename returns a new Index, so stock_df's own index keeps its name.
        left.index = left.index.rename(None)
    # Join key: the calendar day of each stock row, matching the sentiment index.
    left['Date'] = left.index.date
    merged_df = pd.merge(left, daily_sentiment, left_on='Date', right_index=True, how='inner')
    return merged_df

# Calculate Pearson correlation between sentiment and stock returns
def calculate_correlation(merged_df):
    """
    Calculate Pearson correlation between sentiment and stock returns.

    Rows where either value is missing are excluded before the test. The
    'Daily_Return' column produced by ``pct_change`` has NaN in its first
    row, and a single NaN would otherwise turn the whole result into NaN.

    :param merged_df: DataFrame with 'Sentiment' and 'Daily_Return' columns
    :return: Pearson correlation coefficient and p-value
    """
    # Keep only complete (sentiment, return) pairs.
    paired = merged_df[['Sentiment', 'Daily_Return']].dropna()
    correlation, p_value = stats.pearsonr(paired['Sentiment'], paired['Daily_Return'])
    return correlation, p_value

# Run the task
def run_task_3(stock_file, news_file):
    """
    Run the sentiment-versus-returns analysis end to end and print the result.

    Steps: load both CSV files, derive daily returns, average sentiment per
    day, merge the two on date, then compute the Pearson correlation.

    :param stock_file: Path to the stock data CSV
    :param news_file: Path to the news sentiment CSV
    :return: None; the correlation coefficient and p-value are printed
    """
    # Inputs
    prices = load_data(stock_file)
    headlines = load_news(news_file)

    # Per-day series for each side of the comparison
    returns = calculate_daily_returns(prices)
    sentiment_by_day = aggregate_sentiment(headlines)

    # Align on date and measure the linear relationship
    combined = merge_data(returns, sentiment_by_day)
    r_value, significance = calculate_correlation(combined)

    # Report
    print(f"Pearson correlation between news sentiment and stock returns: {r_value}")
    print(f"P-value: {significance}")

# Define file paths (relative paths are resolved against the current working directory)
stock_file = '../data/MSFT_historical_data.csv'  # Replace with your stock data file path
# NOTE(review): this path names a TSLA *historical data* file, not a news
# sentiment file. aggregate_sentiment reads a 'Sentiment' column, so confirm
# that this CSV really contains one before relying on the result.
news_file = '../data/TSLA_historical_data.csv'   # Replace with your news sentiment file path

# Run the task and print results (executes as soon as this cell/module is run)
run_task_3(stock_file, news_file)


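# Recorded cell output. The path shown differs from stock_file above, so it is
# presumably from an earlier run with a different path:
# FileNotFoundError: [Errno 2] No such file or directory: 'data/MSFT_historical_data.csv'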