In [95]:
# Importing  common python libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [79]:
# User-defined functions


# Function to load the csv files in data frame

def load_csv_to_dataframe(file_path):
    """
    Read a CSV file from disk and hand it back as a pandas DataFrame.

    The function never raises: any failure while reading is reported on
    stdout and signalled to the caller through a ``None`` return value.

    Parameters:
    file_path (str): Location of the CSV file to read.

    Returns:
    pd.DataFrame or None: The parsed table, or None when reading failed.
    """
    try:
        frame = pd.read_csv(file_path)
        print("CSV file loaded successfully.")
    except Exception as err:
        # Best-effort loader: report the problem and fall through with None
        print(f"An error occurred while loading the CSV file: {err}")
        frame = None
    return frame


# *************************************************************************


# ################# Function for creating features #################

def create_features(df, window=7):
    """
    Add rolling, exponential-moving-average, volume and sentiment features.

    The input frame is copied and sorted by its index before any feature is
    computed, so the caller's frame is left untouched.
    (Assumes the index order is the intended time order - TODO confirm
    against the caller.)

    Parameters:
    df (pd.DataFrame): Frame containing the columns 'Close', 'Shares Traded',
        'summary_vader', 'summary_sentiment', 'description_vader' and
        'description_sentiment'.
    window (int): Number of rows used for every rolling window and as the
        EMA span. The value is embedded in the generated column names, so
        the default of 7 yields 'Close_rolling_mean_7', 'Close_ema_7', etc.

    Returns:
    pd.DataFrame: Sorted copy of ``df`` with the feature columns appended and
        every row that contains at least one NaN removed.

    Raises:
    KeyError: If one or more of the required columns are missing.
    """
    required_columns = [
        'Close',
        'Shares Traded',
        'summary_vader',
        'summary_sentiment',
        'description_vader',
        'description_sentiment',
    ]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        # Same exception type pandas would raise, but naming every missing column
        raise KeyError(f"create_features: missing required column(s): {missing}")

    df = df.copy().sort_index()

    # Price features: rolling mean / standard deviation of the close
    df[f'Close_rolling_mean_{window}'] = df['Close'].rolling(window=window).mean()
    df[f'Close_rolling_std_{window}'] = df['Close'].rolling(window=window).std()

    # Exponential Moving Average (EMA)
    df[f'Close_ema_{window}'] = df['Close'].ewm(span=window, adjust=False).mean()

    # Volume-based features
    df[f'Volume_rolling_mean_{window}'] = df['Shares Traded'].rolling(window=window).mean()
    df['Volume_pct_change'] = df['Shares Traded'].pct_change()

    # VADER & sentiment features: rolling mean of each score column
    sentiment_columns = (
        'summary_vader',
        'summary_sentiment',
        'description_vader',
        'description_sentiment',
    )
    for col in sentiment_columns:
        df[f'{col}_rolling_mean_{window}'] = df[col].rolling(window=window).mean()

    # Drop rows holding NaN. The rolling windows leave the first window-1 rows
    # empty and pct_change leaves the first row empty; note that dropna() also
    # removes rows whose NaN was already present in the input columns.
    df = df.dropna()

    return df

##################



In [81]:
# Load the preprocessed NIFTY 50 prices and the aggregated news sentiments.
#
# Forward slashes are used on purpose: '\/' is not a valid Python escape
# sequence (it triggers an invalid-escape warning and leaves a stray backslash
# in the path). Windows accepts '/' as a separator, so the same files are
# addressed. Adjust DATASET_DIR when running on another machine.
DATASET_DIR = 'C:/Users/609370801/DSP_project/DSP_project/Dataset'

df_stocks = load_csv_to_dataframe(
    f'{DATASET_DIR}/stocks/preprocessed/NIFTY 50-26-10-2023-to-25-10-2024_preprocessing.csv'
)
df_news_sentiments = load_csv_to_dataframe(
    f'{DATASET_DIR}/aggregated/aggregated_news_data.csv'
)

# load_csv_to_dataframe returns None when reading fails; stop here with a
# clear message instead of failing later with an unrelated TypeError.
if df_stocks is None or df_news_sentiments is None:
    raise RuntimeError(
        f"Could not load the input CSV files from '{DATASET_DIR}'; "
        "see the loader message printed above."
    )


CSV file loaded successfully.
CSV file loaded successfully.


Unnamed: 0,market_date,summary_vader,summary_sentiment,description_vader,description_sentiment,news_count
0,2024-05-01,0.753775,0.25,0.2334,-0.5,4
1,2024-05-02,0.281233,0.333333,0.214233,0.0,3
2,2024-05-05,0.649033,0.0,0.324667,0.0,3
3,2024-05-06,-0.4408,-1.0,0.4136,0.0,2
4,2024-05-07,-0.06345,0.0,0.359125,-0.25,4


In [85]:
# Preview the first rows of the aggregated news-sentiment frame (head() shows 5 rows by default)
df_news_sentiments.head()

Unnamed: 0,market_date,summary_vader,summary_sentiment,description_vader,description_sentiment,news_count
0,2024-05-01,0.753775,0.25,0.2334,-0.5,4
1,2024-05-02,0.281233,0.333333,0.214233,0.0,3
2,2024-05-05,0.649033,0.0,0.324667,0.0,3
3,2024-05-06,-0.4408,-1.0,0.4136,0.0,2
4,2024-05-07,-0.06345,0.0,0.359125,-0.25,4


In [87]:
# Preview the first rows of the stock price frame (head() shows 5 rows by default)
df_stocks.head()



Unnamed: 0,Date,Open,High,Low,Close,Shares Traded,Turnover (₹ Cr),Daily_Change,Daily_change_percent
0,2023-10-26,19027.25,19041.7,18837.85,18857.25,300356469.0,28939.64,-170.0,-0.893455
1,2023-10-27,18928.75,19076.15,18926.65,19047.25,205201044.0,19947.48,118.5,0.626032
2,2023-10-30,19053.4,19158.5,18940.0,19140.9,180132492.0,17095.73,87.5,0.459236
3,2023-10-31,19232.95,19233.7,19056.45,19079.6,206049341.0,19397.36,-153.35,-0.79733
4,2023-11-01,19064.05,19096.05,18973.7,18989.15,194103279.0,18452.51,-74.9,-0.392886


In [89]:
# Merge stock data with the aggregated sentiment scores on the calendar date

# Both key columns are converted to datetime so the join compares like with like
df_stocks['Date'] = pd.to_datetime(df_stocks['Date'])
df_news_sentiments['market_date'] = pd.to_datetime(df_news_sentiments['market_date'])

# Inner join: only dates present in both frames survive
work_correlation_df = df_news_sentiments.merge(
    df_stocks,
    how='inner',
    left_on='market_date',
    right_on='Date',
)
work_correlation_df.head()



Unnamed: 0,market_date,summary_vader,summary_sentiment,description_vader,description_sentiment,news_count,Date,Open,High,Low,Close,Shares Traded,Turnover (₹ Cr),Daily_Change,Daily_change_percent
0,2024-05-02,0.281233,0.333333,0.214233,0.0,3,2024-05-02,22567.85,22710.5,22567.85,22648.2,445909456.0,43133.31,80.35,0.356037
1,2024-05-06,-0.4408,-1.0,0.4136,0.0,2,2024-05-06,22561.6,22588.8,22409.45,22442.7,320255789.0,33417.53,-118.9,-0.527002
2,2024-05-07,-0.06345,0.0,0.359125,-0.25,4,2024-05-07,22489.75,22499.05,22232.05,22302.5,297815469.0,31512.26,-187.25,-0.832602
3,2024-05-08,0.39975,0.0,0.16515,-0.5,2,2024-05-08,22231.2,22368.65,22185.2,22302.5,277438692.0,29717.33,71.3,0.32072
4,2024-05-09,0.72125,-0.5,0.4903,-0.5,2,2024-05-09,22224.8,22307.75,21932.4,21957.5,331327454.0,37870.88,-267.3,-1.20271


In [91]:
# Print the column dtypes and non-null counts of the merged frame
work_correlation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   market_date            122 non-null    datetime64[ns]
 1   summary_vader          122 non-null    float64       
 2   summary_sentiment      122 non-null    float64       
 3   description_vader      122 non-null    float64       
 4   description_sentiment  122 non-null    float64       
 5   news_count             122 non-null    int64         
 6   Date                   122 non-null    datetime64[ns]
 7   Open                   122 non-null    float64       
 8   High                   122 non-null    float64       
 9   Low                    122 non-null    float64       
 10  Close                  122 non-null    float64       
 11  Shares Traded          120 non-null    float64       
 12  Turnover (₹ Cr)        120 non-null    float64       
 13  Daily

In [93]:
# Correlation study: pairwise Pearson correlation across the columns of the merged frame
correlation = work_correlation_df.corr(method='pearson')

# Print a heading, then let the notebook render the matrix as the cell output
print("\nCorrelation Matrix:")
correlation


Correlation Matrix:


Unnamed: 0,market_date,summary_vader,summary_sentiment,description_vader,description_sentiment,news_count,Date,Open,High,Low,Close,Shares Traded,Turnover (₹ Cr),Daily_Change,Daily_change_percent
market_date,1.0,-0.063815,-0.229887,-0.141188,-0.178483,0.414032,1.0,0.883435,0.887561,0.865966,0.868384,-0.17755,-0.106415,-0.06489,-0.061786
summary_vader,-0.063815,1.0,0.744728,0.642447,0.568768,0.09559,-0.063815,0.068198,0.078511,0.111201,0.130368,-0.105499,-0.033877,0.353516,0.34589
summary_sentiment,-0.229887,0.744728,1.0,0.700357,0.78421,-0.061366,-0.229887,-0.073739,-0.052909,-0.01627,0.01502,0.001014,0.056503,0.500772,0.49086
description_vader,-0.141188,0.642447,0.700357,1.0,0.700668,-0.059465,-0.141188,0.023753,0.024487,0.064432,0.080874,-0.091747,-0.027713,0.32391,0.309438
description_sentiment,-0.178483,0.568768,0.78421,0.700668,1.0,0.073662,-0.178483,-0.023787,0.000159,0.031848,0.069082,0.072903,0.133061,0.52519,0.513934
news_count,0.414032,0.09559,-0.061366,-0.059465,0.073662,1.0,0.414032,0.453854,0.475828,0.429276,0.455986,0.144945,0.144775,0.022504,0.025255
Date,1.0,-0.063815,-0.229887,-0.141188,-0.178483,0.414032,1.0,0.883435,0.887561,0.865966,0.868384,-0.17755,-0.106415,-0.06489,-0.061786
Open,0.883435,0.068198,-0.073739,0.023753,-0.023787,0.453854,0.883435,1.0,0.995419,0.985157,0.984343,-0.122736,-0.047338,-0.065644,-0.06645
High,0.887561,0.078511,-0.052909,0.024487,0.000159,0.475828,0.887561,0.995419,1.0,0.984118,0.990743,-0.100209,-0.025236,-0.003585,-0.004387
Low,0.865966,0.111201,-0.01627,0.064432,0.031848,0.429276,0.865966,0.985157,0.984118,1.0,0.993526,-0.219778,-0.129682,0.070028,0.069556


<!-- Strong Positive Correlations:

Open, High, Low, and Close: These variables are highly correlated with each other, with correlation coefficients around 0.98 to 0.99. This indicates that when one of these variables increases, the others tend to increase as well.
Shares Traded and Turnover (₹ Cr): There is a strong positive correlation (r = 0.95), suggesting that higher trading volumes are associated with higher turnover.
Daily Change and Daily Change Percent: These two variables are almost perfectly correlated (r = 0.99), which is expected as they represent similar concepts.
Moderate Positive Correlations:

Sentiment Scores: There are moderate positive correlations between the sentiment scores derived from summaries and descriptions:
Summary Vader and Summary Sentiment (r = 0.74)
Summary Vader and Description Vader (r = 0.64)
Summary Vader and Description Sentiment (r = 0.57)
Summary Sentiment and Description Vader (r = 0.70)
Summary Sentiment and Description Sentiment (r = 0.78)
Description Vader and Description Sentiment (r = 0.70)
Date with Open, High, Low, and Close: The date shows strong positive correlations with these variables (r ≈ 0.87 to 0.89), noticeably higher than the other entries in this group, indicating a general upward trend over time.
News Count with Market Variables: News count has moderate positive correlations with Date (r = 0.41), Open (r = 0.45), High (r = 0.48), Low (r = 0.43), and Close (r = 0.46).
Weak Negative Correlations:

Market Date with Sentiment Scores: There are weak negative correlations between market date and sentiment scores, such as summary sentiment (r = -0.23) and description sentiment (r = -0.18).
These findings suggest that market variables like Open, High, Low, and Close are closely related, and sentiment scores from summaries and descriptions are moderately correlated. Additionally, trading volume and turnover are strongly linked, and there is a general upward trend in market variables over time. News count also shows some correlation with market activity. -->