In [None]:
from datetime import datetime
from urllib.parse import quote_plus
from urllib.request import Request, urlopen

import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Checkpoint identifier shared by the tokenizer (text -> tokens) and the
# sequence-classification model used for sentiment analysis
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)



In [None]:
# One-time setup, intentionally left commented out: uncomment these lines to save
# the loaded tokenizer and model to the directory below.
#save_directory = '/content/drive/MyDrive/Final Year Project/twitter_roberta_sentiment'

# Save the tokenizer and model
#tokenizer.save_pretrained(save_directory)
#model.save_pretrained(save_directory)

In [None]:
# Directory that holds the previously saved tokenizer and model files
load_directory = '/content/drive/MyDrive/Final Year Project/twitter_roberta_sentiment'

# Restore both objects from that directory: tokenizer first, then the classifier
tokenizer, model = (
    auto_cls.from_pretrained(load_directory)
    for auto_cls in (AutoTokenizer, AutoModelForSequenceClassification)
)

In [None]:
# Min-max scaler with its target range spelled out (this is the library default)
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Download window: the two years ending today, with both bounds at midnight
today = datetime.now()
end = datetime(today.year, today.month, today.day)
try:
    start = end.replace(year=end.year - 2)
except ValueError:
    # Raised only on 29 February: the year two years back is never a leap year,
    # so that calendar day does not exist there. Fall back to 28 February.
    start = end.replace(year=end.year - 2, day=28)
print(start)
print(end)

2022-06-19 00:00:00
2024-06-19 00:00:00


In [None]:
# Override how Pandas Datareader reads data, but only when this yfinance build
# still provides the hook: yfinance reports pdr_override() as deprecated and
# scheduled for removal, after which an unconditional call would raise
# AttributeError.
if hasattr(yf, 'pdr_override'):
    yf.pdr_override()

yfinance: pandas_datareader support is deprecated & semi-broken so will be removed in a future verison. Just use yfinance.


In [None]:
# Yahoo Finance tickers to download; every entry ends with the 3-character '.NS' suffix
stocks = ['ADANIENT.NS', 'ADANIPORTS.NS', 'APOLLOHOSP.NS', 'ASIANPAINT.NS', 'AXISBANK.NS',
          'BAJAJ-AUTO.NS', 'BAJFINANCE.NS', 'BAJAJFINSV.NS', 'BPCL.NS', 'BHARTIARTL.NS',
          'BRITANNIA.NS', 'CIPLA.NS', 'COALINDIA.NS', 'DIVISLAB.NS', 'DRREDDY.NS',
          'EICHERMOT.NS', 'GRASIM.NS', 'HCLTECH.NS', 'HDFCBANK.NS', 'HDFCLIFE.NS',
          'HEROMOTOCO.NS', 'HINDALCO.NS', 'HINDUNILVR.NS', 'ICICIBANK.NS', 'ITC.NS',
          'INDUSINDBK.NS', 'INFY.NS', 'JSWSTEEL.NS', 'KOTAKBANK.NS', 'LTIM.NS', 'LT.NS',
          'M&M.NS', 'MARUTI.NS', 'NTPC.NS', 'NESTLEIND.NS', 'ONGC.NS', 'POWERGRID.NS',
          'RELIANCE.NS', 'SBILIFE.NS', 'SHRIRAMFIN.NS', 'SBIN.NS', 'SUNPHARMA.NS',
          'TCS.NS', 'TATACONSUM.NS', 'TATAMOTORS.NS', 'TATASTEEL.NS', 'TECHM.NS',
          'TITAN.NS', 'ULTRACEMCO.NS', 'WIPRO.NS']

# Seconds to wait on the Google News request before giving up (no timeout = may hang forever)
NEWS_REQUEST_TIMEOUT = 30

# Explicit token cap for the tokenizer. The recorded run warned that no maximum
# length was available, so `truncation=True` alone performed no truncation.
MAX_HEADLINE_TOKENS = 512


def _fetch_headlines(query):
    """Scrape the Google News search page for `query`.

    Returns a DataFrame with one row per (title, time) pair found on the page:
    'Date' is the calendar date parsed from the <time> tag's `datetime`
    attribute and 'Headline' is the stripped link text. The frame is empty
    when the page contains no matching tags.
    """
    # URL-encode the query: a raw '&' (as in 'M&M') would end the `q` parameter early
    link = f"https://news.google.com/search?q={quote_plus(query)}&hl=en-IN&gl=IN&ceid=IN%3Aen"
    req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    # Context manager closes the HTTP response even if reading fails
    with urlopen(req, timeout=NEWS_REQUEST_TIMEOUT) as response:
        webpage = response.read()
    soup = BeautifulSoup(webpage, 'html.parser')

    # Extract titles and dates by CSS class
    titles = soup.find_all('a', class_='JtKRv')
    datetimes = soup.find_all('time', class_='hvbAAd')

    dates_list = []
    titles_list = []
    # NOTE(review): zip pairs tags purely by position and stops at the shorter
    # list; a result without a <time> tag would shift every later pairing.
    for title, datetime_tag in zip(titles, datetimes):
        published = datetime.strptime(datetime_tag['datetime'], '%Y-%m-%dT%H:%M:%S%z')
        # NOTE(review): .date() keeps the timestamp's own UTC offset; confirm this
        # matches the calendar used by the price data's 'Date' values.
        dates_list.append(published.date())
        titles_list.append(title.text.strip())

    return pd.DataFrame({'Date': dates_list, 'Headline': titles_list})


def _headline_sentiment(headline):
    """Score one headline with the notebook-level `tokenizer` and `model`.

    The model's first output row is passed through softmax and the score is
    class index 2 minus class index 0, which lies between -1 and 1
    (presumably positive minus negative; confirm against the model's labels).
    """
    encoded_headline = tokenizer(headline, padding=True, truncation=True,
                                 max_length=MAX_HEADLINE_TOKENS, return_tensors='pt')
    output = model(**encoded_headline)
    scores = softmax(output[0][0].detach().numpy())
    return scores[2] - scores[0]


# Per-stock scaled frames, concatenated once after the loop instead of growing
# a DataFrame on every iteration
scaled_frames = []

# Loop through each stock and download the data
for stock in stocks:
    # Download data from Yahoo Finance
    df = yf.download(stock, start=start, end=end)

    # Ticker for searching news: drop the trailing '.NS'
    stockraw = stock[:-3]

    # Headlines and their sentiment scores; the explicit dtype keeps the column
    # numeric even when no headline was found
    df2 = _fetch_headlines(stockraw)
    df2['Sentiments'] = pd.Series(
        [_headline_sentiment(headline) for headline in df2['Headline']],
        index=df2.index,
        dtype='float64',
    )

    # Convert 'Date' column to datetime so it can be matched against the price data
    df2['Date'] = pd.to_datetime(df2['Date'])

    # Group by 'Date' and calculate the average sentiment for each day
    average_sentiments = df2.groupby('Date')['Sentiments'].mean().reset_index()

    # Left join keeps every price row; days without headlines get NaN sentiment
    merged_df = pd.merge(df, average_sentiments, on='Date', how='left')

    # Fill missing sentiment scores with 0 by assigning the result back; the
    # chained `inplace=True` form does not reliably update the frame
    merged_df['Sentiments'] = pd.to_numeric(merged_df['Sentiments'].fillna(0))

    # Isolating features for training: positions 1-7, skipping position 0
    # (in the recorded run these were Open ... Volume plus Sentiments)
    cols = list(merged_df)[1:8]

    # Storing all data for scaling
    df_for_scaling = merged_df[cols]

    # A fresh scaler per stock, so each ticker is scaled to [0, 1] on its own range
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_numerical_data = scaler.fit_transform(df_for_scaling)

    # Create a new DataFrame for the scaled data
    df_scaled = df_for_scaling.copy()
    df_scaled[cols] = scaled_numerical_data

    # Add a column for the stock ticker
    df_scaled['Stock'] = stock

    scaled_frames.append(df_scaled)

# Single concatenation; row labels restart for every stock, as before
all_stock_data = pd.concat(scaled_frames) if scaled_frames else pd.DataFrame()

print(all_stock_data)

[*********************100%%**********************]  1 of 1 completed
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%***

         Open      High       Low     Close  Adj Close    Volume  Sentiments  \
0    0.317591  0.281448  0.330458  0.298254   0.297956  0.053975    0.484371   
1    0.315182  0.300118  0.352318  0.325830   0.325520  0.024824    0.484371   
2    0.331667  0.290713  0.347546  0.301400   0.301101  0.034967    0.484371   
3    0.316832  0.283082  0.348497  0.308517   0.308215  0.017841    0.484371   
4    0.322112  0.296989  0.357877  0.325712   0.325403  0.020814    0.484371   
..        ...       ...       ...       ...        ...       ...         ...   
486  0.645024  0.639752  0.674848  0.641595   0.643044  0.047972    1.000000   
487  0.638637  0.667837  0.681467  0.646146   0.647576  0.084161    0.057688   
488  0.649281  0.672968  0.695808  0.676660   0.677966  0.077096    0.303429   
489  0.675891  0.674318  0.685053  0.649358   0.650775  0.063649    0.375911   
490  0.654603  0.714286  0.706288  0.726178   0.727284  0.228773    0.457440   

           Stock  
0    ADANIENT.NS  
1

In [24]:
# Work on an independent (deep) copy so the scaled per-stock data stays untouched
df_for_training = all_stock_data.copy(deep=True)

In [27]:
# Build the training frame straight from `all_stock_data` so this cell can be
# re-run safely: the earlier version rebound `df_for_training` and dropped
# 'Stock' in place, so a second run failed on the missing column.
training_base = all_stock_data.reset_index(drop=True)

# Initialize OneHotEncoder for the 'Stock' column (dense output, one column per ticker)
one_hot_encoder = OneHotEncoder(sparse_output=False)
stock_encoded = one_hot_encoder.fit_transform(training_base[['Stock']])

# Create a DataFrame with the one-hot encoded columns
stock_encoded_df = pd.DataFrame(stock_encoded, columns=one_hot_encoder.get_feature_names_out(['Stock']))

# Feature columns followed by the one-hot columns; the original 'Stock'
# column is left out because the indicator columns replace it
df_for_training = pd.concat([training_base.drop(columns='Stock'), stock_encoded_df], axis=1)

In [28]:
# Display the array returned by the encoder's fit_transform on the 'Stock' column
stock_encoded

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [29]:
# Display the training frame: scaled feature columns followed by the one-hot 'Stock_*' columns
df_for_training

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Sentiments,Stock_ADANIENT.NS,Stock_ADANIPORTS.NS,Stock_APOLLOHOSP.NS,...,Stock_SHRIRAMFIN.NS,Stock_SUNPHARMA.NS,Stock_TATACONSUM.NS,Stock_TATAMOTORS.NS,Stock_TATASTEEL.NS,Stock_TCS.NS,Stock_TECHM.NS,Stock_TITAN.NS,Stock_ULTRACEMCO.NS,Stock_WIPRO.NS
0,0.317591,0.281448,0.330458,0.298254,0.297956,0.053975,0.484371,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.315182,0.300118,0.352318,0.325830,0.325520,0.024824,0.484371,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.331667,0.290713,0.347546,0.301400,0.301101,0.034967,0.484371,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.316832,0.283082,0.348497,0.308517,0.308215,0.017841,0.484371,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.322112,0.296989,0.357877,0.325712,0.325403,0.020814,0.484371,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24543,0.645024,0.639752,0.674848,0.641595,0.643044,0.047972,1.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24544,0.638637,0.667837,0.681467,0.646146,0.647576,0.084161,0.057688,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24545,0.649281,0.672968,0.695808,0.676660,0.677966,0.077096,0.303429,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
24546,0.675891,0.674318,0.685053,0.649358,0.650775,0.063649,0.375911,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Write the training frame to CSV; index=False leaves the row index out of the file
training_csv_path = '/content/drive/MyDrive/Datasets/Project/df_for_training2.csv'
df_for_training.to_csv(training_csv_path, index=False)

In [None]:
# Sanity check: (number of rows, number of columns) of the training frame
df_for_training.shape

(24548, 57)