In [22]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import numpy as np
import re


In [23]:
companies = ['BAJFINANCE.NS', 'BPCL.NS', 'ULTRACEMCO.NS', 'DIVISLAB.NS', 'LT.NS', 
             'BAJAJ-AUTO.NS', 'TATASTEEL.NS', 'SUNPHARMA.NS', 'ICICIBANK.NS', 'TCS.NS']

In [24]:
company_df = pd.read_csv('useful/company_df.csv', index_col=0)
company_df = company_df[company_df['Symbol'].isin(companies)]
news_df = pd.read_csv('useful/news.csv', index_col=0)
news_df = news_df.dropna().drop_duplicates()
company_df['Symbol'] = company_df['Symbol'].str.replace('.NS', '', regex=False)

for col in company_df.select_dtypes(include=[object]).columns:
    company_df[col] = company_df[col].str.lower()

# Convert all string columns to lowercase for news_df
for col in news_df.select_dtypes(include=[object]).columns:
    news_df[col] = news_df[col].str.lower()


In [25]:
display(news_df, company_df)

Unnamed: 0_level_0,headline
data,Unnamed: 1_level_1
2014-02-14,5 ways to recover from career mistakes
2014-02-14,"government raises tariff value on gold, silver"
2014-02-14,"suzlon q3 net losses narrow to rs 1,075 crore;..."
2014-02-14,"people will teach lesson to bjp, congress: arv..."
2014-02-14,1000-men security deployed for narendra modi's...
...,...
2024-04-26,edelweiss mutual fund launches nifty alpha low...
2024-04-26,avenue supermart shares down 0.3% as nifty drops
2024-04-26,okaya starts pre-booking of ferrato disruptor ...
2024-04-26,"mphasis management on company performance, gro..."


Unnamed: 0,Symbol,Full Name,Name
0,bajaj-auto,bajaj auto ltd.,bajaj auto
8,icicibank,icici bank ltd.,icici bank
13,lt,larsen & toubro ltd.,l&t
24,bpcl,bharat petroleum corporation ltd.,bpcl
28,bajfinance,bajaj finance ltd.,bajaj finance
36,tcs,tata consultancy services ltd.,tcs
41,ultracemco,ultratech cement ltd.,ultratech
44,tatasteel,tata steel ltd.,tata steel
46,divislab,divi's laboratories ltd.,divi's lab
48,sunpharma,sun pharmaceutical industries ltd.,sun pharma


In [26]:

# Create a dictionary to map company names to their symbols
name_to_symbol = dict(zip(company_df['Name'], company_df['Symbol']))

# Filter news that mentions any company name and add the corresponding symbol
def find_symbol_in_title(title):
    for name, symbol in name_to_symbol.items():
        if name in title:
            return symbol
    return None

filtered_news = news_df[news_df['headline'].apply(lambda x: any(name in x for name in company_df['Name']))].copy()
filtered_news['Symbol'] = filtered_news['headline'].apply(find_symbol_in_title)
filtered_news['Symbol'] = filtered_news['Symbol'].str.upper()

In [None]:

filtered_news.to_csv('data/filtered_news.csv')
print(len(filtered_news))

In [27]:
display(filtered_news.head(10))

Unnamed: 0_level_0,headline,Symbol
data,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-02-14,tata steel hopes to maintain good performance ...,TATASTEEL
2014-02-14,"dealing room: bhel, sbi, sun pharma, tech mahi...",SUNPHARMA
2014-02-14,buy sun pharma ltd with a target of rs 635: mi...,SUNPHARMA
2014-02-14,sell bajaj auto with a target of rs 1840: mite...,BAJAJ-AUTO
2014-02-14,expect sun pharma's earnings growth to be abov...,SUNPHARMA
2014-02-15,egypt two-wheeler import ban likely to hit baj...,BAJAJ-AUTO
2014-02-16,"bajaj auto offers rs 10,000 hike to chakan wor...",BAJAJ-AUTO
2014-02-17,sell ultratech cements ltd with target price o...,ULTRACEMCO
2014-02-17,sell bajaj auto ltd with a target of rs 1770: ...,BAJAJ-AUTO
2014-02-18,l&t insurance launches my health medisure supe...,LT


In [28]:
# we make a copy to avoid the SettingWithCopyWarning
filtered_news_1 = filtered_news.copy()


In [29]:
# Assuming filtered_news_1 is loaded as a DataFrame and 'date' is the index
filtered_news_1.reset_index(inplace=True)  # Reset the index to turn 'date' into a column

# Rename the 'data' column to 'date'
filtered_news_1.rename(columns={'data': 'date'}, inplace=True)

# Convert 'date' to datetime if it's not already
filtered_news_1['date'] = pd.to_datetime(filtered_news_1['date'])

In [30]:
filtered_news_1

Unnamed: 0,date,headline,Symbol
0,2014-02-14,tata steel hopes to maintain good performance ...,TATASTEEL
1,2014-02-14,"dealing room: bhel, sbi, sun pharma, tech mahi...",SUNPHARMA
2,2014-02-14,buy sun pharma ltd with a target of rs 635: mi...,SUNPHARMA
3,2014-02-14,sell bajaj auto with a target of rs 1840: mite...,BAJAJ-AUTO
4,2014-02-14,expect sun pharma's earnings growth to be abov...,SUNPHARMA
...,...,...,...
14265,2024-04-26,bajaj finance may maintain 20-25% growth but 4...,BAJFINANCE
14266,2024-04-26,"stocks in news: bajaj finance, vedanta, tech m...",BAJFINANCE
14267,2024-04-26,icici bank q4 result preview: pat likely to ri...,ICICIBANK
14268,2024-04-26,bajaj finance outlook drags sensex 609 points ...,BAJFINANCE


In [31]:


# Cleaning the tweets text
def clean_tweet_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove user mentions
    text = re.sub(r'#', '', text)  # Remove the hashtag symbol but keep the text
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    return text.strip()

# Apply the cleaning function to the tweet body
filtered_news_1['clean_title'] = filtered_news_1['headline'].apply(clean_tweet_text)

# Aggregate tweets per day by counting them
daily_sentiment = filtered_news_1.groupby(['date', 'Symbol']).size().reset_index(name='news_count')


In [32]:
filtered_news_1.head(20)

Unnamed: 0,date,headline,Symbol,clean_title
0,2014-02-14,tata steel hopes to maintain good performance ...,TATASTEEL,tata steel hopes to maintain good performance ...
1,2014-02-14,"dealing room: bhel, sbi, sun pharma, tech mahi...",SUNPHARMA,dealing room bhel sbi sun pharma tech mahindra...
2,2014-02-14,buy sun pharma ltd with a target of rs 635: mi...,SUNPHARMA,buy sun pharma ltd with a target of rs mitesh...
3,2014-02-14,sell bajaj auto with a target of rs 1840: mite...,BAJAJ-AUTO,sell bajaj auto with a target of rs mitesh th...
4,2014-02-14,expect sun pharma's earnings growth to be abov...,SUNPHARMA,expect sun pharmas earnings growth to be above...
5,2014-02-15,egypt two-wheeler import ban likely to hit baj...,BAJAJ-AUTO,egypt twowheeler import ban likely to hit baja...
6,2014-02-16,"bajaj auto offers rs 10,000 hike to chakan wor...",BAJAJ-AUTO,bajaj auto offers rs hike to chakan workers
7,2014-02-17,sell ultratech cements ltd with target price o...,ULTRACEMCO,sell ultratech cements ltd with target price o...
8,2014-02-17,sell bajaj auto ltd with a target of rs 1770: ...,BAJAJ-AUTO,sell bajaj auto ltd with a target of rs mites...
9,2014-02-18,l&t insurance launches my health medisure supe...,LT,lt insurance launches my health medisure super...


In [33]:
daily_sentiment.head(10)

Unnamed: 0,date,Symbol,news_count
0,2014-02-14,BAJAJ-AUTO,1
1,2014-02-14,SUNPHARMA,3
2,2014-02-14,TATASTEEL,1
3,2014-02-15,BAJAJ-AUTO,1
4,2014-02-16,BAJAJ-AUTO,1
5,2014-02-17,BAJAJ-AUTO,1
6,2014-02-17,ULTRACEMCO,1
7,2014-02-18,BPCL,1
8,2014-02-18,LT,1
9,2014-02-19,LT,2


In [34]:


# Load the pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Function to perform sentiment analysis
def sentiment_analysis(texts, model, tokenizer):
    # Tokenize the texts
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    
    # Predict sentiments
    with torch.no_grad():
        outputs = model(**encoded_input)
    
    # Get scores and convert to probabilities using softmax
    scores = outputs.logits.softmax(dim=-1)
    
    # Convert scores to list of dictionaries with sentiment labels
    sentiments = scores.argmax(dim=-1)
    return sentiments.numpy()

# Example usage
example_texts = ["I love this stock, it's going up!", "This company is going bankrupt."]
sentiments = sentiment_analysis(example_texts, model, tokenizer)
print(sentiments)


  _torch_pytree._register_pytree_node(


[2 1]


In [35]:
# Define the sentiment mapping function
def map_sentiment(prediction_index):
    # Define the mapping from indices to sentiment scores
    sentiment_mapping = {0: -1, 1: 0, 2: 1}  # Update this as per model output
    return sentiment_mapping.get(prediction_index, 0)  # Default to neutral

# Apply the sentiment analysis to each tweet and map the results to scores
filtered_news_1['sentiment_index'] = filtered_news_1['clean_title'].apply(
    lambda text: map_sentiment(sentiment_analysis([text], model, tokenizer)[0])
)




In [38]:
# Group by 'post_date' and 'ticker_symbol' to calculate the mean sentiment score for each day
daily_sentiment_index = filtered_news_1.groupby(['date', 'Symbol'])['sentiment_index'].mean().reset_index()

# Display the daily sentiment index
print(daily_sentiment_index.tail(10))

           date      Symbol  sentiment_index
9078 2024-04-23          LT         1.000000
9079 2024-04-24  BAJAJ-AUTO         0.000000
9080 2024-04-24  BAJFINANCE        -1.000000
9081 2024-04-24   SUNPHARMA         1.000000
9082 2024-04-24   TATASTEEL         1.000000
9083 2024-04-24         TCS         1.000000
9084 2024-04-25  BAJFINANCE         0.250000
9085 2024-04-25   ICICIBANK         0.250000
9086 2024-04-26  BAJFINANCE         0.428571
9087 2024-04-26   ICICIBANK         0.000000


In [39]:
daily_sentiment_index.shape

(9088, 3)

In [40]:
daily_sentiment_index.to_csv('data_final/daily_sentiment_index.csv', index=False)

In [41]:
filtered_news_1.to_csv('filtered_news_1_sentiment_result.csv', index=False)

In [44]:
print(daily_sentiment_index.tail(50))

           date      Symbol  sentiment_index
9038 2024-04-02  ULTRACEMCO        -1.000000
9039 2024-04-03        BPCL         1.000000
9040 2024-04-03         TCS         0.000000
9041 2024-04-04        BPCL         1.000000
9042 2024-04-04   ICICIBANK         1.000000
9043 2024-04-04          LT         1.000000
9044 2024-04-04         TCS        -1.000000
9045 2024-04-05  BAJFINANCE         1.000000
9046 2024-04-05          LT        -1.000000
9047 2024-04-08   ICICIBANK         1.000000
9048 2024-04-08   TATASTEEL         1.000000
9049 2024-04-09          LT         1.000000
9050 2024-04-09         TCS        -1.000000
9051 2024-04-10   ICICIBANK         1.000000
9052 2024-04-10   TATASTEEL         1.000000
9053 2024-04-10         TCS        -1.000000
9054 2024-04-12          LT         0.000000
9055 2024-04-12         TCS         0.428571
9056 2024-04-13         TCS         0.000000
9057 2024-04-14   ICICIBANK        -1.000000
9058 2024-04-14   SUNPHARMA         0.000000
9059 2024-

In [43]:
filtered_news_1.tail()

Unnamed: 0,date,headline,Symbol,clean_title,sentiment_index
14265,2024-04-26,bajaj finance may maintain 20-25% growth but 4...,BAJFINANCE,bajaj finance may maintain growth but xx one ...,-1
14266,2024-04-26,"stocks in news: bajaj finance, vedanta, tech m...",BAJFINANCE,stocks in news bajaj finance vedanta tech mahi...,1
14267,2024-04-26,icici bank q4 result preview: pat likely to ri...,ICICIBANK,icici bank q result preview pat likely to rise...,-1
14268,2024-04-26,bajaj finance outlook drags sensex 609 points ...,BAJFINANCE,bajaj finance outlook drags sensex points dow...,0
14269,2024-04-26,"f&o stocks: bajaj finserv, bajaj finance among...",BAJFINANCE,fo stocks bajaj finserv bajaj finance among s...,1
