In [78]:
!pip install clean-text

Collecting clean-text
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting emoji<2.0.0,>=1.0.0
  Downloading emoji-1.7.0.tar.gz (175 kB)
     -------------------------------------- 175.4/175.4 kB 3.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting ftfy<7.0,>=6.0
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
     ---------------------------------------- 53.1/53.1 kB ? eta 0:00:00
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py): started
  Building wheel for emoji (setup.py): finished with status 'done'
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171032 sha256=deab6e50cb087cbf0735d93375ffe6ffbfde69c131504fe11ec3cf4619a85220
  Stored in directory: c:\users\cloudseals\appdata\local\pip\cache\wheels\fa\7a\e9\22dd0515e1bad255e51663ee513a2fa839c95934c5fc301090
Successfully built emoji
Installing collected packages: emoji, ftfy, clean-

In [1]:
import requests
from langdetect import detect
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
import pickle
import json
import plotly.express as px
import yfinance as yf
# NLTK VADER for sentiment analysis
import dateparser
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from cleantext import clean

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\CLOUDSEALS\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Data Gathering

In [2]:

URL = 'https://www1.nseindia.com/content/indices/ind_nifty50list.csv'
df = pd.read_csv(URL, index_col = 'Company Name')


In [4]:

News_data = pd.DataFrame()
for stock_symbol in list(df['Symbol']):
# Set the stock symbol for which you want to fetch the news    
    # Set the Google News API endpoint URL and parameters
    api_url = 'https://newsapi.org/v2/everything'
    params = {
        'q': stock_symbol,
        'sortBy': 'publishedAt',
        'language': 'en',
        'apiKey': '2abb9ff888b64e3eb54f292f42e8da91' # Replace with your API key
    }

    # Make a GET request to the API endpoint with the specified parameters
    response = requests.get(api_url, params=params)

    # Parse the JSON response content
    news_data = json.loads(response.content)

    # Print the latest news articles related to the stock symbol
    for article in news_data['articles']:
        article.pop('source')
        data = pd.DataFrame(article, index=[0])
        News_data = pd.concat([News_data,data])



In [5]:
News_data.shape

(1974, 7)

# Data Preprocessing 

In [6]:
News_data= News_data.reset_index()

In [7]:
News_data['content'] = [News_data['content'][i][:-15] for i in range(len(News_data['content']))]

In [8]:
for column in News_data.columns:
    News_data[column] =News_data[column].astype(str)
    

# Get scores using NLTk sentiment intensity analyzer

In [9]:
def score_news(News_data):
    # Instantiate the sentiment intensity analyzer
    vader = SentimentIntensityAnalyzer()
    News_data['News'] = News_data['title']+ News_data['description'] + News_data['content']
    News_data = News_data.drop(['title','description','content'],axis=1)
    # Iterate through the headlines and get the polarity scores using vader
    scores = News_data['News'].apply(vader.polarity_scores).tolist()

    # Convert the 'scores' list of dicts into a DataFrame
    scores_df = pd.DataFrame(scores)
    News_data = News_data.sort_values('publishedAt')
    # Join the DataFrames of the news and the list of dicts
    parsed_and_scored_news = News_data.join(scores_df, rsuffix='_right')        
    parsed_and_scored_news = parsed_and_scored_news.set_index('publishedAt')             
    parsed_and_scored_news = parsed_and_scored_news.rename(columns={"compound": "sentiment_score"})

    return parsed_and_scored_news

In [10]:

News_data_lable = score_news(News_data)

In [11]:
News_data_lable.columns

Index(['index', 'author', 'url', 'urlToImage', 'News', 'neg', 'neu', 'pos',
       'sentiment_score'],
      dtype='object')

In [12]:
News_data_lable = News_data_lable.drop(['index', 'author', 'url', 'urlToImage','neg','neu','pos'],axis=1)

In [13]:
News_data_lable['Label'] = [1 if i>0 else 0 for i in list(News_data_lable['sentiment_score'])]

In [14]:
headlines = News_data_lable['News']


In [15]:
headlines

publishedAt
2023-02-19T06:09:15Z    Upgrade error from mysql 5.7 to 8.0Добрый день...
2023-02-19T12:54:48Z    Trade setup for Monday: Top 15 things to know ...
2023-02-19T15:54:27Z    Capgemini: Going 'LONG' On EU Quality For 2023...
2023-02-19T19:32:29Z    Forecasters warn 800 mile-wide 'Greenland Barr...
2023-02-19T20:46:57Z    Power restored to all homes after Storm Otto m...
                                              ...                        
2023-03-19T06:05:36Z    django.core.serializers.base.DeserializationEr...
2023-03-19T06:07:59Z    Soldier recalls horror of squad shooting dead ...
2023-03-19T06:10:08Z    Delhi Lt Governor Flags Off G20 Cyclothon Rall...
2023-03-19T06:14:02Z    TEUCER M2-LD02 PCIe NVMe M.2 2280 SSD Double S...
2023-03-19T06:18:00Z    In line with new govt rules, Reliance re-aucti...
Name: News, Length: 1974, dtype: object

In [16]:
headlines = [clean(headline.lower()) for headline in headlines] 

In [17]:
News_data_lable['News']=headlines

# Feature extraction from text 

In [18]:
from sklearn.feature_extraction.text import CountVectorizer


In [19]:
## implement BAG OF WORDS
countvector=CountVectorizer(ngram_range=(2,2))
X=countvector.fit_transform(headlines)
Y=News_data_lable['Label']

# Train - Test Split

In [20]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)


#  model selection

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# implement RandomForest Classifier
Model=LogisticRegression()
Model.fit(X_train,y_train)

LogisticRegression()

## Test data preprocessing and Prediction

In [23]:
## Predict for the Test Dataset

predictions = Model.predict(X_test)

## Evaluation of the model

In [24]:
## Import library to check accuracy
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [25]:
matrix=confusion_matrix(y_test,predictions)
print(matrix)
score=accuracy_score(y_test,predictions)
print(score)
report=classification_report(y_test,predictions)
print(report)

[[ 80  63]
 [ 14 238]]
0.8050632911392405
              precision    recall  f1-score   support

           0       0.85      0.56      0.68       143
           1       0.79      0.94      0.86       252

    accuracy                           0.81       395
   macro avg       0.82      0.75      0.77       395
weighted avg       0.81      0.81      0.79       395



In [32]:
import joblib
#Save model in h5 file
filename = "Completed_model.joblib"
vector_filename = "vector_filename.joblib"
joblib.dump(Model, filename)
joblib.dump(countvector,vector_filename)
# Recreate the exact same model, including its weights and the optimizer
loaded_model = joblib.load(filename)
loaded_Vector = joblib.load(vector_filename)


In [27]:
News_data = pd.DataFrame()
api_url = 'https://newsapi.org/v2/everything'
params = {
    'q': 'JSL',
    'sortBy': 'publishedAt',
    'language': 'en',
    'apiKey': '2abb9ff888b64e3eb54f292f42e8da91' # Replace with your API key
}

# Make a GET request to the API endpoint with the specified parameters
response = requests.get(api_url, params=params)

# Parse the JSON response content
news_data = json.loads(response.content)

# Print the latest news articles related to the stock symbol
for article in news_data['articles']:
    article.pop('source')
    data = pd.DataFrame(article, index=[0])
    News_data = pd.concat([News_data,data])

News_data= News_data.reset_index()
News_data['content'] = [News_data['content'][i][:-15] for i in range(len(News_data['content']))]
for column in News_data.columns:
    News_data[column] =News_data[column].astype(str)
 


News_data_lable = score_news(News_data)
News_data_lable = News_data_lable.drop(['index', 'author', 'url', 'urlToImage','neg','neu','pos'],axis=1)
News_data_lable['Label'] = [1 if i>0 else 0 for i in list(News_data_lable['sentiment_score'])]
headlines = News_data_lable['News']
headlines = [clean(headline.lower()) for headline in headlines] 
News_data_lable['News']=headlines
#countvector=CountVectorizer(ngram_range=(2,2))
X=loaded_Vector.fit_transform(headlines)
Y=News_data_lable['Label']

In [38]:
## implement BAG OF WORDS
#countvector=CountVectorizer(ngram_range=(2,2))
X1=loaded_Vector.transform(headlines)
Y=News_data_lable['Label']

In [39]:
predictions1 = loaded_model.predict(X1)

In [40]:
predictions1 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [35]:
matrix=confusion_matrix(Y,predictions1)
print(matrix)
score=accuracy_score(Y,predictions1)
print(score)
report=classification_report(Y,predictions1)
print(report)

[[ 1  9]
 [ 0 14]]
0.625
              precision    recall  f1-score   support

           0       1.00      0.10      0.18        10
           1       0.61      1.00      0.76        14

    accuracy                           0.62        24
   macro avg       0.80      0.55      0.47        24
weighted avg       0.77      0.62      0.52        24



In [36]:
predictions1

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

## Best Model Selection

In [64]:
predictions

array([1, 1, 1, 1, 1], dtype=int64)

In [96]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Define the models to be trained
models = [LogisticRegression(), SVC(), RandomForestClassifier(), GradientBoostingClassifier(), KNeighborsClassifier(), DecisionTreeClassifier()]
model_names = ["Logistic Regression", "Support Vector Classifier", "Random Forest", "Gradient Boosting", "K-Nearest Neighbors", "Decision Tree"]

X_train, X_val, y_train, y_val

results=[]
# Train each model and get predictions
for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    
    # Calculate evaluation metrics for each model
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions)
    recall = recall_score(y_val, predictions)
    f1 = f1_score(y_val, predictions)
    
    # Store the results of each model in the results list
    results.append((model_name, accuracy, precision, recall, f1))
    
# Print the results
for result in results:
    print("Model: {}\nAccuracy: {:.2f}%\nPrecision: {:.2f}%\nRecall: {:.2f}%\nF1 Score: {:.2f}%\n".format(result[0], result[1]*100, result[2]*100, result[3]*100, result[4]*100))


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Model: Logistic Regression
Accuracy: 83.42%
Precision: 80.62%
Recall: 95.49%
F1 Score: 87.43%

Model: Support Vector Classifier
Accuracy: 80.69%
Precision: 76.43%
Recall: 98.36%
F1 Score: 86.02%

Model: Random Forest
Accuracy: 78.96%
Precision: 75.40%
Recall: 96.72%
F1 Score: 84.74%

Model: Gradient Boosting
Accuracy: 76.49%
Precision: 73.07%
Recall: 96.72%
F1 Score: 83.25%

Model: K-Nearest Neighbors
Accuracy: 65.10%
Precision: 90.55%
Recall: 47.13%
F1 Score: 61.99%

Model: Decision Tree
Accuracy: 78.47%
Precision: 77.74%
Recall: 90.16%
F1 Score: 83.49%



# Save Model and Reuse

In [119]:
import joblib
#Save model in h5 file
filename = "Completed_model.joblib"
joblib.dump(model, filename)
# Recreate the exact same model, including its weights and the optimizer
loaded_model = joblib.load(filename)


# Sentment analysis using BERT finetuning

In [20]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from datetime import datetime
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Dense, Input

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import (BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification,
                          CamembertTokenizer, CamembertForSequenceClassification, TFCamembertForSequenceClassification)

# Load tokenizer

In [21]:
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

# Load transformers model

In [22]:
transformers_model = TFCamembertForSequenceClassification.from_pretrained('jplu/tf-camembert-base', num_labels=2)

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/545M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

Some layers of TFCamembertForSequenceClassification were not initialized from the model checkpoint at jplu/tf-camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
transformers_model.summary()

Model: "tf_camembert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFCamembertMainLay  multiple                 110031360 
 er)                                                             
                                                                 
 classifier (TFCamembertClas  multiple                 592130    
 sificationHead)                                                 
                                                                 
Total params: 110,623,490
Trainable params: 110,623,490
Non-trainable params: 0
_________________________________________________________________


In [34]:
News_data_lable['sent_len']= News_data_lable['News'].apply(lambda x: len(x.split(" ")))
max_seq_len = np.round(News_data_lable['sent_len'].mean() + 2 * News_data_lable['sent_len'].std()).astype(int)
max_seq_len

102

In [35]:
input_sequences = []
# The attention mask is an optional argument used when batching sequences together.
# The attention mask is a binary tensor indicating the position of the padded indices so that the model does not attend to them.
attention_masks = []

for text in tqdm_notebook(News_data_lable['News']):
    sequence_dict = tokenizer.encode_plus(text, max_length=max_seq_len, pad_to_max_length=True)
    input_ids = sequence_dict['input_ids']
    att_mask = sequence_dict['attention_mask']

    input_sequences.append(input_ids)
    attention_masks.append(att_mask)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for text in tqdm_notebook(News_data_lable['News']):


  0%|          | 0/2025 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [36]:
print(input_sequences[0])
print(attention_masks[0])


[5, 343, 92, 906, 640, 204, 92, 52, 11152, 110, 5225, 9, 29129, 253, 5349, 8476, 472, 906, 5406, 21746, 204, 1938, 18, 2408, 108, 11163, 669, 1280, 793, 996, 52, 11152, 110, 26, 234, 3058, 91, 4152, 442, 5472, 1895, 7, 5467, 13545, 10, 1782, 9477, 24537, 35, 9, 1723, 15502, 970, 816, 33, 10302, 10884, 91, 669, 7395, 2509, 67, 3047, 657, 9360, 133, 2509, 9, 310, 122, 10, 11152, 110, 17234, 10, 11152, 110, 17234, 4473, 10, 11387, 4480, 6434, 3770, 10, 16396, 33, 15267, 155, 105, 2446, 343, 185, 636, 816, 5444, 88, 9, 5467, 13545, 10, 6]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [37]:
labels = News_data_lable['Label'].values

# Train Test Split

In [38]:
X_train, X_test, y_train, y_test, att_masks_train, att_masks_test = (
    train_test_split(input_sequences, labels, attention_masks, random_state=42, test_size=0.2)
)

In [39]:
X_train = tf.constant(X_train)
X_test = tf.constant(X_test)

y_train = tf.constant(y_train)
y_test = tf.constant(y_test)

att_masks_train = tf.constant(att_masks_train)
att_masks_test = tf.constant(att_masks_test)

In [40]:
print(f'Train | X shape: {X_train.shape}, att_mask shape: {att_masks_train.shape}, y shape: {y_train.shape}')
print(f'Test | X shape: {X_test.shape}, att_mask shape: {att_masks_test.shape}, y shape: {y_test.shape},')

Train | X shape: (1620, 102), att_mask shape: (1620, 102), y shape: (1620,)
Test | X shape: (405, 102), att_mask shape: (405, 102), y shape: (405,),


In [41]:
def create_model():
    model = TFCamembertForSequenceClassification.from_pretrained('jplu/tf-camembert-base', num_labels=2)
    
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    opt = tf.keras.optimizers.Adam(lr=2e-5)
  
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=['accuracy'])
  
    return model

In [42]:
model = create_model()
model.summary()

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

Some layers of TFCamembertForSequenceClassification were not initialized from the model checkpoint at jplu/tf-camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_camembert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFCamembertMainLay  multiple                 110031360 
 er)                                                             
                                                                 
 classifier (TFCamembertClas  multiple                 592130    
 sificationHead)                                                 
                                                                 
Total params: 110,623,490
Trainable params: 110,623,490
Non-trainable params: 0
_________________________________________________________________


# Test Model before finetune

In [43]:
loss, metric = model.evaluate([X_test, att_masks_test], y_test, batch_size=32, verbose=0)
print(f"Loss before training: {loss:.4f}, Accuracy before training: {metric:.2%}")

Loss before training: 0.6978, Accuracy before training: 40.25%


# Finetune Model

In [None]:
history = model.fit([X_train, att_masks_train], y_train, batch_size=32, epochs=10, validation_data=([X_test, att_masks_test], y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 4/51 [=>............................] - ETA: 25:55 - loss: 0.6485 - accuracy: 0.6562

# prediction

In [None]:
def predict(text):
    # pre-process text
    encoded_text = tokenizer.encode(text)

    input_ = tf.expand_dims(encoded_text, 0)

    logits = model(input_)[0][0]
    pred = tf.nn.softmax(logits)
    
    return pred

In [None]:
text = "Adani Green Energy Ltd has lost 14.81% over last one month compared to 0.25% fall in S&P BSE Utilities index and 2.48% drop in the SENSEX"
predict(text)