In [1]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping




In [2]:
df=pd.read_excel('dataset_telegram.xlsx')
df.head(5)

Unnamed: 0,DATE,HEADLINE,TICKER,COMPANY NAME,SECTOR,SUBSECTOR,SERIES,DATE OF LISTING,PAID UP VALUE,MARKET LOT,...,close,volume,adj close,avg High,avg low,avg open,avg close,avg volume,avg adj close,trend
0,2022-02-15,Religare Ent Says\nâ–¶ï¸Religare Finvest vict...,RELIGARE,Religare Enterprises Limited,Finance - Investments,Finance - Investments,EQ,2007-11-21,10,1,...,122.050003,122.050003,910705,159.109998,159.779999,152.860001,156.13,156.13,686379.6,bearish
1,2022-02-15,Ashok Leyland targets 40% ICV sales from CNG v...,ASHOKLEY,Ashok Leyland Limited,Auto - LCVs & HCVs,Auto - LCVs & HCVs,EQ,1995-05-25,1,1,...,128.5,127.602028,15896279,159.329999,161.720001,156.199997,158.689999,157.581058,16145476.0,bearish
2,2022-02-15,Cipla promoters plan to sell 2.5% of stock,CIPLA,Cipla Limited,Pharmaceuticals,Pharmaceuticals,EQ,1995-02-08,2,1,...,921.849976,917.393127,8304774,1135.429993,1155.719995,1129.699988,1145.319995,1139.782751,3662772.4,bullish
3,2022-02-15,"2.05 cr shares of Cipla worth â‚¹1,880 cr trad...",CIPLA,Cipla Limited,Pharmaceuticals,Pharmaceuticals,EQ,1995-02-08,2,1,...,921.849976,917.393127,8304774,1135.429993,1155.719995,1129.699988,1145.319995,1139.782751,3662772.4,bullish
4,2022-02-15,NMDC in focus\n\nIron Ore falls 10% on SGX as ...,NMDC,NMDC Limited,Mining & Minerals,Mining & Minerals,EQ,2008-03-03,1,1,...,149.300003,143.697586,17659797,186.379999,189.089999,182.490005,185.719998,178.750937,18851958.2,bullish


In [3]:
df.dropna(inplace=True)
df.shape

(5386, 29)

In [4]:
data = df.iloc[:,1:6]
data.replace("[^a-zA-Z0-9\s]"," ",regex=True,inplace=True)
data.dropna(inplace=True)
data.shape

(5386, 5)

# Combine Columns

In [5]:
corpus = []
for row in range(0,len(data.index)):
    corpus.append(' '.join(str(x) for x in data.iloc[row,0:5]))
corpus

['Religare Ent Says\n      Religare Finvest victim of fraud perpetuated by erstwhile mgmt\n      Have challenged process of RFL being classified as    fraud    account \n      RBI on Feb 11 said RFL restructuring can   t be implemented with REL being a promoter as lenders declared RFL a fraud exposure RELIGARE Religare Enterprises Limited Finance   Investments Finance   Investments',
 'Ashok Leyland targets 40  ICV sales from CNG variants ASHOKLEY Ashok Leyland Limited Auto   LCVs   HCVs Auto   LCVs   HCVs',
 'Cipla promoters plan to sell 2 5  of stock CIPLA Cipla Limited Pharmaceuticals Pharmaceuticals',
 '2 05 cr shares of Cipla worth    1 880 cr trade on NSE at average    915 sh\n\nAlert  Promoters likely to have sold a stake in Cipla via block deal CIPLA Cipla Limited Pharmaceuticals Pharmaceuticals',
 'NMDC in focus\n\nIron Ore falls 10  on SGX as China fights to rein in prices NMDC NMDC Limited Mining   Minerals Mining   Minerals',
 '  Manappuram under pressure after a weak set o

In [6]:
y = pd.get_dummies(df['trend']).iloc[:,1].values
y

array([False, False,  True, ..., False, False,  True])

# One-hot Representation

In [7]:
vocab_size = 10000
onehot_reprs=[one_hot(words,vocab_size)for words in corpus]
print(onehot_reprs)

[[9127, 3574, 514, 9127, 9702, 7475, 5189, 4543, 9087, 1273, 2512, 3619, 9817, 504, 71, 5189, 4165, 9468, 3331, 1373, 4543, 3852, 7159, 8664, 7751, 3270, 9518, 4165, 1196, 1507, 7125, 5277, 4133, 9360, 4145, 9468, 9537, 2304, 1373, 1453, 8592, 4165, 9537, 4543, 4123, 9127, 9127, 4953, 2223, 8214, 5323, 8214, 5323], [2298, 7053, 7608, 5176, 838, 2091, 6931, 1023, 1499, 8262, 2298, 7053, 2223, 1737, 9356, 428, 1737, 9356, 428], [6046, 3949, 3089, 3657, 4082, 1186, 419, 5189, 2091, 6046, 6046, 2223, 8542, 8542], [1186, 5508, 7571, 245, 5189, 6046, 851, 2241, 8328, 7571, 7918, 8664, 2023, 9577, 9664, 8035, 6940, 9887, 3949, 4982, 3657, 9817, 4407, 9537, 8304, 3064, 6046, 3604, 1699, 7943, 6046, 6046, 2223, 8542, 8542], [8522, 3064, 864, 5372, 80, 2857, 740, 8664, 5637, 1373, 2440, 6614, 3657, 4613, 3064, 3365, 8522, 8522, 2223, 543, 5839, 543, 5839], [5338, 2650, 2652, 6951, 9537, 8838, 5044, 5189, 8825, 3064, 9741, 5338, 5338, 8214, 2223, 8214, 8214], [1506, 8343, 1041, 9707, 5521, 1273, 

In [8]:
sent_len=100
emb=pad_sequences(onehot_reprs,padding='pre',maxlen=sent_len)
print(emb[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 9127 3574  514 9127 9702 7475 5189 4543 9087
 1273 2512 3619 9817  504   71 5189 4165 9468 3331 1373 4543 3852 7159
 8664 7751 3270 9518 4165 1196 1507 7125 5277 4133 9360 4145 9468 9537
 2304 1373 1453 8592 4165 9537 4543 4123 9127 9127 4953 2223 8214 5323
 8214 5323]


# Building LSTM Model

In [9]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=100,input_length=sent_len))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1042305 (3.98 MB)
Trainable params: 1042305 (3.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
X_train, X_test, y_train, y_test = train_test_split(emb, y, test_size=0.25, random_state=42)

In [12]:
model.fit(X_train, y_train, epochs=10, verbose=1, batch_size=64)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1ef12624410>

In [13]:
# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print("LSTM Model Accuracy:", accuracy)

LSTM Model Accuracy: 0.5746102333068848
