In [4]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import pandas as pd
from keras.models import load_model

In [2]:
MAX_NB_WORDS = 50000  
MAX_SEQUENCE_LENGTH = 100  
VALIDATION_SPLIT = 0.2  
EMBEDDING_DIM = 100  
LSTM_UNITS = 64  
BATCH_SIZE = 32  
EPOCHS = 10 

In [4]:
df = pd.read_csv('../Dataset/df5.csv')
df.head()

Unnamed: 0,Text,Sentiment
0,"According to Gran , the company has no plans t...",2
1,Technopolis plans to develop in stages an area...,2
2,The international electronic industry company ...,-1
3,With the new production plant the company woul...,1
4,According to the company 's updated strategy f...,1


In [7]:
positive = []
negative = []
neutral = []
for i in range(len(df)):
    if df['Sentiment'][i] == 1:
        positive.append(df['Text'][i])
    elif df['Sentiment'][i] == 2:
        neutral.append(df['Text'][i])
    elif df['Sentiment'][i] == -1:
        negative.append(df['Text'][i])
    else:
        print('error')

print("Positive: ",len(positive))
print("Negative: ",len(negative))
print("Neutral: ",len(neutral))


Positive:  5048
Negative:  2710
Neutral:  2879


In [34]:
print(positive[0])
print(negative[0])
print(neutral[0])

With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .
According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .


In [8]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

In [9]:
tokenizer.fit_on_texts(positive + negative + neutral) 

In [10]:
sequence_poitive = tokenizer.texts_to_sequences(positive)
sequence_negative = tokenizer.texts_to_sequences(negative)
sequence_neutral = tokenizer.texts_to_sequences(neutral)

In [11]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 17001 unique tokens.


In [12]:
data = pad_sequences(sequence_poitive + sequence_negative + sequence_neutral , maxlen=MAX_SEQUENCE_LENGTH)

In [13]:
labels = np.concatenate([np.ones(len(positive)), -np.ones(len(negative)),np.full(len(neutral),2)])
labels = to_categorical(labels)

In [14]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (10637, 100)
Shape of label tensor: (10637, 3)


In [15]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [16]:
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [22]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(LSTM_UNITS, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(3, activation='softmax'))

In [23]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
checkpoint = ModelCheckpoint('news_model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
callbacks = [checkpoint]

In [27]:
history = model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(x_val, y_val), callbacks=callbacks)

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.77057, saving model to news_model.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.77057
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.77057
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.77057
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.77057
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.77057
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.77057
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.77057
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.77057
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.77057


In [29]:
model = load_model('news_model.h5')

In [36]:
sentence = "Reliance industries goes bankrupt"
sequence = tokenizer.texts_to_sequences([sentence])
data = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
result = model.predict(data)
print(np.argmax(result))

2


In [63]:
df1 = pd.read_csv('../Dataset/T/train.csv',encoding='latin-1')
df1.head()
df1.text = df1.text.astype(str)
df1.selected_text = df1.selected_text.astype(str)
df1.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [64]:
positive = []
negative = []
neutral = []
for i in range(len(df1)):
    if df1['sentiment'][i] == "positive":
        positive.append(df1['selected_text'][i])
    elif df1['sentiment'][i] == "neutral":
        neutral.append(df1['selected_text'][i])
    elif df1['sentiment'][i] == "negative":
        negative.append(df1['selected_text'][i])
    else:
        print('error')

print("Positive: ",len(positive))
print("Negative: ",len(negative))
print("Neutral: ",len(neutral))

Positive:  8582
Negative:  7781
Neutral:  11118


In [65]:
print(positive[0])

fun


In [66]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

In [67]:
tokenizer.fit_on_texts(positive + negative + neutral)

In [68]:
sequence_poitive = tokenizer.texts_to_sequences(positive)
sequence_negative = tokenizer.texts_to_sequences(negative)
sequence_neutral = tokenizer.texts_to_sequences(neutral)

In [69]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 17819 unique tokens.


In [70]:
data = pad_sequences(sequence_poitive + sequence_negative + sequence_neutral , maxlen=MAX_SEQUENCE_LENGTH)

In [95]:
labels = np.concatenate([np.ones(len(positive)), np.full(len(negative),-1),np.full(len(neutral),2)])

labels = to_categorical(labels)
print(labels)



[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [96]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (27481, 100)
Shape of label tensor: (27481, 3)


In [97]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [102]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Bidirectional(LSTM(LSTM_UNITS, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [103]:
checkpoint = ModelCheckpoint('news_model_4.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
callbacks = [checkpoint]

In [None]:
history = model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(x_val, y_val), callbacks=callbacks)

In [4]:
from GoogleNews import GoogleNews
import pandas as pd
import requests

In [7]:
requests.get('https://www.google.com/', headers = {'User-agent': 'your bot 0.1'})
googlenews=GoogleNews(start='8/01/2023',end='8/04/2023')
googlenews.search('Punjab National Bank')
result=googlenews.result()
dfn=pd.DataFrame(result)
dfn.head()

HTTP Error 429: Too Many Requests


In [7]:

titles = []
for i in range(len(dfn)):
    titles.append(dfn['desc'][i])
    

titles.append("Reliance industries goes bankrupt")
print(titles)

print(len(titles))


['LUCKNOW: The Central Bureau of Investigation (CBI) on Tuesday arrested a branch manager and a field officer, both working at Punjab National Bank in Uttar...', 'The Central Bureau of Investigation has arrested a Branch Manager and a Field Officer, both working in Punjab National Bank, Khekra, Bagpat(Uttar Pradesh)...', 'The Central Bureau of Investigation has arrested a Branch Manager and a Field Officer of Punjab National Bank, Bagpat in Uttar Pradesh for demanding and...', 'The current trading price of PNB is 61.2 Rs, which is lower than the S2 level of 62.816666 Rs. 12:18 PM IST. Punjab National Bank Share Price Live Updates:...', 'Punjab National Bank (PNB) is conducting an electronic auction (e-auction) for mortgage properties on 3 August 2023. The properties offered by the PNB...', 'Your account may be restricted. Update KYC soon. Check last date. Punjab National Bank KYC Update News (Last Date, Documents Required): PNB has asked its...', 'Punjab National Bank (PNB) Share Price

In [113]:
def getPred(sentence):
    sequence = tokenizer.texts_to_sequences([sentence])
    data = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
    result = model.predict(data)
    return np.argmax(result)


In [115]:
for title in titles:
    print(getPred(title))

2
2
2
1
2
2
2
2
2
2


In [10]:
from textblob import TextBlob

for title in titles:
    print(TextBlob(title).sentiment)




Sentiment(polarity=0.0, subjectivity=0.25)
Sentiment(polarity=0.0, subjectivity=0.25)
Sentiment(polarity=0.0, subjectivity=0.25)
Sentiment(polarity=0.06818181818181818, subjectivity=0.45)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.06666666666666667)
Sentiment(polarity=0.21428571428571427, subjectivity=0.42857142857142855)
Sentiment(polarity=-0.1875, subjectivity=0.3875)
Sentiment(polarity=0.0, subjectivity=0.0)
Sentiment(polarity=0.0, subjectivity=0.0)


In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [12]:
sentiment = SentimentIntensityAnalyzer()

In [16]:
for title in titles:
    sent = sentiment.polarity_scores(title)["pos"]
    print(sent)

0.0
0.0
0.0
0.07
0.0
0.0
0.145
0.113
0.174
0.0
