In [1]:
# Importing the necessary libraries

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
from string import punctuation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
%matplotlib inline
import os

In [2]:
df1 = pd.read_csv('finance_dataset/positive_sentiment.csv')
df2 = pd.read_csv('finance_dataset/negative_sentiment.csv')

df2.label = 0

df = pd.concat([df1, df2], ignore_index=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5989 entries, 0 to 5988
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5989 non-null   object
 1   label   5989 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 93.7+ KB


In [4]:
df.label.value_counts()

label
1    4526
0    1463
Name: count, dtype: int64

In [5]:
df.head()

Unnamed: 0,text,label
0,britam pretax profit dips 80 due to 2016 reval...,1
1,nys to provide sh50 psv services in nairobi ro...,1
2,kenya welcomes sh101b renewable energy facilit...,1
3,kcb to fly two lucky customers to watch fifa w...,1
4,41 of kenyans depend on raw water sources shoa...,1


In [6]:
# Function which performs tokenization, lemmatization and removes stop words and punctuations
# This step is important as it removes any unnecessary data

def preprocess(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words and word not in punctuation]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return " ".join(lemmatized_tokens)

In [7]:
nltk.download('stopwords')  
nltk.download('wordnet')     
nltk.download('averaged_perceptron_tagger') 
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AUDRIK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AUDRIK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\AUDRIK\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [8]:
df['processed_text'] = df.text.apply(preprocess)

In [9]:
# Splitting the data into training, validation and testing data

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

smote = SMOTE()
X_train, X_temp, y_train, y_temp = train_test_split(df.processed_text, df.label, test_size = 0.35, stratify=df.label, random_state = 123)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size = 0.35, stratify=y_temp, random_state = 123)

In [10]:
# Tokenization of the data and padding to ensure that the length of the arrays is identical

max_features = 1000
tokenizer = Tokenizer(num_words = max_features, split = ' ')
tokenizer.fit_on_texts(X_train)

training_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(training_sequences, maxlen = 500, padding = 'post')

validation_sequences = tokenizer.texts_to_sequences(X_val)
X_val_padded = pad_sequences(validation_sequences, maxlen = 500, padding = 'post')

testing_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(testing_sequences, maxlen = 500, padding = 'post')

In [11]:
print(X_train)

1957    oil discovered kenya recoverable deposit surpa...
3923                                               wti 51
5031     nigeria loses 100 billion revenue attack cut oil
2064    east africa aim regional stock exchange within...
303           rising need conveniencefocused business via
                              ...                        
2982    ivory coast raise 750m 10year bond sale deal m...
2477    ghana cut key interest rate halfpoint inflatio...
3058    fund available spending public service grow av...
1488    uganda cut benchmark lending rate 90 country 1...
5783    junction mall statement regarding decision eje...
Name: processed_text, Length: 3892, dtype: object


In [12]:
# Using SMOTE to handle imbalance in the data by synthetically creating values for the minority class

X_train_sm, y_train_sm = smote.fit_resample(X_train_padded, y_train)
X_val_sm, y_val_sm = smote.fit_resample(X_val_padded, y_val)
X_test_sm, y_test_sm = smote.fit_resample(X_test_padded, y_test)

In [13]:
print(y_train_sm.value_counts(), y_val_sm.value_counts(), y_test_sm.value_counts())

label
1    2941
0    2941
Name: count, dtype: int64 label
1    555
0    555
Name: count, dtype: int64 label
1    1030
0    1030
Name: count, dtype: int64


In [14]:
# Creating the model and adding the necessary layers

embedding_dim = 35
max_len = 500
total_words = len(tokenizer.word_index) + 1

model = keras.Sequential([
    keras.layers.Embedding(total_words, embedding_dim, input_length=max_len),
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True, recurrent_activation='sigmoid')), 
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True, recurrent_activation='sigmoid')),  
    keras.layers.Bidirectional(keras.layers.LSTM(64, recurrent_activation='sigmoid')),
    keras.layers.Dense(64, activation='sigmoid'),
    keras.layers.Dense(64, activation='sigmoid'),
    keras.layers.Dense(1, activation='sigmoid') 
])

model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics = ['accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 35)           256200    
                                                                 
 bidirectional (Bidirectiona  (None, 500, 128)         51200     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 500, 128)         98816     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                        

In [15]:
model.fit(X_train_sm, y_train_sm, validation_data = (X_val_sm, y_val_sm), epochs = 25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x2ef9931d0a0>

In [16]:
model.evaluate(X_test_padded, y_test)



[1.0123087167739868, 0.7190021872520447]

In [17]:
# Creating a function which will be used to predict the sentiment for the input text

def predict(text, tokenizer, model):
    # Preprocess the text
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words and word not in punctuation]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    processed_text = " ".join(lemmatized_tokens)
    
    # Convert the processed text into a sequence of integers using the trained tokenizer
    sequences = tokenizer.texts_to_sequences([processed_text])  # Note the brackets
    predict_padded = pad_sequences(sequences, maxlen=500, padding='post')

    # Model prediction
    predicted_sentiment = model.predict(predict_padded)
    
    # Convert probability to binary class (0 or 1)
    predicted_class = int(np.round(predicted_sentiment[0][0]))  # Round to get either 0 or 1
    
    if (predicted_class == 0):
        return "Negative"
    else:
        return "Positive"

In [18]:
text = "The outlook is not looking great"

predict(text, tokenizer, model)



'Negative'

In [19]:
predict("The value of the stocks are going up and people are investing",  tokenizer, model)



'Positive'

In [21]:
import os

if not os.path.exists("finance_dataset"):
    os.makedirs("finance_dataset")

model_versions = [i for i in os.listdir("finance_dataset") if i.isdigit()]

model_version = max([int(i) for i in model_versions] + [0]) + 1

model.save(f"finance_dataset/model{model_version}.h5")

# model.save('model.h5')

save_path = 'finance_dataset'
tokenizer_path = os.path.join(save_path, 'tokenizer.json')
tokenizer_json = tokenizer.to_json()
with open(tokenizer_path, 'w') as f:
    f.write(tokenizer_json)