In [1]:
# import libraries
import numpy as np 
import pandas as pd 
import emoji
import string
import re

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping

from keras.layers import Dense, Embedding, LSTM, Bidirectional
from keras.layers import BatchNormalization, Dropout, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# import data
data = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',header=None)
data.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


# Preprocessing

In [3]:
# give column names
new_columns=["id","type","sentiment","text"]
data.columns = new_columns
data.head()

Unnamed: 0,id,type,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   type       74682 non-null  object
 2   sentiment  74682 non-null  object
 3   text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
# Drop columns that aren't important
data=data[['text','sentiment']]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       73996 non-null  object
 1   sentiment  74682 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [6]:
data["sentiment"].value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: sentiment, dtype: int64

In [7]:
# Drop the NA values
data.dropna(subset=['text'], inplace=True)

In [8]:
# Drop duplicate values
data = data.drop_duplicates(subset=["sentiment","text"], keep='first')

In [9]:
# Convert the 'sentiment' column to strings
data['sentiment'] = data['sentiment'].astype(str)

In [10]:
# Convert the 'text' column to strings
data['text'] = data['text'].astype(str)

In [11]:
# Convert text column to lowercase
data['text'] = data['text'].apply(lambda x: x.lower())
data

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,i am coming to the borders and i will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive
...,...,...
74677,just realized that the windows partition of my...,Positive
74678,just realized that my mac window partition is ...,Positive
74679,just realized the windows partition of my mac ...,Positive
74680,just realized between the windows partition of...,Positive


In [12]:
##CUSTOM DEFINED FUNCTIONS TO CLEAN THE TWEETS

#Clean emojis from text
def strip_emoji(text):
    emoji_pattern = emoji.demojize(text)
    return re.sub(r":[a-zA-Z_]+:", "", emoji_pattern)

#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)

def clean_text(t):
    return remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(strip_emoji(t)))))

In [13]:
# Preprocess data using the clean_text function
texts_new = []
for t in data.text:
    texts_new.append(clean_text(t))
data['cleaned_text'] = [text.lower() for text in texts_new]

In [14]:
# Clean the 'text' column by removing non-alphanumeric characters except spaces
data['text'] = data['text'].apply((lambda x: re.sub('[^a-z0-9\s]','',x)))
data

Unnamed: 0,text,sentiment,cleaned_text
0,im getting on borderlands and i will murder yo...,Positive,im getting on borderlands and i will murder yo...
1,i am coming to the borders and i will kill you...,Positive,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you all,Positive,im getting on borderlands and i will kill you all
3,im coming on borderlands and i will murder you...,Positive,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...
74677,just realized that the windows partition of my...,Positive,just realized that the windows partition of my...
74678,just realized that my mac window partition is ...,Positive,just realized that my mac window partition is ...
74679,just realized the windows partition of my mac ...,Positive,just realized the windows partition of my mac ...
74680,just realized between the windows partition of...,Positive,just realized between the windows partition of...


# Build the Tokenizer 

In [15]:
top_words = 10000
# Create a Tokenizer object
tokenizer = Tokenizer(num_words=top_words, split=' ')
# Fit tokenizer object on text column
tokenizer.fit_on_texts(data['text'].values) 

In [16]:
# Obtain the word index built by the Tokenizer
word_index =  tokenizer.word_index

In [17]:
# Convert the text data to sequences of corresponding integer indices using the Tokenizer
X = tokenizer.texts_to_sequences(data['text'].values)

In [18]:
# padding
X = pad_sequences(X)

In [19]:
# Create a LabelEncoder object to encode categorical labels as numerical values
le = preprocessing.LabelEncoder()
y=le.fit_transform(data['sentiment'])

In [20]:
# Get class labels and their corresponding indices
class_labels = le.classes_
class_indices = {class_label: class_index for class_index, class_label in enumerate(class_labels)}

# Print class labels and their indices
for class_label, class_index in class_indices.items():
    print(f"Class Label: {class_label}, Index: {class_index}")

Class Label: Irrelevant, Index: 0
Class Label: Negative, Index: 1
Class Label: Neutral, Index: 2
Class Label: Positive, Index: 3


# Build the model

In [25]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(46745, 166) (46745,)
(23024, 166) (23024,)


In [27]:
embed_dim = 128
lstm_units = 196

# Define a sequential model
model1 = Sequential()
model1.add(Embedding(top_words, embed_dim,input_length = X.shape[1]))

model1.add(LSTM(100,dropout=0.25,recurrent_dropout=0.25))
model1.add(Dense(50,activation='relu'))
model1.add(Dense(25,activation='relu'))

model1.add(Dense(4,activation='softmax'))
model1.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model1.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 166, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 50)                5050      
                                                                 
 dense_1 (Dense)             (None, 25)                1275      
                                                                 
 dense_2 (Dense)             (None, 4)                 104       
                                                                 
Total params: 1,378,029
Trainable params: 1,378,029
Non-trainable params: 0
_________________________________________________________________
None


In [28]:
batch_size = 32
model1.fit(X_train, y_train, epochs = 7, batch_size=batch_size,validation_data= (X_test, y_test)
                             ,callbacks=EarlyStopping(patience=3,restore_best_weights=True))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x79e80004ee50>

In [29]:
model1.evaluate(X_train, y_train)



[0.10408245027065277, 0.9640603065490723]

In [30]:
model1.evaluate(X_test, y_test)



[0.5590208172798157, 0.8468120098114014]

In [32]:
y_pred_probs = model1.predict(X_test)
# Get the predicted classes by selecting the class with the highest probability
y_pred_classes = y_pred_probs.argmax(axis=-1)

# Compute evaluation metrics
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_classes)



In [33]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)

Precision: 0.8472539792807534
Recall: 0.846812022237665
F1 Score: 0.8468188503681144
Confusion Matrix:
[[3266  241  308  297]
 [ 176 6048  347  404]
 [ 208  303 4779  386]
 [ 277  216  364 5404]]


In [34]:
model1.save('twitter_sentiment.h5')

In [37]:
embed_dim = 128
lstm_units = 196

# Define a sequential model
model2 = Sequential()
model2.add(Embedding(top_words, embed_dim,input_length = X.shape[1]))

model2.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model2.add(BatchNormalization())
model2.add(Dropout(0.25))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(32, activation='relu'))

model2.add(Dense(4, activation='softmax'))
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model2.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 166, 128)          1280000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              263168    
 nal)                                                            
                                                                 
 batch_normalization_1 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_6 (Dense)             (None, 64)                16448     
                                                                 
 dense_7 (Dense)             (None, 32)               

In [38]:
batch_size = 32
model2.fit(X_train, y_train, epochs = 7, batch_size=batch_size,validation_data= (X_test, y_test)
                             ,callbacks=EarlyStopping(patience=3,restore_best_weights=True))

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x79e7e0115090>

In [39]:
model2.evaluate(X_train, y_train)



[0.1131368950009346, 0.9597176313400269]

In [43]:
model2.evaluate(X_test, y_test)



[0.5145952105522156, 0.8482018709182739]

In [41]:
y_pred_probs = model2.predict(X_test)
# Get the predicted classes by selecting the class with the highest probability
y_pred_classes = y_pred_probs.argmax(axis=-1)

# Compute evaluation metrics
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_classes)



In [42]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)

Precision: 0.8503452644248831
Recall: 0.8482018763029882
F1 Score: 0.8477398375268652
Confusion Matrix:
[[3140  255  277  440]
 [ 125 6174  289  387]
 [ 172  329 4610  565]
 [ 142  276  238 5605]]


In [54]:
model2.save('twitter_sentiment2.h5')

In [44]:
embed_dim = 128
lstm_units = 128

# Define a sequential model
model3 = Sequential()
model3.add(Embedding(top_words, embed_dim, input_length = X.shape[1]))
model3.add(SpatialDropout1D(0.3))
model3.add(LSTM(128, dropout = 0.3, recurrent_dropout = 0.3))
model3.add(Dense(128, activation = 'relu'))
model3.add(Dense(64, activation = 'relu'))
model3.add(Dropout(0.3))
model3.add(Dense(4, activation = 'softmax'))
model3.compile(loss='sparse_categorical_crossentropy',optimizer='Adam',metrics=['accuracy'])

print(model3.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 166, 128)          1280000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 166, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584    
                                                                 
 dense_9 (Dense)             (None, 128)               16512     
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                      

In [46]:
batch_size = 64
model3.fit(X_train, y_train, epochs = 10, batch_size=batch_size,validation_data= (X_test, y_test)
                             ,callbacks=EarlyStopping(patience=3,restore_best_weights=True))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x79e383bfc310>

In [47]:
model3.evaluate(X_train, y_train)



[0.17080062627792358, 0.9405497908592224]

In [48]:
model3.evaluate(X_test, y_test)



[0.5035551190376282, 0.8349114060401917]

In [52]:
y_pred_probs = model3.predict(X_test)
# Get the predicted classes by selecting the class with the highest probability
y_pred_classes = y_pred_probs.argmax(axis=-1)

# Compute evaluation metrics
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_classes)



In [53]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)

Precision: 0.8346366127051107
Recall: 0.8349113968033357
F1 Score: 0.8345758403808092
Confusion Matrix:
[[3196  262  250  404]
 [ 191 6125  350  309]
 [ 273  392 4600  411]
 [ 246  367  346 5302]]


In [51]:
model3.save('twitter_sentiment3.h5')

# Prediction

In [58]:
oos_data = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv',header=0)

In [59]:
oos_data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [60]:
oos_data.dropna(subset=['Sentence'], inplace=True)

In [61]:
oos_data = oos_data.drop_duplicates(subset=["Sentiment","Sentence"], keep='first')

In [62]:
oos_data['Sentiment'] = oos_data['Sentiment'].astype(str)
oos_data['Sentence'] = oos_data['Sentence'].astype(str)
oos_data['Sentence'] = oos_data['Sentence'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [63]:
oos_texts_new = []
for t in oos_data.Sentence:
    oos_texts_new.append(clean_text(t))
oos_data['cleaned_text'] = [text.lower() for text in oos_texts_new]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [64]:
oos_data['Sentence'] = oos_data['Sentence'].apply((lambda x: re.sub('[^a-z0-9\s]','',x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [65]:
top_words = 10000
tokenizer = Tokenizer(num_words=top_words, split=' ')
tokenizer.fit_on_texts(oos_data['Sentence'].values) 

In [66]:
word_index =  tokenizer.word_index

In [67]:
X = tokenizer.texts_to_sequences(oos_data['Sentence'].values)
X = pad_sequences(X)

In [68]:
le = preprocessing.LabelEncoder()
y=le.fit_transform(oos_data['Sentiment'])

In [86]:
model1.evaluate(X[0:5], y[0:5])



[1.847285509109497, 0.800000011920929]

In [87]:
model2.evaluate(X[0:5], y[0:5])



[2.4150404930114746, 0.6000000238418579]

In [93]:
model3.evaluate(X[0:5], y[0:5])



[1.560268521308899, 0.6000000238418579]