In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate
import pickle
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score




In [2]:
# Loading the Excel file
df = pd.read_excel('negative_sentiments.xlsx')


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   User           209 non-null    object 
 1   Tweet          209 non-null    object 
 2   Cleaned_Tweet  209 non-null    object 
 3   Negative       209 non-null    float64
 4   Neutral        209 non-null    float64
 5   Positive       209 non-null    float64
 6   Sentiment      209 non-null    object 
 7   Label          151 non-null    float64
dtypes: float64(4), object(4)
memory usage: 13.2+ KB


In [4]:
# Separating labeled and unlabeled data
predict_df = df[df['Label'].isna()]
train_df = df.dropna(subset=['Label'])

In [5]:
# Tokenizing and prepare data for training
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['Cleaned_Tweet'])
vocab_size = len(tokenizer.word_index) + 1

In [6]:
# Tokenize unigrams
X_unigrams = tokenizer.texts_to_sequences(train_df['Cleaned_Tweet'])
X_unigrams = pad_sequences(X_unigrams, padding='post', truncating='post', maxlen=128)

# Creating bigrams 
X_bigrams = []
for tweet_tokens in zip(train_df['Cleaned_Tweet'].str.split(), train_df['Cleaned_Tweet'].str.split().shift(-1)):
    bigram_sequence = tokenizer.texts_to_sequences([' '.join(map(str, tweet_tokens))])
    X_bigrams.extend(bigram_sequence)

X_bigrams = pad_sequences(X_bigrams, padding='post', truncating='post', maxlen=128)

# Create trigrams
X_trigrams = []
for tweet_tokens in zip(train_df['Cleaned_Tweet'].str.split(), train_df['Cleaned_Tweet'].str.split().shift(-1), train_df['Cleaned_Tweet'].str.split().shift(-2)):
    trigram_sequence = tokenizer.texts_to_sequences([' '.join(map(str, tweet_tokens))])
    X_trigrams.extend(trigram_sequence)

X_trigrams = pad_sequences(X_trigrams, padding='post', truncating='post', maxlen=128)

# Concatenating unigrams, bigrams, and trigrams
X_combined = np.concatenate([X_unigrams, X_bigrams, X_trigrams], axis=1)

y = train_df['Label'].values

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_combined, y, test_size=0.1, random_state=42)


In [7]:
# Defining the BiLSTM model with three layers and dropout
embedding_dim = 50

inputs = Input(shape=(384,))  
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=384)(inputs)
bi_lstm_1 = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
dropout_1 = Dropout(0.5)(bi_lstm_1)
bi_lstm_2 = Bidirectional(LSTM(64, return_sequences=True))(dropout_1)
dropout_2 = Dropout(0.5)(bi_lstm_2)
bi_lstm_3 = Bidirectional(LSTM(64))(dropout_2)
dropout_final = Dropout(0.5)(bi_lstm_3)
outputs = Dense(1, activation='sigmoid')(dropout_final)

# Compiling the model
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# Training the BiLSTM model with early stopping
epochs = 25
batch_size = 8

history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val)
   
)



Epoch 1/25


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


# Tokenizing and prepare data for training
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['Cleaned_Tweet'])
vocab_size = len(tokenizer.word_index) + 1

X = tokenizer.texts_to_sequences(train_df['Cleaned_Tweet'])
X = pad_sequences(X, padding='post', truncating='post', maxlen=128)
y = train_df['Label'].values

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Defining the BiLSTM model with three layers and dropout
embedding_dim = 50

inputs = Input(shape=(128,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=128)(inputs)
bi_lstm_1 = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
dropout_1 = Dropout(0.5)(bi_lstm_1)
bi_lstm_2 = Bidirectional(LSTM(64, return_sequences=True))(dropout_1)
dropout_2 = Dropout(0.5)(bi_lstm_2)
bi_lstm_3 = Bidirectional(LSTM(64))(dropout_2)
dropout_final = Dropout(0.5)(bi_lstm_3)
outputs = Dense(1, activation='sigmoid')(dropout_final)

# Compiling the model
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Defining early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training the BiLSTM model with early stopping
epochs = 50
batch_size = 8

history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping]  
)

In [8]:
# Validation
val_preds = model.predict(X_val).flatten()
val_preds_binary = (val_preds > 0.5).astype(int)

# Print classification report
print(classification_report(y_val, val_preds_binary))


              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93        15
         1.0       0.00      0.00      0.00         1

    accuracy                           0.88        16
   macro avg       0.47      0.47      0.47        16
weighted avg       0.88      0.88      0.88        16



In [9]:
# Calculating Mean Squared Error (MSE)
mse = mean_squared_error(y_val, val_preds)
print(f"Mean Squared Error (MSE): {mse:.4f}")

# Calculating Mean Absolute Error (MAE)
mae = mean_absolute_error(y_val, val_preds)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Calculating R-squared (R2) score
r2 = r2_score(y_val, val_preds)
print(f"R-squared (R2) Score: {r2:.4f}")

Mean Squared Error (MSE): 0.1249
Mean Absolute Error (MAE): 0.1250
R-squared (R2) Score: -1.1309


In [10]:
# Prediction on unlabeled data
unlabeled_texts = predict_df['Cleaned_Tweet'].values
X_unlabeled = tokenizer.texts_to_sequences(unlabeled_texts)
X_unlabeled = pad_sequences(X_unlabeled, padding='post', truncating='post', maxlen=128)

unlabeled_preds = model.predict(X_unlabeled).flatten()
unlabeled_preds_binary = (unlabeled_preds > 0.5).astype(int)

# Add predicted labels to the predict_df DataFrame
predict_df['Predicted_Label'] = unlabeled_preds_binary

# Print each tweet and the predicted label
for index, row in predict_df.iterrows():
    print(f"Tweet: {row['Cleaned_Tweet']}, Predicted Label: {row['Predicted_Label']}")


Tweet: bagger amazed knew walking football result yesterday clearly doesnt twitter explained followed, Predicted Label: 0
Tweet: blimey mick old dog, Predicted Label: 0
Tweet: wondered rd right obvs due play rare midweek game wasnt good enough saturday league l put bat pls send regard, Predicted Label: 1
Tweet: guy tweet date wrong think, Predicted Label: 0
Tweet: biggest crime issue north east lincolnshire tell u view survey open th august, Predicted Label: 0
Tweet: spent week tackling fire hatfield moor desperately need help stop fire beautiful site like especially hot dry day like today bbqs cigarette rubbish yes life, Predicted Label: 1
Tweet: attended incident tonight binbrook way group youth used petrol start fire tree bushed areagiven dry condition late incredibly dangerous thank attending averting issue, Predicted Label: 0
Tweet: fire safety message today people easy overload electrical socket result house fire check advice dont call, Predicted Label: 0
Tweet: warm outside does

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predict_df['Predicted_Label'] = unlabeled_preds_binary


In [11]:
# Input text
text = "there is a fire in south street"

# Tokenize the input text
input_sequence = tokenizer.texts_to_sequences([text])
input_sequence = pad_sequences(input_sequence, padding='post', truncating='post', maxlen=128)

# Make a prediction
prediction = model.predict(input_sequence).flatten()[0]
predicted_label = int(prediction > 0.5)

# Display the prediction
print(f"Predicted Label for Input Text: {predicted_label}")

Predicted Label for Input Text: 1


In [12]:
predict_df['Predicted_Label'].value_counts()

Predicted_Label
0    46
1    12
Name: count, dtype: int64

In [13]:
predict_df

Unnamed: 0,User,Tweet,Cleaned_Tweet,Negative,Neutral,Positive,Sentiment,Label,Predicted_Label
151,Bev,@GyAncient @nige_gallop @CCLeeFreeman @Cleeccs...,bagger amazed knew walking football result yes...,0.519025,0.442044,0.038931,negative,,0
152,Nigel 💙,@GyAncient @BevskiMids @CCLeeFreeman @Cleeccsc...,blimey mick old dog,0.677478,0.285934,0.036588,negative,,0
153,Nigel 💙,@BevskiMids @CCLeeFreeman @Cleeccsc @Humberbea...,wondered rd right obvs due play rare midweek g...,0.76398,0.222234,0.013785,negative,,1
154,Iain Joseph Gorry*,@Cleeccsc @JoRobbo68 @Humberbeat @HumbersideFi...,guy tweet date wrong think,0.664363,0.322601,0.013036,negative,,0
155,North East Lincolnshire Council,What are the biggest crime issues in North Eas...,biggest crime issue north east lincolnshire te...,0.662119,0.320535,0.017346,negative,,0
156,South Yorkshire Fire,We spent weeks tackling a fire on Hatfield Moo...,spent week tackling fire hatfield moor despera...,0.560314,0.336407,0.103279,negative,,1
157,Humberside Police - North East Lincolnshire,#Grimsby #Willows Attended an incident tonight...,attended incident tonight binbrook way group y...,0.889538,0.104743,0.005718,negative,,0
158,Safer Roads Humber,A fire safety message today. With more people ...,fire safety message today people easy overload...,0.514286,0.462393,0.023321,negative,,0
159,North Lincs Council,"Just because it's warm outside, it doesn't mea...",warm outside doesnt mean warm underwater cold ...,0.698401,0.289254,0.012345,negative,,0
160,DC_LK1989,Unfortunately I’m going to say no...our street...,unfortunately im going say noour street would ...,0.875694,0.117654,0.006652,negative,,0


In [16]:
# Sample list of sentences
sentences = ["Had a wonderful time in hull today","there is a fire in the south street we need the your assistance @HumbersideFire", "The kids are lighting fireworks in pearson park it is really dangerous", "I see smoke coming from the paragon station", "Some teenager are jumping of the bridge into the water","There is no incident in the beverly road"]
# Tokenize the list of sentences
input_sequences = tokenizer.texts_to_sequences(sentences)
input_sequences = pad_sequences(input_sequences, padding='post', truncating='post', maxlen=128)

# Make predictions for the input sequences
predictions = model.predict(input_sequences).flatten()

# Apply a threshold (e.g., 0.5) to get binary predictions
predicted_labels = (predictions > 0.5).astype(int)

# Print results
for sentence, label, prob in zip(sentences, predicted_labels, predictions):
    print(f"Sentence: {sentence}")
    print(f"Predicted Label: {label}")
    print(f"Probability: {prob:.4f}")
    print()


Sentence: Had a wonderful time in hull today
Predicted Label: 0
Probability: 0.0001

Sentence: there is a fire in the south street we need the your assistance @HumbersideFire
Predicted Label: 1
Probability: 0.9995

Sentence: The kids are lighting fireworks in pearson park it is really dangerous
Predicted Label: 1
Probability: 0.9982

Sentence: I see smoke coming from the paragon station
Predicted Label: 1
Probability: 0.9995

Sentence: Some teenager are jumping of the bridge into the water
Predicted Label: 0
Probability: 0.0001

Sentence: There is no incident in the beverly road
Predicted Label: 0
Probability: 0.0001



# Saving the model
model.save('BiLSTM_model.h5')
print("Model saved successfully.")

# Save the tokenizer
with open('BiLSTM_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Tokenizer saved successfully.")

In [18]:
import os

In [20]:

output_file_path = 'Labeled_Tweets.xlsx'
predict_df.to_excel(output_file_path, index=False)