In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### data processing

In [4]:
train_data_path = '/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv'
validation_data_path = '/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv'

In [5]:
twitter_data = pd.read_csv(train_data_path)
validation_data = pd.read_csv(validation_data_path)

In [6]:
twitter_data.columns=['ID','Keyword','Sentiment','Tweet']
twitter_data.head()

Unnamed: 0,ID,Keyword,Sentiment,Tweet
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [7]:
validation_data.columns=['ID','Keyword','Sentiment','Tweet']
validation_data.head()

Unnamed: 0,ID,Keyword,Sentiment,Tweet
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [8]:
twitter_data.isnull().sum()

ID             0
Keyword        0
Sentiment      0
Tweet        686
dtype: int64

In [9]:
twitter_data = twitter_data.dropna(subset=['Tweet'])
twitter_data.isnull().sum()

ID           0
Keyword      0
Sentiment    0
Tweet        0
dtype: int64

In [10]:
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

In [11]:
twitter_data['cleaned_tweet'] = twitter_data['Tweet'].apply(clean_text)

In [12]:
#mapping negative=0, neutral=1, positive=2
twitter_data['Sentiment'] = twitter_data['Sentiment'].map({'Negative': 0, 'Positive': 2, 'Neutral': 1, 'Irrelevant': 1})

In [13]:
#tokenization and padding
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(twitter_data['cleaned_tweet'])

sequences = tokenizer.texts_to_sequences(twitter_data['cleaned_tweet'])
X = pad_sequences(sequences, maxlen=max_len)
y = to_categorical(twitter_data['Sentiment'].values, num_classes=3)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
#LSTM model
model = Sequential()
model.add(Embedding(max_words, 128))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

In [17]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 145ms/step - accuracy: 0.6003 - loss: 0.8446 - val_accuracy: 0.7721 - val_loss: 0.5630
Epoch 2/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 143ms/step - accuracy: 0.8102 - loss: 0.4635 - val_accuracy: 0.8177 - val_loss: 0.4633
Epoch 3/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 145ms/step - accuracy: 0.8701 - loss: 0.3282 - val_accuracy: 0.8368 - val_loss: 0.4277
Epoch 4/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 144ms/step - accuracy: 0.8929 - loss: 0.2656 - val_accuracy: 0.8480 - val_loss: 0.4173
Epoch 5/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 144ms/step - accuracy: 0.9095 - loss: 0.2255 - val_accuracy: 0.8530 - val_loss: 0.4229
Epoch 6/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 146ms/step - accuracy: 0.9228 - loss: 0.1919 - val_accuracy: 0.8551 - val_loss:

<keras.src.callbacks.history.History at 0x7fc44e70bdf0>

In [21]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100}%')

[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 41ms/step - accuracy: 0.8704 - loss: 0.4661
Test Accuracy: 87.02614903450012%


In [22]:
import numpy as np
from sklearn.metrics import f1_score

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

f1 = f1_score(y_true, y_pred, average='weighted')
print(f'F1 Score: {f1}')

[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 42ms/step
F1 Score: 0.8701641080806005
