In [230]:
import pandas as pd 
import numpy as np


In [231]:
train_data = pd.read_csv(r'F:\M.Tech_CollgeMaterials\CodeLLM\sentiment_analysis\data\twitter_training.csv')

In [232]:
train_data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [233]:
# First step will assign the data set column names so that it can be easier to perform operations 
column_names = ['id', 'keyword', 'sentiment', 'text']

train_data = pd.read_csv(r'F:\M.Tech_CollgeMaterials\CodeLLM\sentiment_analysis\data\twitter_training.csv',names=column_names)
val_data = pd.read_csv(r'F:\M.Tech_CollgeMaterials\CodeLLM\sentiment_analysis\data\twitter_validation.csv',names=column_names)

In [234]:
train_data.head()

Unnamed: 0,id,keyword,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [235]:
train_data.isnull().sum()

id             0
keyword        0
sentiment      0
text         686
dtype: int64

In [236]:
train_data = train_data.dropna()


In [237]:
train_data.dtypes

id            int64
keyword      object
sentiment    object
text         object
dtype: object

In [238]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sayed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sayed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sayed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [239]:
import re

def preprocess(text):
    # Check if the text is a string, if not, return empty string
    if not isinstance(text, str):  
        return ""  # Handle NaN or other non-string values

    # Convert text to lowercase
    text = text.lower()

    # Replace <unk> token with a placeholder (optional, you could remove it)
    text = text.replace("<unk>", "unknown")

    # Remove URLs (replace with 'URL' or just remove)
    text = re.sub(r'http\S+|www\S+|https\S+', 'URL', text)

    # Remove Twitter handles (e.g., @username)
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (e.g., #example)
    text = re.sub(r'#\w+', '', text)

    # Remove non-alphanumeric characters (except spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and punctuation
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Lemmatize the tokens (convert to base form)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Return the preprocessed text as a space-separated string
    return " ".join(tokens)


In [240]:
train_data['text'] = train_data['text'].apply(preprocess)

In [241]:
val_data['text'] = val_data['text'].apply(preprocess)

In [242]:
train_data['sentiment'] = train_data['sentiment'].map({'Negative': 0, 'Positive': 2, 'Neutral': 1, 'Irrelevant': 1})

In [243]:
val_data['sentiment'] = val_data['sentiment'].map({'Negative': 0, 'Positive': 2, 'Neutral': 1, 'Irrelevant': 1})

In [244]:
train_data.head()

Unnamed: 0,id,keyword,sentiment,text
0,2401,Borderlands,2,im getting borderland murder
1,2401,Borderlands,2,coming border kill
2,2401,Borderlands,2,im getting borderland kill
3,2401,Borderlands,2,im coming borderland murder
4,2401,Borderlands,2,im getting borderland 2 murder


In [245]:
val_data.head()

Unnamed: 0,id,keyword,sentiment,text
0,3364,Facebook,1,mentioned facebook struggling motivation go ru...
1,352,Amazon,1,bbc news amazon bos jeff bezos reject claim co...
2,8312,Microsoft,0,pay word function poorly chromebook
3,4371,CS-GO,0,csgo matchmaking full closet hacking truly awf...
4,4433,Google,1,president slapping american face really commit...


In [246]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.utils import to_categorical  
from tensorflow.keras.models import Sequential    
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout 


max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['text'])  

sequences = tokenizer.texts_to_sequences(train_data['text'])
X = pad_sequences(sequences, maxlen=max_len)
y = to_categorical(train_data['sentiment'].values, num_classes=3)

In [247]:
val_data.dtypes

id            int64
keyword      object
sentiment     int64
text         object
dtype: object

In [248]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [249]:
#LSTM model
model = Sequential()
model.add(Embedding(max_words, 128))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

In [250]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [251]:
X_val = val_data['text']
y_val = val_data['sentiment']
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

In [252]:
y_val_onehot = to_categorical(y_val, num_classes=3)


In [253]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val_pad, y_val_onehot))


Epoch 1/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 51ms/step - accuracy: 0.5958 - loss: 0.8504 - val_accuracy: 0.8600 - val_loss: 0.3897
Epoch 2/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 52ms/step - accuracy: 0.8147 - loss: 0.4585 - val_accuracy: 0.9270 - val_loss: 0.2662
Epoch 3/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 53ms/step - accuracy: 0.8693 - loss: 0.3310 - val_accuracy: 0.9310 - val_loss: 0.2249
Epoch 4/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 54ms/step - accuracy: 0.8912 - loss: 0.2750 - val_accuracy: 0.9440 - val_loss: 0.2054
Epoch 5/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 54ms/step - accuracy: 0.9091 - loss: 0.2260 - val_accuracy: 0.9530 - val_loss: 0.1860
Epoch 6/10
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 55ms/step - accuracy: 0.9270 - loss: 0.1832 - val_accuracy: 0.9510 - val_loss: 0.1904
Epo

<keras.src.callbacks.history.History at 0x1d588f36010>