In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Load the dataset from the file path and assign column names
file_path = '/content/drive/MyDrive/Colab_Notebooks/01 Project Sentiment Analysis/Datasets/training.1600000.processed.noemoticon.csv'
columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv(file_path, encoding='latin-1', names=columns)

# Display the first few rows to inspect the dataset
df.head()


Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Data Preprocessing
clean the text data by removing unnecessary columns and cleaning the text


In [3]:
import re

# Keep only the necessary columns: 'sentiment' and 'text'
df = df[['sentiment', 'text']]

# Map sentiment labels to binary values (0 = negative, 4 = positive)
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

# Define a function to clean the text data
def clean_text(text):
    # Remove @mentions
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    # Remove '#' symbol
    text = re.sub(r'#', '', text)
    # Remove 'RT' (retweet)
    text = re.sub(r'RT[\s]+', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?://\S+', '', text)
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    return text

# Apply the cleaning function to the text data
df['text'] = df['text'].apply(clean_text)

# Display the first few rows of the cleaned data
df.head()


Unnamed: 0,sentiment,text
0,0,awww that s a bummer you shoulda got da...
1,0,is upset that he can t update his facebook by ...
2,0,i dived many times for the ball managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,no it s not behaving at all i m mad why am...


# Lemmatization

In [4]:
import nltk # importing Natural Language Toolkit for lemmatization
from nltk.tokenize import word_tokenize
from nltk.stem import  WordNetLemmatizer
nltk.download('wordnet')  # Download WordNet
nltk.download('punkt')
nltk.download('punkt_tab')

def lemmatize(tet):
    if not isinstance(tet, str):
        return ''  # Return an empty string if tet is not a string
    lemmatizer = WordNetLemmatizer()
    twet = word_tokenize(tet)
    lemmas = ' '.join([lemmatizer.lemmatize(token) for token in twet])
    return lemmas


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
df['text'] = df['text'].apply(lemmatize)

#Text Vectorization
Convert the text data into numerical form using tokenization and padding

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer with a maximum number of words to keep
tokenizer = Tokenizer(num_words=5000)

# Fit the tokenizer on the text data
tokenizer.fit_on_texts(df['text'])

# Convert text to sequences of integers
X = tokenizer.texts_to_sequences(df['text'])

# Pad sequences to ensure uniform length
X = pad_sequences(X, maxlen=100)

# Extract sentiment labels
y = df['sentiment'].values


#Train-Test Split
Split the data into training and testing sets

In [7]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Model Building
Build and compile a deep learning model using LSTM

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, BatchNormalization, GRU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Define the model
model_lem = Sequential()
model_lem.add(Input(shape = (51,)))

# Add an embedding layer (input_dim: vocabulary size, output_dim: embedding dimension, input_length: sequence length)
model_lem.add(Embedding(input_dim = 50000, output_dim = 128, input_length = 51))
model_lem.add(BatchNormalization())

# Add an GRU layer with dropout
model_lem.add(GRU(units =64, activation = 'tanh', return_sequences = False,  kernel_regularizer=l2(0.05))) ##---return_sequences: Indicates that only the output of the last time step is returned.
model_lem.add(BatchNormalization())

model_lem.add(Dropout(0.3))
model_lem.add(Dense(units = 32, activation='tanh', kernel_regularizer=l2(0.05)))
model_lem.add(BatchNormalization())

model_lem.add(Dropout(0.5))
model_lem.add(Dense(units = 16, activation='tanh', kernel_regularizer=l2(0.05)))
model_lem.add(BatchNormalization())

# Add a dense layer with sigmoid activation for binary classification
model_lem.add(Dropout(0.6))
model_lem.add(Dense(units = 1, activation='sigmoid'))

# model_lem.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy']) #----- compiling the model
model_lem.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model_lem.summary()




# Model Training
Train the model on the training data.

In [None]:
# Loading the previous weights of the model to avoid training from scratch
model_lem.load_weights('model_lem.weights.h5') #--- it should be commented first time for training than it should be uncommented

In [10]:
from tensorflow.keras.callbacks import EarlyStopping , ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)#----- early stopping callback used to avoid overfitting by monitoring the validation loss
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=2, min_lr=0.00001)#----- reduce learning rate on plateau callback used to reduce the learning rate if the validation loss plateaus

In [11]:
model_lem.fit(X_train, y_train, epochs = 30, batch_size = 512, validation_split=0.2, callbacks=[early_stopping, reduce_lr]) #----- fitting the model
model_lem.save_weights('model_lem.weights.h5')

Epoch 1/30
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 23ms/step - accuracy: 0.7589 - loss: 0.8204 - val_accuracy: 0.7963 - val_loss: 0.4871 - learning_rate: 0.0100
Epoch 2/30
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 30ms/step - accuracy: 0.7970 - loss: 0.5031 - val_accuracy: 0.7955 - val_loss: 0.4882 - learning_rate: 0.0100
Epoch 3/30
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 25ms/step - accuracy: 0.8036 - loss: 0.4858 - val_accuracy: 0.8013 - val_loss: 0.4764 - learning_rate: 0.0100
Epoch 4/30
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 23ms/step - accuracy: 0.8058 - loss: 0.4800 - val_accuracy: 0.8070 - val_loss: 0.4588 - learning_rate: 0.0100
Epoch 5/30
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 23ms/step - accuracy: 0.8087 - loss: 0.4754 - val_accuracy: 0.8065 - val_loss: 0.4702 - learning_rate: 0.0100
Epoch 6/30
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━

#Model Evaluation
Evaluate the model's performance on the test dat

In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test data
y_pred = (model_lem.predict(X_test) > 0.5).astype("int32")  # Threshold the predictions at 0.5

# Print the accuracy score
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step
Accuracy: 0.812403125
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.82      0.81    159494
           1       0.82      0.81      0.81    160506

    accuracy                           0.81    320000
   macro avg       0.81      0.81      0.81    320000
weighted avg       0.81      0.81      0.81    320000



In [13]:
model_lem.evaluate(X_test, y_test)

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 7ms/step - accuracy: 0.8120 - loss: 0.4096


[0.4108762741088867, 0.81240314245224]

In [18]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

#Define the StratifiedKFold
KFoldn  = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store the results
accuracies = []
losses = []

for train_index, val_index in KFoldn.split(X, y):
  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

# Evaluate the model
loss, accuracy = model_lem.evaluate(X_val, y_val)
accuracies.append(accuracy)
losses.append(loss)

print(f"Mean validation accuracy: {np.mean(accuracies)}")
print(f"Mean validation loss: {np.mean(losses)}")

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 5ms/step - accuracy: 0.8203 - loss: 0.4012
Mean validation accuracy: 0.8183687329292297
Mean validation loss: 0.40221139788627625
