In [1]:
import numpy as np
import pandas as pd
# Location of the raw tweet CSV on the local machine.
DATASET_CSV = "D:/Industry/Projects/Sentiment analysis/Dataset/Twitter_Data.csv"

# Load the whole dataset into memory as a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATASET_CSV)

In [2]:
# Keep only rows whose category is not 0, then discard rows with missing
# values and exact duplicate rows.
# NOTE: this cell is not idempotent -- re-running it on the already
# relabelled frame would filter out every row that now carries label 0.
df = (
    df.loc[df["category"] != 0]
    .dropna()
    .drop_duplicates()
)

# Collapse the labels to a binary target: -1 becomes 0, anything else becomes 1.
df['category'] = df['category'].apply(lambda label: 1 if label != -1 else 0)

In [3]:
# Sanity check: list the distinct label values left after relabelling.
df.loc[:, 'category'].unique()

array([0, 1], dtype=int64)

In [4]:
import re
# Compiled once so the pattern is not re-parsed for every row.
# Alternatives, in order:
#   1. an @mention (an "@" not preceded by a word character, plus the handle)
#   2. a whole http/https URL up to the next whitespace
#   3. any single character that is not a word character, whitespace, or an
#      emoji in the U+1F600..U+1F64F range
_SPECIAL_WORDS_PATTERN = re.compile(
    r'(?<!\w)@\w+|https?://\S+|[^\w\s\U0001F600-\U0001F64F]'
)


def remove_special_words(text):
    """Strip @mentions, URLs and punctuation/symbols from a piece of text.

    The previous pattern only removed a literal leading "@" or leading
    "http", so the rest of a mention or URL survived as a junk token
    (e.g. "http://x.co" became "xco"). Mentions and URLs are now removed
    in full wherever they appear in the string.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed tokens
        may leave consecutive spaces behind. Word characters and emoji in
        the U+1F600..U+1F64F range are preserved.
    """
    return _SPECIAL_WORDS_PATTERN.sub('', text)

# Clean every tweet with remove_special_words, then lower-case the result.
df["clean_text"] = df["clean_text"].apply(remove_special_words).str.lower()

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build the word index from the cleaned tweets; num_words caps the vocabulary
# that texts_to_sequences will emit.
tokenizer = Tokenizer(num_words=5000)
tweet_texts = df['clean_text']
tokenizer.fit_on_texts(tweet_texts)

# Turn each tweet into a list of integer token ids.
sequences = tokenizer.texts_to_sequences(tweet_texts)

# Pad at the end of each sequence so all rows share one length.
padded_sequences = pad_sequences(sequences, padding='post')

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences (and their labels) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df['category'],
    test_size=0.2,
    random_state=42,
)

# Per-class loss weights: label 0 counts twice as much as label 1.
class_weights = {0: 2.0, 1: 1.0}

# Expand the class weights into one float weight per training example,
# in the same order as y_train.
sample_weights = np.array(
    [class_weights[label] for label in y_train],
    dtype=float,
)

In [7]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

# Metric watched by early stopping, checkpointing and LR scheduling.
MONITORED_METRIC = 'val_accuracy'

# TensorBoard logging (weight histograms every epoch) under ./logs.
tensorboard_callback = TensorBoard(histogram_freq=1, log_dir='./logs')

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor=MONITORED_METRIC,
    restore_best_weights=True,
    patience=10,
)

# Persist only the best-scoring model seen so far.
checkpoint_path = 'D:/Industry/Projects/Sentiment analysis/Weights/best_model.keras'
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor=MONITORED_METRIC,
    save_best_only=True,
    mode='max',
    verbose=0,
)

# Shrink the learning rate 10x when the monitored metric stalls for 5 epochs.
lr_schedule_options = {
    'monitor': MONITORED_METRIC,
    'factor': 0.1,
    'patience': 5,
    'min_delta': 1e-3,
    'cooldown': 2,
    'min_lr': 1e-8,
    'verbose': 0,
}
lr_reduce_callback = ReduceLROnPlateau(**lr_schedule_options)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout

# Binary sentiment classifier: token ids -> embeddings -> LSTM -> dense head.
# NOTE(review): the inputs are post-padded with pad_sequences and the Embedding
# does not set mask_zero, so the LSTM also steps over the padding positions --
# consider mask_zero=True.
model = Sequential([
    # 5000-entry vocabulary, 500-dimensional vectors; input_dim must agree
    # with the Tokenizer's num_words.
    Embedding(input_dim=5000, output_dim=500),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.3),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: one probability per input sequence.
    Dense(1, activation='sigmoid'),
])

In [9]:
from tensorflow.keras.optimizers import Adam

# Adam with a fixed initial learning rate of 1e-4.
optimizer = Adam(learning_rate=1e-4)

# Binary cross-entropy matches the single sigmoid output; track accuracy.
compile_options = dict(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.compile(**compile_options)

In [10]:
# Train the model and persist the per-epoch metrics.
#
# Changes from the previous version of this cell:
#   * `history` is initialised up front and recovered from `model.history`
#     when training is interrupted. fit() never returns in that case, so the
#     old code hit a NameError (or silently dumped a stale history left over
#     from an earlier run) when writing the CSV.
#   * `early_stopping` was configured but never passed to fit(); it is now
#     part of the callback list.
#
# NOTE(review): X_test/y_test double as the validation set here, so the
# val_accuracy-driven callbacks select the model on the test split. A separate
# validation split would keep the test estimate unbiased.
history = None
try:
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=64,
        validation_data=(X_test, y_test),
        sample_weight=sample_weights,
        callbacks=[early_stopping,
                   lr_reduce_callback,
                   checkpoint_callback,
                   tensorboard_callback])
except KeyboardInterrupt: # for the next training
    print("Manually Interruptted")
    # Keras attaches its History callback to the model once an epoch finishes;
    # use it to keep the metrics of the epochs completed before the interrupt.
    history = getattr(model, "history", None)

# Only write the CSV when at least one epoch was recorded.
if history is not None and history.history:
    pd.DataFrame(history.history).to_csv("D:/Industry/Projects/Sentiment analysis/History/history.csv")

Epoch 1/50
[1m1347/1347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 80ms/step - accuracy: 0.5139 - loss: 0.9231 - val_accuracy: 0.3260 - val_loss: 0.6944 - learning_rate: 1.0000e-04
Epoch 2/50
[1m1347/1347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 82ms/step - accuracy: 0.6658 - loss: 0.7692 - val_accuracy: 0.9086 - val_loss: 0.2376 - learning_rate: 1.0000e-04
Epoch 3/50
[1m1347/1347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 88ms/step - accuracy: 0.9117 - loss: 0.3179 - val_accuracy: 0.9235 - val_loss: 0.2058 - learning_rate: 1.0000e-04
Epoch 4/50
[1m1347/1347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 91ms/step - accuracy: 0.9297 - loss: 0.2622 - val_accuracy: 0.9326 - val_loss: 0.1822 - learning_rate: 1.0000e-04
Epoch 5/50
[1m1347/1347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 89ms/step - accuracy: 0.9368 - loss: 0.2331 - val_accuracy: 0.9260 - val_loss: 0.1983 - learning_rate: 1.0000e-04
Epoch 6/50
[1m1347/1347

KeyboardInterrupt: 