In [71]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
import tensorflow as tf
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

In [72]:
from textattack.augmentation import EasyDataAugmenter

import random

# Single EasyDataAugmenter instance, shared by every augment_text call.
augmenter = EasyDataAugmenter()


def augment_text(sentence):
    """Return one randomly picked augmented variant of ``sentence``.

    ``augmenter.augment`` is asked for candidate rewrites and one of them is
    chosen with ``random.choice``. When no candidates come back, ``sentence``
    itself is returned unchanged.
    """
    candidates = augmenter.augment(sentence)
    return random.choice(candidates) if candidates else sentence


[nltk_data] Downloading package omw-1.4 to /Users/aadeesh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [73]:
# Matches special symbols/emojis, hyperlinks, and reddit username mentions.
_COMMENT_NOISE_RE = re.compile(r"[^\w\s]|http\S+|www\S+|u/[A-Za-z0-9_-]+")

# Filled on first use so the stop-word list is read and the lemmatizer is
# constructed once, not once per cleaned comment.
_COMMENT_CLEANER_CACHE = {}


def _cleanerResources():
    """Return ``(stop_words, lemmatizer)``, building both on the first call only."""
    if not _COMMENT_CLEANER_CACHE:
        _COMMENT_CLEANER_CACHE["stop_words"] = frozenset(stopwords.words("english"))
        _COMMENT_CLEANER_CACHE["lemmatizer"] = WordNetLemmatizer()
    return _COMMENT_CLEANER_CACHE["stop_words"], _COMMENT_CLEANER_CACHE["lemmatizer"]


def commentCleaner(comment):
    """Clean a sentence of links and emojis, drop stop words, and lemmatize.

    Steps:
      1. Remove every character that is neither a word character nor
         whitespace, plus ``http...``/``www...`` links and ``u/name`` mentions.
      2. Split on whitespace.
      3. Drop English stop words. The comparison is case-insensitive because
         the NLTK list is lowercase; without it capitalised stop words such as
         "The" would be kept.
      4. Lemmatize the remaining tokens (their original casing is preserved).

    Args:
        comment: The raw text. A non-string value (for example a missing CSV
            cell read as NaN) is treated as an empty comment.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(comment, str):
        return ""

    stop_words, lemmatizer = _cleanerResources()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in _COMMENT_NOISE_RE.sub("", comment).split()
        if token.lower() not in stop_words
    ]
    return " ".join(tokens)

print(commentCleaner("One of the other reviewers mentioned watching 1 oz episode"))


One reviewer mentioned watching 1 oz episode


In [74]:
def cleanDf(df):
    """Return a new frame whose "Sentence" column has been cleaned.

    Every value of ``df["Sentence"]`` is passed through ``commentCleaner``.
    The input frame is not modified, so the raw sentences stay available to
    the caller and re-running the calling cell starts from the same input.

    Args:
        df: DataFrame with a "Sentence" column.

    Returns:
        A copy of ``df`` with the cleaned "Sentence" column; all other columns
        are carried over unchanged.
    """
    return df.assign(Sentence=df["Sentence"].apply(commentCleaner))

In [75]:
def augmentDataFrame(df, augment_prob=0.5, seed=None, augment_fn=None):
    """Append augmented copies of a random subset of rows to ``df``.

    Each row is selected independently with probability ``augment_prob``; for
    a selected row, an augmented version of its "Sentence" is added with the
    same "Sentiment" label.

    Args:
        df: DataFrame with "Sentence" and "Sentiment" columns. Not modified.
        augment_prob: Probability in [0, 1] that a row gets an augmented copy.
            The default of 0.5 matches the previous coin flip.
        seed: Optional seed for the row selection. When None, the
            module-level ``random`` generator is used, so a global
            ``random.seed(...)`` still applies. The seed only controls which
            rows are selected, not what ``augment_fn`` produces.
        augment_fn: Callable mapping a sentence to its augmented version.
            Defaults to ``augment_text``.

    Returns:
        A new DataFrame holding the original rows followed by the augmented
        rows, with a fresh 0..n-1 index (no duplicate index labels).

    Raises:
        ValueError: If ``augment_prob`` is outside [0, 1].
    """
    if not 0.0 <= augment_prob <= 1.0:
        raise ValueError("augment_prob must be between 0 and 1")
    if augment_fn is None:
        # Looked up at call time so the notebook-level helper is the default.
        augment_fn = augment_text
    rng = random if seed is None else random.Random(seed)

    augmented_rows = [
        {"Sentence": augment_fn(sentence), "Sentiment": sentiment}
        for sentence, sentiment in zip(df["Sentence"], df["Sentiment"])
        if rng.random() < augment_prob
    ]

    if not augmented_rows:
        # Nothing selected: skip concatenating an empty, untyped frame.
        return df.reset_index(drop=True)

    augmented_df = pd.DataFrame(augmented_rows, columns=["Sentence", "Sentiment"])
    # ignore_index avoids repeating the labels 0..k-1 of the augmented rows.
    return pd.concat([df, augmented_df], ignore_index=True)

In [76]:
# Load the labelled sentences (columns: Sentence, Sentiment)
data = pd.read_csv('data.csv')

# Augmenting the data
print(data.shape)
print(data.head())
data = augmentDataFrame(data)
print(data.shape)
# index=False: the row index carries no information, so it is not written as
# an extra unnamed column that would reappear when the file is read back.
data.to_csv('augmented_data.csv', index=False)


(5842, 2)
                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral
(8844, 2)


In [77]:
# Run every sentence through the text cleaner, then show the resulting (rows, columns)
cleaned_data = data.pipe(cleanDf)
cleaned_data.shape

(8844, 2)

In [78]:
# Giving integer labels to our sentiment labels
lb = LabelEncoder()
# assign() returns a new frame instead of writing into cleaned_data in place.
# If cleaned_data shares its object with data, an in-place write would also
# replace the string labels in data, and re-running this cell would fit the
# encoder on integers instead of the label names (losing lb.classes_).
cleaned_data = cleaned_data.assign(Sentiment=lb.fit_transform(data['Sentiment']))
cleaned_data.shape

(8844, 2)

In [79]:
# Turn each cleaned sentence into a padded sequence of integer word ids.
# num_words=500 caps the vocabulary used by texts_to_sequences; the Embedding
# layer's input size (500) has to stay in step with it.
sentences = cleaned_data['Sentence'].values
tokenizer = Tokenizer(num_words=500, split=' ')
tokenizer.fit_on_texts(sentences)
# pad_sequences returns a 2-D array; X.shape[1] is the padded sequence length.
X = pad_sequences(tokenizer.texts_to_sequences(sentences))


In [89]:
# Building our model: embedding -> LSTM -> two dense layers -> softmax
model = keras.Sequential([
    # 500 matches the tokenizer's num_words; each word id becomes a 120-d vector
    Embedding(500, 120, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(200, dropout=0.2, recurrent_dropout=0.2),
    Dense(100, activation='relu'),
    Dropout(0.2),
    Dense(120, activation='relu'),
    Dropout(0.2),
    # One output unit per sentiment class
    Dense(3, activation='softmax'),
])
# categorical_crossentropy expects the one-hot encoded labels built for training
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 28, 120)           60000     
                                                                 
 spatial_dropout1d_6 (Spati  (None, 28, 120)           0         
 alDropout1D)                                                    
                                                                 
 lstm_6 (LSTM)               (None, 200)               256800    
                                                                 
 dense_18 (Dense)            (None, 100)               20100     
                                                                 
 dropout_12 (Dropout)        (None, 100)               0         
                                                                 
 dense_19 (Dense)            (None, 120)               12120     
                                                      

In [90]:
# File name template: epoch number and training loss are filled in per save
checkpoint_path = "trial1/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Create a ModelCheckpoint callback that saves the full model (not just the
# weights) each time the training loss reaches a new minimum
checkpoint_callback = ModelCheckpoint(
    checkpoint_path,
    monitor='loss',
    save_weights_only=False,
    save_best_only=True,
    verbose=1
)

# Create an EarlyStopping callback that stops training when the *training*
# loss has not improved for `patience` epochs.
# NOTE(review): no validation data is passed to fit(), so 'val_loss' is not
# available here; monitoring the training loss cannot detect overfitting.
early_stopping_callback = EarlyStopping(
    monitor='loss',
    patience=5,  # Number of epochs with no improvement after which training will stop
    # Without this the model keeps the weights of the last epoch, which are
    # `patience` epochs past the best one that the checkpoint saved.
    restore_best_weights=True,
    verbose=1
)

In [86]:
# Converting the integer labels to onehot encoding.
# Labels are read from cleaned_data, the same frame X was tokenized from, so
# rows of X and y line up. dtype="float32" keeps the targets numeric
# regardless of the pandas version (pandas 2.x would otherwise return booleans).
y = pd.get_dummies(cleaned_data['Sentiment'], dtype="float32")
# NOTE(review): augmented rows were added before this split, so a variant of
# a training sentence can land in the test set — consider splitting first and
# augmenting only the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [88]:
batch_size=32

model.fit(X_train, y_train, epochs = 80, 
          batch_size=batch_size, 
          verbose = 'auto', 
          callbacks=[checkpoint_callback, early_stopping_callback]
        )

Epoch 1/80
Epoch 1: loss improved from 0.19203 to 0.19007, saving model to trial1/weights-improvement-01-0.1901.hdf5
Epoch 2/80
  2/194 [..............................] - ETA: 15s - loss: 0.1131 - accuracy: 0.9375

  saving_api.save_model(


Epoch 2: loss improved from 0.19007 to 0.18004, saving model to trial1/weights-improvement-02-0.1800.hdf5
Epoch 3/80
Epoch 3: loss did not improve from 0.18004
Epoch 4/80
Epoch 4: loss did not improve from 0.18004
Epoch 5/80
Epoch 5: loss did not improve from 0.18004
Epoch 6/80
Epoch 6: loss did not improve from 0.18004
Epoch 7/80
Epoch 7: loss did not improve from 0.18004
Epoch 7: early stopping


<keras.src.callbacks.History at 0x2dae70370>