In [356]:
import numpy as np
import pandas as pd


In [357]:
# Read the augmented training set and discard its 'id' column in one step,
# then preview the first rows.
data = pd.read_csv('train_augmented.csv').drop(columns='id')
data.head()

Unnamed: 0,text,review
0,honestly the best part of this place is the un...,Excellent
1,"found indulge on a whim, based on their huge ""...",Excellent
2,my take on mill street is that it's your class...,Very good
3,i think matt's has had its '5 minutes of fame'...,Bad
4,nobody likes going to the auto body shop..peri...,Excellent


In [358]:
# Give the two remaining columns the names used throughout the notebook.
data.columns = ['Text', 'Label']

# Show the distinct sentiment classes, then a preview of the renamed frame.
print("Sentiment Labels ----------")
print(data['Label'].unique())
print(data.head())

Sentiment Labels ----------
['Excellent' 'Very good' 'Bad' 'Good' 'Very bad']
                                                Text      Label
0  honestly the best part of this place is the un...  Excellent
1  found indulge on a whim, based on their huge "...  Excellent
2  my take on mill street is that it's your class...  Very good
3  i think matt's has had its '5 minutes of fame'...        Bad
4  nobody likes going to the auto body shop..peri...  Excellent


In [359]:
# Expand the categorical label into integer 0/1 indicator columns
# (one 'Label_<class>' column per class) and append them to the frame.
onehot = pd.get_dummies(data['Label'], prefix='Label').astype(int)
data = pd.concat([data, onehot], axis=1)

print("\nData after adding new columns ----------")
print(data.head())



Data after adding new columns ----------
                                                Text      Label  Label_Bad  \
0  honestly the best part of this place is the un...  Excellent          0   
1  found indulge on a whim, based on their huge "...  Excellent          0   
2  my take on mill street is that it's your class...  Very good          0   
3  i think matt's has had its '5 minutes of fame'...        Bad          1   
4  nobody likes going to the auto body shop..peri...  Excellent          0   

   Label_Excellent  Label_Good  Label_Very bad  Label_Very good  
0                1           0               0                0  
1                1           0               0                0  
2                0           0               0                1  
3                0           0               0                0  
4                1           0               0                0  


In [360]:
import re
import string
# Removing the punctuation marks
# Character class matching every character in string.punctuation, compiled once.
# re.escape is required: left unescaped, the backslash in string.punctuation
# escapes the ']' that follows it, so the backslash itself is never matched
# (and the bare '[' triggers a "possible nested set" warning).
_PUNCTUATION_PATTERN = re.compile('[' + re.escape(string.punctuation) + ']')


def remove_punctutations(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        ``text`` without any character listed in ``string.punctuation``
        (backslashes included). Other characters, including whitespace and
        digits, are left untouched.
    """
    return _PUNCTUATION_PATTERN.sub('', text)

# Punctuation-free copy of every review, stored alongside the raw text.
data['Text_Clean'] = data['Text'].apply(remove_punctutations)

In [361]:
# Tokenizing the words
from nltk import word_tokenize
# One token list per cleaned review, in the same order as the rows of `data`.
tokens = list(map(word_tokenize, data['Text_Clean']))

In [362]:
def lowercase_token(tokens):
    """Return a new list holding the lower-cased form of every token.

    The input sequence is not modified; order and length are preserved.
    """
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered

# Lower-case every token of every review (one inner list per review).
lowercased_tokens = list(map(lowercase_token, tokens))

In [363]:
# Removing the stop words
from nltk.corpus import stopwords

stoplist = stopwords.words('english')
# Hashed copy of the stop list: membership tests become constant-time instead
# of scanning the whole list once per token.
_stopword_set = frozenset(stoplist)

def remove_stop_words(tokens):
    """Return the tokens that are not English stop words, in their original order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single text. They are compared as-is against the stop list,
        so they should already be lower-cased by the caller.

    Returns
    -------
    list of str
        A new list without the stop words; the input is not modified.
    """
    # NOTE(review): punctuation is stripped from the text before this step, so
    # any stop-list entries that contain an apostrophe can no longer match their
    # stripped form (e.g. "dont") -- confirm whether that is intended.
    return [word for word in tokens if word not in _stopword_set]

# Drop stop words from every review's lower-cased token list.
filtered_words = list(map(remove_stop_words, lowercased_tokens))

# Space-joined version of each filtered token list (one string per review).
result = [' '.join(review_tokens) for review_tokens in filtered_words]

# Keep both the joined text and the token lists on the frame.
data['Text_Final'] = result
data['Tokens'] = filtered_words
#data = data[['Text_Final', 'Tokens', 'Label', 'happiness', 'sadness', 'surprise', 'anger', 'fear']]

print("\nData after removing punctuation marks, stop words and lower casing ----------")
print(data.head())

#labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good']




Data after removing punctuation marks, stop words and lower casing ----------
                                                Text      Label  Label_Bad  \
0  honestly the best part of this place is the un...  Excellent          0   
1  found indulge on a whim, based on their huge "...  Excellent          0   
2  my take on mill street is that it's your class...  Very good          0   
3  i think matt's has had its '5 minutes of fame'...        Bad          1   
4  nobody likes going to the auto body shop..peri...  Excellent          0   

   Label_Excellent  Label_Good  Label_Very bad  Label_Very good  \
0                1           0               0                0   
1                1           0               0                0   
2                0           0               0                1   
3                0           0               0                0   
4                1           0               0                0   

                                          Text_Cl

In [364]:
# Splitting data into test and train
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
training_data, testing_data = train_test_split(data, test_size=0.2, random_state=42)

print("\nData after splitting into Train and Test sets ----------\n")

# Corpus statistics for the training split: flat word list, per-review token
# counts, and the sorted set of distinct words.
training_words = []
training_sentence_lengths = []
for sentence_tokens in training_data["Tokens"]:
    training_words.extend(sentence_tokens)
    training_sentence_lengths.append(len(sentence_tokens))
training_vocabulary = sorted(set(training_words))

print("%s total of Training words with a vocabulary size of %s" % (len(training_words), len(training_vocabulary)))
print("Max sentence length is %s" % max(training_sentence_lengths))


Data after splitting into Train and Test sets ----------

716095 total of Training words with a vocabulary size of 27484
Max sentence length is 502


In [365]:
# Same corpus statistics as for the training split, computed on the test split.
testing_words = []
testing_sentence_lengths = []
for sentence_tokens in testing_data["Tokens"]:
    testing_words.extend(sentence_tokens)
    testing_sentence_lengths.append(len(sentence_tokens))
testing_vocabulary = sorted(set(testing_words))

print()
print("%s total of Testing words with a vocabulary size of %s" % (len(testing_words), len(testing_vocabulary)))
print("Max sentence length is %s" % max(testing_sentence_lengths))


176639 total of Testing words with a vocabulary size of 15237
Max sentence length is 478


In [366]:
from gensim.models import Word2Vec

# Train the embeddings on the training split only: fitting them on the full
# `data` frame would let vocabulary and co-occurrence statistics from the
# held-out test reviews leak into the features the classifier is evaluated on.
w2v_model = Word2Vec(
    sentences=training_data["Tokens"].tolist(),   # list of token lists
    vector_size=300,            # same as Google News dimension
    window=5,
    min_count=1,                # keep all words
    workers=4
)
word2vec = w2v_model.wv   # keep only the vectors (KeyedVectors)

print("Completed training custom Word2Vec ----------")

Completed training custom Word2Vec ----------


In [367]:

# Getting Embeddings
def get_average_word2vec(tokens, vector, generate_missing=False, k=300):
    """Average the embedding vectors of ``tokens`` into a single vector.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of one text.
    vector : mapping-like
        Supports ``token in vector`` and ``vector[token]`` to look up embeddings.
    generate_missing : bool
        When True, a token absent from ``vector`` contributes a fresh
        ``np.random.rand(k)`` vector; when False it contributes zeros.
    k : int
        Length of the fallback vectors and of the result for empty input.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all tokens, or ``np.zeros(k)`` when ``tokens``
        is empty.
    """
    if len(tokens) == 0:
        return np.zeros(k)

    vectorized = []
    for token in tokens:
        if token in vector:
            vectorized.append(vector[token])
        elif generate_missing:
            vectorized.append(np.random.rand(k))
        else:
            vectorized.append(np.zeros(k))

    total = np.sum(vectorized, axis=0)
    return np.divide(total, len(vectorized))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Return one averaged embedding per row of ``clean_comments``.

    Parameters
    ----------
    vectors : mapping-like
        Embedding lookup passed through to ``get_average_word2vec``.
    clean_comments : pandas.DataFrame
        Must contain a 'Tokens' column holding a token list per row.
    generate_missing : bool
        Passed through to ``get_average_word2vec``.

    Returns
    -------
    list
        The averaged vectors, in row order.
    """
    def _embed(row_tokens):
        return get_average_word2vec(row_tokens, vectors, generate_missing=generate_missing)

    per_row = clean_comments['Tokens'].apply(_embed)
    return list(per_row)

# Averaged Word2Vec vector for every training review; tokens missing from the
# embedding lookup are replaced by random vectors (generate_missing=True).
training_embeddings = get_word2vec_embeddings(
    vectors=word2vec,
    clean_comments=training_data,
    generate_missing=True,
)

In [368]:
# Tokenizing and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

# Fit the tokenizer on the training texts only, then map them to integer sequences.
training_texts = training_data["Text_Final"].tolist()
tokenizer = Tokenizer(num_words=len(training_vocabulary), lower=True, char_level=False)
tokenizer.fit_on_texts(training_texts)
training_sequences = tokenizer.texts_to_sequences(training_texts)
training_word_index = tokenizer.word_index

print('\nFound %s unique tokens.' % len(training_word_index))

# Pad / truncate every sequence to MAX_SEQUENCE_LENGTH entries.
# NOTE(review): the longest training review has far more tokens than this
# limit, so most of such reviews is cut off -- confirm this is intended.
training_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# create embedding matrix for CNN
# Row i holds the vector of the word whose tokenizer index is i; row 0 is never
# assigned and stays all-zero. Words missing from word2vec get a random vector.
train_embedding_weights = np.zeros((len(training_word_index)+1, EMBEDDING_DIM))
for word, index in training_word_index.items():
    if word in word2vec:
        train_embedding_weights[index, :] = word2vec[word]
    else:
        train_embedding_weights[index, :] = np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

# Encode the test texts with the tokenizer fitted on the training texts.
testing_texts = testing_data["Text_Final"].tolist()
testing_sequences = tokenizer.texts_to_sequences(testing_texts)
testing_cnn_data = pad_sequences(testing_sequences, maxlen=MAX_SEQUENCE_LENGTH)


Found 27482 unique tokens.
(27483, 300)


In [369]:
from keras.optimizers import Adam
# Defining the CNN
def ConvolutionalNeuralNetwork(embeddings,
                               max_sequence_length,
                               num_of_words,
                               embedding_dim,
                               labels_index,
                               learning_rate=0.001):
    """Build and compile a multi-kernel 1-D CNN text classifier.

    Architecture: frozen embedding layer -> five parallel Conv1D branches
    (kernel sizes 2..6, 100 filters each, ReLU) each followed by global max
    pooling -> concatenation -> dropout(0.5) -> softmax output with L2
    regularisation.

    Parameters
    ----------
    embeddings : array-like
        Initial embedding matrix, passed as the embedding layer's weights
        (expected to be ``num_of_words`` x ``embedding_dim``).
    max_sequence_length : int
        Length of the integer input sequences.
    num_of_words : int
        Number of rows of the embedding layer.
    embedding_dim : int
        Size of each embedding vector.
    labels_index : int
        Number of output classes (units of the softmax layer).
    learning_rate : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    keras.Model
        The compiled model (categorical cross-entropy loss, 'acc' metric).
        The model summary is printed as a side effect.
    """
    # Imported here so the function works as soon as it is defined; otherwise
    # these names only exist after a later cell has run its imports.
    import keras
    from keras.layers import Dense, Dropout, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
    from keras.models import Model

    # `input_length` is intentionally not passed: the sequence length is
    # already fixed by the Input layer below and the argument is deprecated.
    # trainable=False keeps the pre-trained vectors frozen during training.
    embedding_layer = Embedding(num_of_words, embedding_dim, weights=[embeddings], trainable=False)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    sliding_window_heights = [2,3,4,5,6]

    # One convolution + global-max-pool branch per kernel size.
    for sliding_window_height in sliding_window_heights:
        l_conv = Conv1D(filters=100, kernel_size=sliding_window_height, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.5)(l_merge)

    predictions = Dense(labels_index, activation='softmax', kernel_regularizer=keras.regularizers.l2(0.001))(x)

    # optimizer
    optimizer_ = Adam(learning_rate=learning_rate)

    model = Model(sequence_input, predictions)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer_,
                  metrics=['acc'])
    model.summary()
    return model

In [370]:
# Sanity check: list the frame's columns (the 'Label_*' one-hot columns shown
# here are the ones selected as training targets further down).
print(data.columns)

Index(['Text', 'Label', 'Label_Bad', 'Label_Excellent', 'Label_Good',
       'Label_Very bad', 'Label_Very good', 'Text_Clean', 'Text_Final',
       'Tokens'],
      dtype='object')


In [None]:
from keras.callbacks import EarlyStopping
import keras
from keras.layers import Dense, Dropout, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.models import Model
# Training the CNN
print("\nTraining the CNN----------")

# One-hot target columns, in the order the softmax outputs will follow.
#labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good']
labels = ['Label_Bad', 'Label_Excellent', 'Label_Good', 'Label_Very bad', 'Label_Very good']

model = ConvolutionalNeuralNetwork(
    train_embedding_weights,
    MAX_SEQUENCE_LENGTH,
    len(training_word_index)+1,
    EMBEDDING_DIM,
    len(labels),
    learning_rate=0.0008,
)

# Inputs are the padded index sequences; targets are the one-hot label columns.
x_train = training_cnn_data
y_train = training_data[labels].values
y_tr = y_train

num_epochs = 100
batch_size = 32

# Stop once val_loss has not improved by at least min_delta for `patience`
# consecutive epochs, and roll back to the best weights seen.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=2,
    min_delta=0.0001,
    restore_best_weights=True,
)

hist = model.fit(
    x_train,
    y_tr,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
    batch_size=batch_size,
    callbacks=[es],
)

print("\nCNN trained successfully ----------")


Training the CNN----------




Epoch 1/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - acc: 0.3604 - loss: 1.5959 - val_acc: 0.4401 - val_loss: 1.3149
Epoch 2/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - acc: 0.4725 - loss: 1.2620 - val_acc: 0.5374 - val_loss: 1.1597
Epoch 3/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - acc: 0.5307 - loss: 1.1474 - val_acc: 0.5210 - val_loss: 1.1214
Epoch 4/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - acc: 0.5746 - loss: 1.0601 - val_acc: 0.5814 - val_loss: 1.0307
Epoch 5/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - acc: 0.6025 - loss: 0.9982 - val_acc: 0.5977 - val_loss: 1.0201
Epoch 6/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - acc: 0.6320 - loss: 0.9236 - val_acc: 0.6018 - val_loss: 0.9823
Epoch 7/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict

# 1. Make predictions
# Run the trained model on the padded test sequences; each row of the result
# is indexed by class position when decoded below.
predictions = model.predict(testing_cnn_data, batch_size=1024, verbose=1)
# Human-readable class names. Their order must match the order of the
# 'Label_*' columns used as training targets, because the arg-max position of
# a prediction is used to index into this list. This rebinds `labels`, which
# previously held the one-hot column names.
labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good']

In [None]:
# Convert predicted probabilities to labels: take the arg-max class position of
# each row and look up its name.
predicted_class_ids = np.argmax(predictions, axis=1)
prediction_labels = [labels[class_id] for class_id in predicted_class_ids]

# 2. Calculate accuracy
# assign() works on a copy, so the frame produced by the split is not modified.
testing_data = testing_data.assign(predicted=prediction_labels)
is_correct = testing_data['Label'] == testing_data['predicted']
correct_count = is_correct.sum()
accuracy = is_correct.mean() * 100

print(f"Total predictions: {len(testing_data)}")
print(f"Correct predictions: {correct_count}")
print(f"Accuracy: {accuracy:.2f}%")


In [None]:
# # Predict the test csv
# test_df = pd.read_csv("test.csv")

# # Assuming you have the same tokenizer you used for training
# test_sequences = tokenizer.texts_to_sequences(test_df['text'])
# testing_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# predictions = model.predict(testing_cnn_data, batch_size=32)

# # labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good'] 
# prediction_labels = [labels[np.argmax(p)] for p in predictions]

# submission = pd.DataFrame({
#     "id": test_df['id'],
#     "review": prediction_labels
# })

# submission.to_csv("submission.csv", index=False)