# Data Import & Preprocessing

In [28]:
import numpy as np
import pandas as pd

In [29]:
data = pd.read_csv('train_augmented.csv')
data = data.drop('id', axis=1)
data.head()

Unnamed: 0,text,review
0,honestly the best part of this place is the un...,Excellent
1,"found indulge on a whim, based on their huge ""...",Excellent
2,my take on mill street is that it's your class...,Very good
3,i think matt's has had its '5 minutes of fame'...,Bad
4,nobody likes going to the auto body shop..peri...,Excellent


In [30]:
# Modifying the table columns for the loaded data
data.columns = ['Text', 'Label']

print("Sentiment Labels ----------")
print(data.Label.unique())
print(data.head())

Sentiment Labels ----------
['Excellent' 'Very good' 'Bad' 'Good' 'Very bad']
                                                Text      Label
0  honestly the best part of this place is the un...  Excellent
1  found indulge on a whim, based on their huge "...  Excellent
2  my take on mill street is that it's your class...  Very good
3  i think matt's has had its '5 minutes of fame'...        Bad
4  nobody likes going to the auto body shop..peri...  Excellent


### Applying One Hot Encoding for output labels

In [31]:
# apply one hot encoding
onehot = pd.get_dummies(data['Label'], prefix='Label')
onehot = onehot.astype(int)
data = pd.concat([data, onehot], axis=1)

print("\nData after adding new columns ----------")
print(data.head())



Data after adding new columns ----------
                                                Text      Label  Label_Bad  \
0  honestly the best part of this place is the un...  Excellent          0   
1  found indulge on a whim, based on their huge "...  Excellent          0   
2  my take on mill street is that it's your class...  Very good          0   
3  i think matt's has had its '5 minutes of fame'...        Bad          1   
4  nobody likes going to the auto body shop..peri...  Excellent          0   

   Label_Excellent  Label_Good  Label_Very bad  Label_Very good  
0                1           0               0                0  
1                1           0               0                0  
2                0           0               0                1  
3                0           0               0                0  
4                1           0               0                0  


### Remove punctuation from Text

In [32]:
import re
import string
# Removing the punctuation marks
def remove_punctutations(text):
    text_clean = ''
    text_clean = re.sub('['+string.punctuation+']', '', text)
    return text_clean

data['Text_Clean'] = data['Text'].apply(lambda x: remove_punctutations(x))

### Text lowecasing

In [33]:
# Tokenizing the words
from nltk import word_tokenize
tokens = [word_tokenize(sentence) for sentence in data.Text_Clean]

In [34]:
def lowercase_token(tokens):
    return [word.lower() for word in tokens]

# Lowercasing the tokens
lowercased_tokens = [lowercase_token(token) for token in tokens]

### Removing stop words

In [35]:
# Removing the stop words
from nltk.corpus import stopwords

stoplist = stopwords.words('english')

def remove_stop_words(tokens):
    return [word for word in tokens if word not in stoplist]

filtered_words = [remove_stop_words(word) for word in lowercased_tokens]

result = [' '.join(word) for word in filtered_words]

data['Text_Final'] = result
data['Tokens'] = filtered_words
#data = data[['Text_Final', 'Tokens', 'Label', 'happiness', 'sadness', 'surprise', 'anger', 'fear']]

print("\nData after removing punctuation marks, stop words and lower casing ----------")
print(data.head())

#labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good']




Data after removing punctuation marks, stop words and lower casing ----------
                                                Text      Label  Label_Bad  \
0  honestly the best part of this place is the un...  Excellent          0   
1  found indulge on a whim, based on their huge "...  Excellent          0   
2  my take on mill street is that it's your class...  Very good          0   
3  i think matt's has had its '5 minutes of fame'...        Bad          1   
4  nobody likes going to the auto body shop..peri...  Excellent          0   

   Label_Excellent  Label_Good  Label_Very bad  Label_Very good  \
0                1           0               0                0   
1                1           0               0                0   
2                0           0               0                1   
3                0           0               0                0   
4                1           0               0                0   

                                          Text_Cl

### Data split into Train, Test

In [36]:
# Splitting data into test and train
from sklearn.model_selection import train_test_split
training_data, testing_data = train_test_split(data, test_size=0.2, random_state=42)

print("\nData after splitting into Train and Test sets ----------\n")

training_words = [word for tokens in training_data["Tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in training_data["Tokens"]]
training_vocabulary = sorted(list(set(training_words)))
print("%s total of Training words with a vocabulary size of %s" % (len(training_words), len(training_vocabulary)))
print("Max sentence length is %s" % max(training_sentence_lengths))


Data after splitting into Train and Test sets ----------

716095 total of Training words with a vocabulary size of 27484
Max sentence length is 502


In [37]:
testing_words = [word for tokens in testing_data["Tokens"] for word in tokens]
testing_sentence_lengths = [len(tokens) for tokens in testing_data["Tokens"]]
testing_vocabulary = sorted(list(set(testing_words)))
print()
print("%s total of Testing words with a vocabulary size of %s" % (len(testing_words), len(testing_vocabulary)))
print("Max sentence length is %s" % max(testing_sentence_lengths))


176639 total of Testing words with a vocabulary size of 15237
Max sentence length is 478


### Applying Word2Vec

In [38]:
from gensim.models import Word2Vec

word2vec = Word2Vec(
    sentences=data["Tokens"],   # list of token lists
    vector_size=300,            
    window=5,
    min_count=1,             
    workers=4
)
word2vec = word2vec.wv  

print("Completed training custom Word2Vec ----------")

Completed training custom Word2Vec ----------


In [39]:
# Getting Embeddings
def get_average_word2vec(tokens, vector, generate_missing=False, k=300):
    if len(tokens)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[token] if token in vector else np.random.rand(k) for token in tokens]
    else:
        vectorized = [vector[token] if token in vector else np.zeros(k) for token in tokens]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['Tokens'].apply(lambda x: get_average_word2vec(x, vectors,
                                                                                generate_missing=generate_missing))
    return list(embeddings)

training_embeddings = get_word2vec_embeddings(word2vec, training_data, generate_missing=True)

### Tokenization

In [40]:
# Tokenizing and Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

tokenizer = Tokenizer(num_words=len(training_vocabulary), lower=True, char_level=False)
tokenizer.fit_on_texts(training_data["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(training_data["Text_Final"].tolist())
training_word_index = tokenizer.word_index

print('\nFound %s unique tokens.' % len(training_word_index))

training_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# create embedding matrix for CNN
train_embedding_weights = np.zeros((len(training_word_index)+1, EMBEDDING_DIM))
for word,index in training_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

testing_sequences = tokenizer.texts_to_sequences(testing_data["Text_Final"].tolist())
testing_cnn_data = pad_sequences(testing_sequences, maxlen=MAX_SEQUENCE_LENGTH)


Found 27482 unique tokens.
(27483, 300)


## Defining CNN

In [41]:
from keras.optimizers import Adam
# Defining the CNN
def ConvolutionalNeuralNetwork(embeddings,
                               max_sequence_length,
                               num_of_words,
                               embedding_dim,
                               labels_index,
                               learning_rate=0.001,
                               drop = 0.5):

    embedding_layer = Embedding(num_of_words, embedding_dim, weights=[embeddings], input_length=max_sequence_length, trainable=False)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    sliding_window_heights = [2,3,4,5,6]

    for sliding_window_height in sliding_window_heights:
        l_conv = Conv1D(filters=100, kernel_size=sliding_window_height, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)

    l_merge = concatenate(convs, axis=1)

    x = Dropout(drop)(l_merge)

    predictions = Dense(labels_index, activation='softmax', kernel_regularizer=keras.regularizers.l2(0.001))(x)

    # optimizer
    optimizer_ = Adam(learning_rate=learning_rate)

    model = Model(sequence_input, predictions)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer_,
                  metrics=['acc'])
    model.summary()
    return model

In [42]:
print(data.columns)

Index(['Text', 'Label', 'Label_Bad', 'Label_Excellent', 'Label_Good',
       'Label_Very bad', 'Label_Very good', 'Text_Clean', 'Text_Final',
       'Tokens'],
      dtype='object')


# Grid Search for best parameters (LR, Drop)

In [43]:
# from keras.callbacks import EarlyStopping
# import keras
# from keras.layers import Dense, Dropout, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
# from keras.models import Model
# from keras.optimizers import Adam
# import numpy as np

# # Training the CNN
# print("\nTraining the CNN----------")

# # Define parameter grid
# learning_rates = [0.0001, 0.0005, 0.001, 0.005, 0.00001]
# dropout_rates = [0.2,0.3, 0.4, 0.5, 0.6, 0.7]
# labels = ['Label_Bad', 'Label_Excellent', 'Label_Good', 'Label_Very bad', 'Label_Very good']

# y_train = training_data[labels].values
# x_train = training_cnn_data
# y_tr = y_train

# num_epochs = 100
# batch_size = 32

# best_accuracy = 0
# best_params = {}
# best_model = None
# results = []

# # Grid search over parameters
# for lr in learning_rates:
#     for drop_rate in dropout_rates:
#         print(f"\n{'='*60}")
#         print(f"Testing: Learning Rate={lr}, Dropout Rate={drop_rate}")
#         print(f"{'='*60}")

#         # Create model with current parameters
#         model = ConvolutionalNeuralNetwork(
#             train_embedding_weights,
#             MAX_SEQUENCE_LENGTH,
#             len(training_word_index)+1,
#             EMBEDDING_DIM,
#             len(list(labels)),
#             learning_rate=lr,
#             drop=drop_rate
#         )

#         # Early stopping callback
#         es = EarlyStopping(
#             monitor='val_loss',
#             mode='min',
#             verbose=0,  # Set to 0 to reduce verbosity during grid search
#             patience=2,
#             min_delta=0.0001,
#             restore_best_weights=True
#         )

#         # Train model
#         history = model.fit(
#             x_train,
#             y_tr,
#             epochs=num_epochs,
#             validation_split=0.1,
#             shuffle=True,
#             batch_size=batch_size,
#             callbacks=[es],
#             verbose=1  # Set to 0 to reduce training output
#         )

#         # Get best validation accuracy
#         best_val_acc = max(history.history['val_acc'])
#         best_val_loss = min(history.history['val_loss'])

#         # Store results
#         results.append({
#             'learning_rate': lr,
#             'dropout_rate': drop_rate,
#             'best_val_accuracy': best_val_acc,
#             'best_val_loss': best_val_loss,
#             'epochs_trained': len(history.history['val_loss']),
#             'history': history.history
#         })

#         print(f"Best Validation Accuracy: {best_val_acc:.4f}")
#         print(f"Best Validation Loss: {best_val_loss:.4f}")
#         print(f"Epochs trained: {len(history.history['val_loss'])}")

#         # Update best model if this one is better
#         if best_val_acc > best_accuracy:
#             best_accuracy = best_val_acc
#             best_params = {'learning_rate': lr, 'dropout_rate': drop_rate}
#             best_model = model
#             print(f"*** New best model! ***")

# print("\n" + "="*60)
# print("GRID SEARCH COMPLETE")
# print("="*60)

# # Print summary of results
# print("\nResults Summary:")
# print("-"*40)
# for i, result in enumerate(results):
#     print(f"Config {i+1}: LR={result['learning_rate']:.4f}, Drop={result['dropout_rate']:.1f}, "
#           f"Val_Acc={result['best_val_accuracy']:.4f}, Val_Loss={result['best_val_loss']:.4f}")

# print("\n" + "="*60)
# print(f"BEST CONFIGURATION:")
# print(f"Learning Rate: {best_params['learning_rate']}")
# print(f"Dropout Rate: {best_params['dropout_rate']}")
# print(f"Best Validation Accuracy: {best_accuracy:.4f}")
# print("="*60)

# # Save or use the best model
# print("\nBest model is ready for use!")

# # If you want to retrain the best model on full data or make predictions:
# # final_model = ConvolutionalNeuralNetwork(
# #     train_embedding_weights,
# #     MAX_SEQUENCE_LENGTH,
# #     len(training_word_index)+1,
# #     EMBEDDING_DIM,
# #     len(list(labels)),
# #     learning_rate=best_params['learning_rate'],
# #     drop=best_params['dropout_rate']
# # )
# #
# # # Train on full data without validation split
# # final_model.fit(x_train, y_tr, epochs=num_epochs, batch_size=batch_size, verbose=1)

# Train the CNN with the best params

In [44]:
from keras.callbacks import EarlyStopping
import keras
from keras.layers import Dense, Dropout, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.models import Model
# Training the CNN
print("\nTraining the CNN----------")

#labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good']
labels = ['Label_Bad', 'Label_Excellent', 'Label_Good', 'Label_Very bad', 'Label_Very good']

model = ConvolutionalNeuralNetwork(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(training_word_index)+1, EMBEDDING_DIM,
               len(list(labels)), learning_rate=0.0001, drop = 0.5)

y_train = training_data[labels].values
x_train = training_cnn_data
y_tr = y_train

num_epochs = 100
batch_size = 32

es = EarlyStopping(
    monitor='val_loss',
    mode='min',verbose=1,
    patience = 2,
    min_delta = 0.0001,
    restore_best_weights = True
)

hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size, callbacks=[es])

print("\nCNN trained successfully ----------")
# Find the epoch with minimum validation loss
best_epoch = np.argmin(hist.history['val_loss']) + 1  # +1 for human-readable
best_val_loss = min(hist.history['val_loss'])

print(f"\nTraining Summary:")
print(f"   Total epochs run: {len(hist.history['val_loss'])}")
print(f"   Best epoch: {best_epoch}")
print(f"   Best validation loss: {best_val_loss:.4f}")
print(f"   Final validation loss: {hist.history['val_loss'][-1]:.4f}")



Training the CNN----------




Epoch 1/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - acc: 0.2670 - loss: 1.8720 - val_acc: 0.4350 - val_loss: 1.3822
Epoch 2/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - acc: 0.3877 - loss: 1.4585 - val_acc: 0.4862 - val_loss: 1.2958
Epoch 3/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - acc: 0.4555 - loss: 1.2962 - val_acc: 0.4759 - val_loss: 1.2767
Epoch 4/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - acc: 0.5045 - loss: 1.2261 - val_acc: 0.4923 - val_loss: 1.2261
Epoch 5/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - acc: 0.5381 - loss: 1.1587 - val_acc: 0.5169 - val_loss: 1.1905
Epoch 6/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - acc: 0.5613 - loss: 1.1168 - val_acc: 0.5159 - val_loss: 1.1757
Epoch 7/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0

# Calculate testing accuracy

In [45]:
import numpy as np
import pandas as pd
from collections import defaultdict

# 1. Make predictions
predictions = model.predict(testing_cnn_data, batch_size=1024, verbose=1)
labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good']

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step


In [46]:
# Convert predicted probabilities to labels
prediction_labels = [labels[np.argmax(p)] for p in predictions]

# 2. Calculate accuracy
testing_data = testing_data.copy()  # avoid modifying original DataFrame
testing_data['predicted'] = prediction_labels
accuracy = (testing_data['Label'] == testing_data['predicted']).mean() * 100

print(f"Total predictions: {len(testing_data)}")
print(f"Correct predictions: {(testing_data['Label'] == testing_data['predicted']).sum()}")
print(f"Accuracy: {accuracy:.2f}%")


Total predictions: 2443
Correct predictions: 1634
Accuracy: 66.88%


# Running Kaggle test cases

In [49]:
# Predict the test csv
test_df = pd.read_csv("test.csv")


# Applying same preprocessing pipeline
print("Applying preprocessing pipeline to test data...")

# punctuation removal
test_df['Text_Clean'] = test_df['text'].apply(lambda x: remove_punctutations(x))

# Tokenize
test_tokens = [word_tokenize(sentence) for sentence in test_df['Text_Clean']]

# Lowercase 
test_lowercased_tokens = [lowercase_token(token) for token in test_tokens]

# Remove stopwords
test_filtered_words = [remove_stop_words(word) for word in test_lowercased_tokens]

# Rejoin tokens to string
test_result = [' '.join(word) for word in test_filtered_words]
test_df['Text_Final'] = test_result
test_df['Tokens'] = test_filtered_words

print(f"Preprocessed {len(test_df)} test samples")
print(f"Sample original text: {test_df['text'].iloc[0][:100]}...")
print(f"Sample cleaned text: {test_df['Text_Final'].iloc[0][:100]}...")


# using the same tokenizer used for training
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
testing_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

predictions = model.predict(testing_cnn_data, batch_size=32)

# labels = ['Bad', 'Excellent', 'Good', 'Very bad', 'Very good']
prediction_labels = [labels[np.argmax(p)] for p in predictions]

submission = pd.DataFrame({
    "id": test_df['id'],
    "review": prediction_labels
})

submission.to_csv("submission.csv", index=False)

Applying preprocessing pipeline to test data...
Preprocessed 3000 test samples
Sample original text: we went back here again this past weekend...actually we went there 3 more times this past weekend al...
Sample cleaned text: went back past weekendactually went 3 times past weekend alone couldnt resist awesome service great ...
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
