### Import several essential libraries for data manipulation, preprocessing, and model building

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

### Mount Google Drive in a Google Colab environment



In [None]:
# Attach Google Drive to the Colab filesystem at a fixed mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load training and testing data from TSV (tab-separated values) files

In [None]:
# Load data
# Both files are tab-separated and live in the same Drive project folder.
train_path = '/content/drive/My Drive/Projects/Sentiment Analysis on Movie Reviews- Classify the sentiment of sentences from the Rotten Tomatoes dataset/train.tsv'
test_path = '/content/drive/My Drive/Projects/Sentiment Analysis on Movie Reviews- Classify the sentiment of sentences from the Rotten Tomatoes dataset/test.tsv'
train_data = pd.read_csv(train_path, delimiter='\t')
test_data = pd.read_csv(test_path, delimiter='\t')

# Extract phrases and labels
# read_csv converts cells such as "NA", "null" or an empty field to NaN (a float).
# Replace any missing phrase with an empty string so every entry handed to the
# text tokenizer is a str. Rows are kept, so phrases and labels stay aligned.
# NOTE(review): whether this dataset actually contains such cells is not visible
# here; the fill is a no-op when it does not.
phrases = train_data['Phrase'].fillna('').values
labels = train_data['Sentiment'].values

### Set hyperparameters

In [None]:
# Hyperparameters shared by the tokenization, padding and embedding steps
vocab_size, max_length, embedding_dim = 10000, 100, 16  # vocabulary cap, padded sequence length, embedding width
trunc_type = padding_type = 'post'  # truncate and pad at the end of each sequence
oov_tok = '<oov>'  # placeholder token for out-of-vocabulary words

### Model and Prediction

In [None]:
# Seed TensorFlow's global random generator so weight initialisation and batch
# shuffling are repeatable between runs (the data split below is already seeded).
# GPU kernels may still introduce small run-to-run differences.
seed = 42
tf.random.set_seed(seed)

# Tokenize phrases
# Build the word index from the training phrases; words outside the
# `vocab_size` most frequent ones are mapped to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(phrases)

# Convert phrases to sequences and pad them
# Every phrase becomes a fixed-length row of `max_length` integer ids.
sequences = tokenizer.texts_to_sequences(phrases)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Split data into training and validation sets
# 80% train / 20% validation, with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=seed)

# Define the model
# Embedding -> average over the sequence axis -> small dense classifier.
# NOTE(review): the Embedding layer does not mask padding, so the pooling layer
# averages over all `max_length` positions, including the zero-padded ones.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    # Five output units with softmax: one probability per class id 0-4.
    tf.keras.layers.Dense(5, activation='softmax')
])

# Compile the model
# The sparse loss takes integer class ids directly (no one-hot encoding).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model
num_epochs = 10
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_val, y_val), verbose=2)

# Prepare test data
# Reuse the tokenizer fitted on the training phrases so ids match the model's
# vocabulary. Missing phrases (cells read_csv parsed as NaN) are replaced with
# an empty string so each entry is a str; such rows become all-padding
# sequences and still receive a prediction.
test_phrases = test_data['Phrase'].fillna('').values
test_sequences = tokenizer.texts_to_sequences(test_phrases)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Predict sentiment labels for test data
# `predictions` holds one row of 5 class probabilities per test phrase.
predictions = model.predict(test_padded_sequences)

# Convert predictions to sentiment labels
# The index of the largest probability is the predicted class id.
predicted_labels = np.argmax(predictions, axis=1)

# Save predictions to a CSV file
# Written twice: once to the working directory and once to the Drive project folder.
output = pd.DataFrame({'PhraseId': test_data['PhraseId'], 'Sentiment': predicted_labels})
output.to_csv('submission.csv', index=False)
output.to_csv('/content/drive/My Drive/Projects/Sentiment Analysis on Movie Reviews- Classify the sentiment of sentences from the Rotten Tomatoes dataset/submission.csv', index=False)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d_3   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 64)                1088      
                                                                 
 dense_7 (Dense)             (None, 5)                 325       
                                                                 
Total params: 161,413
Trainable params: 161,413
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
3902/3902 - 18s - loss: 1.2164 - accuracy: 0.5200 - val_loss: 1.1678 - val_accuracy: 0.5259 - 18s/epoch 

### Visualize the accuracy and loss

In [None]:
# Plot graphs
import matplotlib.pyplot as plt

def plot_metric(history, metric):
    """Plot a training metric per epoch, plus its validation curve when recorded.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            sequence of per-epoch values (e.g. the value returned by
            ``model.fit``).
        metric: Name of the training metric to plot, such as ``"accuracy"`` or
            ``"loss"``. The validation series is looked up as ``"val_" + metric``.

    Raises:
        KeyError: If ``metric`` itself is not present in ``history.history``.
    """
    curves = history.history
    val_key = 'val_' + metric

    plt.plot(curves[metric])
    legend_labels = [metric]

    # The validation series only exists when validation data was supplied to
    # training; skip it instead of failing with a KeyError.
    if val_key in curves:
        plt.plot(curves[val_key])
        legend_labels.append(val_key)

    plt.legend(legend_labels)
    plt.title(metric + " per epoch")
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.show()

# Draw one figure per tracked quantity, in the same order as before.
for metric_name in ("accuracy", "loss"):
    plot_metric(history, metric_name)