##Python installations

In [2]:
!pip install pandas==2.1.3
!pip install nltk==3.6.5
!pip install scikit-learn==1.3.2
!pip install scipy==1.11.4
!pip install threadpoolctl==3.2.0
!pip install tensorflow==2.8.0



##Imports

In [3]:
### Imports
## Data loading and tabular handling
import pandas as pd

## Text preprocessing (stopword list and word tokenizer)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
## Regular expressions (used by clean_text to strip punctuation)
import re

## Baseline DummyClassifier, train/test split and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
# Download the NLTK 'stopwords' corpus and 'punkt' tokenizer data used by the
# preprocessing cells (side effect: writes to the local nltk_data directory)
nltk.download('stopwords')
nltk.download('punkt')

# RNN / LSTM models and hyperparameter search
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
# NOTE(review): CuDNNLSTM is imported but not referenced in the visible cells
from keras.layers import Embedding, SimpleRNN, Dense, LSTM, CuDNNLSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


##Gather the dataset for sentiment analysis from the UCI Machine Learning Repository

In [5]:
## Load the three labelled review files (one phrase and one tag per line)

# Every file is tab-separated, has no header row, and is read with quoting=3
# (csv.QUOTE_NONE) so quote characters inside a review stay part of the text.
file1, file2, file3 = (
    pd.read_csv(path, delimiter = '\t', quoting = 3, header=None, names=["Phrase", "tag"])
    for path in ("amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt")
)

# Stack the three sources into a single frame with a fresh 0..n-1 index
combined_df = pd.concat([file1, file2, file3], ignore_index=True)

# Shape of each source file, then of the combined frame (one per line)
print(*(frame.shape for frame in (file1, file2, file3, combined_df)), sep='\n')

# Relative frequency of each unique value in the 'tag' column
print(combined_df['tag'].value_counts() / len(combined_df['tag']))


# Show the combined frame before any preprocessing
display(combined_df)



(1000, 2)
(1000, 2)
(1000, 2)
(3000, 2)
tag
0    0.5
1    0.5
Name: count, dtype: float64


Unnamed: 0,Phrase,tag
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
2995,I think food should have flavor and texture an...,0
2996,Appetite instantly gone.,0
2997,Overall I was not impressed and would not go b...,0
2998,"The whole experience was underwhelming, and I ...",0


##Preprocess the text data, including tokenization, lowercasing, and removing stopwords.

In [6]:
# Punctuation stripping via a regular expression
def clean_text(text):
    """Return ``text`` without punctuation or other special characters.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace (``\\s``) is deleted; everything else
    is left untouched, including the original casing and spacing.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

In [7]:
# English stopword list from NLTK, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Step 1: strip punctuation from the raw phrases
combined_df['Cleaned_Phrase'] = combined_df['Phrase'].apply(clean_text)

# Step 2: tokenize each cleaned phrase, lowercase every token and drop the
# English stopwords, all in a single pass per phrase.
# NOTE(review): stopword removal also drops negations such as "not" (see row
# 2997 in the displayed output), which may matter for sentiment -- confirm
# this is intended.
combined_df['Tokenized_Phrase'] = combined_df['Cleaned_Phrase'].apply(
    lambda phrase: [
        token
        for token in map(str.lower, word_tokenize(phrase))
        if token not in stop_words
    ]
)

# Show the original phrases next to their preprocessed tokens
display(combined_df[['Phrase', 'Tokenized_Phrase']])


Unnamed: 0,Phrase,Tokenized_Phrase
0,So there is no way for me to plug it in here i...,"[way, plug, us, unless, go, converter]"
1,"Good case, Excellent value.","[good, case, excellent, value]"
2,Great for the jawbone.,"[great, jawbone]"
3,Tied to charger for conversations lasting more...,"[tied, charger, conversations, lasting, 45, mi..."
4,The mic is great.,"[mic, great]"
...,...,...
2995,I think food should have flavor and texture an...,"[think, food, flavor, texture, lacking]"
2996,Appetite instantly gone.,"[appetite, instantly, gone]"
2997,Overall I was not impressed and would not go b...,"[overall, impressed, would, go, back]"
2998,"The whole experience was underwhelming, and I ...","[whole, experience, underwhelming, think, well..."


##Implement a DummyClassifier

In [8]:
# 'X' holds the preprocessed token lists and 'y' the sentiment labels
X = combined_df['Tokenized_Phrase']
y = combined_df['tag']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline strategy: random predictions that follow the class ratio of the
# training set. random_state is fixed so the baseline metrics are the same on
# every run instead of changing with each execution of the notebook.
dummy_clf = DummyClassifier(strategy="stratified", random_state=42)

# Train the baseline model
dummy_clf.fit(X_train, y_train)


###Calculate evaluation metrics (Test)

In [9]:
# Baseline predictions for the held-out test set
y_pred_test = dummy_clf.predict(X_test)

# Score the predictions with every metric, in display order
accuracy_test, precision_test, recall_test, f1_test, kappa_test = (
    score(y_test, y_pred_test)
    for score in (accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score)
)

# Map each display name to its score
metrics_test = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'Kappa'),
        (accuracy_test, precision_test, recall_test, f1_test, kappa_test),
    )
)

# Two-column table: one row per metric
metrics_df_test = pd.DataFrame(list(metrics_test.items()), columns=['Metric', 'Value'])

# Heading followed by the table
print('\n' + 'Metrics for test set' + '\n')
display(metrics_df_test)



Metrics for test set



Unnamed: 0,Metric,Value
0,Accuracy,0.491667
1,Precision,0.493506
2,Recall,0.504983
3,F1 Score,0.499179
4,Kappa,-0.016757


###Calculate evaluation metrics (Train)

In [10]:
## Baseline predictions for the training set
y_pred_train = dummy_clf.predict(X_train)

# Score the predictions with every metric, in display order
accuracy, precision, recall, f1, kappa = (
    score(y_train, y_pred_train)
    for score in (accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score)
)

# Map each display name to its score
metrics_train = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'Kappa'),
        (accuracy, precision, recall, f1, kappa),
    )
)

# Two-column table: one row per metric
metrics_df_train = pd.DataFrame(list(metrics_train.items()), columns=['Metric', 'Value'])

# Heading followed by the table
print('\n' + 'Metrics for train set' + '\n')
display(metrics_df_train)


Metrics for train set



Unnamed: 0,Metric,Value
0,Accuracy,0.484583
1,Precision,0.484323
2,Recall,0.489575
3,F1 Score,0.486935
4,Kappa,-0.030825


##Implement a vanilla RNN sentiment analysis model

###Tokenize text

In [11]:
# Maximum number of words the tokenizer considers
max_words = 10000
# Fixed length every index sequence is adjusted to
maxlen = 100

# Build the tokenizer and fit it on the training split only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Turn the training and test texts into sequences of word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Bring every sequence to the same fixed length
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=maxlen) for sequences in (X_train_seq, X_test_seq)
)

###Create RNN model

In [12]:

## Architecture of the vanilla RNN classifier
def create_rnn_model(units=50):
    """Build and compile the SimpleRNN sentiment classifier.

    The network is an Embedding layer (vocabulary size ``max_words``,
    50-dimensional vectors, input length ``maxlen``), a SimpleRNN layer with
    ReLU activation, and a single sigmoid output unit. It is compiled with the
    Adam optimizer, binary cross-entropy loss and accuracy as metric.

    Args:
        units: number of units in the SimpleRNN layer.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        Embedding(input_dim=max_words, output_dim=50, input_length=maxlen),
        SimpleRNN(units, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

###Configure GridSearchCV to find the best hyperparameters

In [13]:
# Hyperparameter search for the vanilla RNN
# Expose the Keras model builder through the scikit-learn estimator interface
rnn_model = KerasClassifier(
    build_fn=create_rnn_model,
    epochs=5,
    batch_size=32,
    verbose=1,
)

# Candidate values explored by the search: RNN units and training batch size
param_grid = dict(units=[50, 100, 150], batch_size=[32, 64, 128])

# 3-fold cross-validated search, candidates ranked by accuracy
grid_search = GridSearchCV(
    estimator=rnn_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)

# Run the search on the padded training sequences
grid_result = grid_search.fit(X_train_pad, y_train)


  rnn_model = KerasClassifier(build_fn=create_rnn_model, epochs=5, batch_size=32, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


###Train the model with the best hyperparameters

In [14]:
# Pull the winning hyperparameters out of the grid search result
best_units, best_batch_size = (
    grid_result.best_params_[key] for key in ('units', 'batch_size')
)

# Build a fresh RNN with the best number of units and train it, keeping 20% of
# the training data aside for validation
best_rnn_model = create_rnn_model(units=best_units)
best_rnn_model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=best_batch_size,
    validation_split=0.2,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7d9562131e10>

###Evaluate the performance of the model on the test set

In [17]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class predictions for the test set
y_pred_rnn = (best_rnn_model.predict(X_test_pad) > 0.5).astype("int32")

# Calculate evaluation metrics on the test set
accuracy_rnn = accuracy_score(y_test, y_pred_rnn)
precision_rnn = precision_score(y_test, y_pred_rnn)
recall_rnn = recall_score(y_test, y_pred_rnn)
f1_rnn = f1_score(y_test, y_pred_rnn)
kappa_rnn = cohen_kappa_score(y_test, y_pred_rnn)

# Map each display name to its score
metrics_rnn_test = {
    'Accuracy': accuracy_rnn,
    'Precision': precision_rnn,
    'Recall': recall_rnn,
    'F1 Score': f1_rnn,
    'Kappa': kappa_rnn
}

# Two-column table: one row per metric
metrics_rnn = pd.DataFrame(list(metrics_rnn_test.items()), columns=['Metric', 'Value'])

# Show DataFrame. These scores are computed against y_test, so the heading
# names the test set (it previously read "train set").
print('\n' + 'Metrics for RNN model on test set' + '\n')
display(metrics_rnn)



Metrics for train set



Unnamed: 0,Metric,Value
0,Accuracy,0.76
1,Precision,0.800766
2,Recall,0.694352
3,F1 Score,0.743772
4,Kappa,0.520208


##Implement an LSTM-based RNN model

###Define the architecture of the LSTM model

In [18]:
def create_lstm_model(units=50):
    """Build and compile the LSTM sentiment classifier.

    The network is an Embedding layer (vocabulary size ``max_words``,
    50-dimensional vectors, input length ``maxlen``), an LSTM layer with ReLU
    activation, and a single sigmoid output unit. It is compiled with the Adam
    optimizer, binary cross-entropy loss and accuracy as metric.

    Args:
        units: number of units in the LSTM layer.

    Returns:
        The compiled Sequential model.
    """
    layers = [
        Embedding(input_dim=max_words, output_dim=50, input_length=maxlen),
        LSTM(units, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


###Configuring GridSearchCV to find the best hyperparameters

In [21]:
# Expose the Keras LSTM builder through the scikit-learn estimator interface
lstm_model = KerasClassifier(
    build_fn=create_lstm_model,
    epochs=5,
    batch_size=32,
    verbose=1,
)

# Candidate values explored by the search: LSTM units and training batch size
param_grid = dict(units=[50, 100, 150], batch_size=[32, 64, 128])

# 3-fold cross-validated search, candidates ranked by accuracy
grid_search = GridSearchCV(
    estimator=lstm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)

# Run the search on the padded training sequences
grid_result = grid_search.fit(X_train_pad, y_train)


  lstm_model = KerasClassifier(build_fn=create_lstm_model, epochs=5, batch_size=32, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


###Training the model with the best hyperparameters

In [22]:
# Pull the winning hyperparameters out of the grid search result
best_units, best_batch_size = (
    grid_result.best_params_[key] for key in ('units', 'batch_size')
)

# Build a fresh LSTM with the best number of units and train it, keeping 20%
# of the training data aside for validation
best_lstm_model = create_lstm_model(units=best_units)
best_lstm_model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=best_batch_size,
    validation_split=0.2,
)




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7d94c8337910>

###Evaluate the performance of the model on the test set.

In [23]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class predictions for the test set
y_pred_lstm = (best_lstm_model.predict(X_test_pad) > 0.5).astype("int32")

# Score the predictions with every metric, in display order
accuracy_lstm, precision_lstm, recall_lstm, f1_lstm, kappa_lstm = (
    score(y_test, y_pred_lstm)
    for score in (accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score)
)

# Map each display name to its score
metrics_lstm_test = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'Kappa'),
        (accuracy_lstm, precision_lstm, recall_lstm, f1_lstm, kappa_lstm),
    )
)

# Two-column table: one row per metric
metrics_lstm = pd.DataFrame(list(metrics_lstm_test.items()), columns=['Metric', 'Value'])

# Heading followed by the table
print('\n'+'Metrics for LSTM model on test set'+'\n')
display(metrics_lstm)



Metrics for LSTM model on test set



Unnamed: 0,Metric,Value
0,Accuracy,0.791667
1,Precision,0.80137
2,Recall,0.777409
3,F1 Score,0.789207
4,Kappa,0.58337
