In [None]:
from google.colab import drive
import random
import tarfile
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense

# Mount Google Drive so the dataset archive stored there is reachable.
drive.mount('/content/drive')

# Unpack the IMDB archive into the folder that contains it. The destination
# is derived from the archive path so it uses the same "MyDrive" spelling as
# the aclImdb directories read later in the notebook, rather than mixing in
# the "My Drive" alias.
file_path = "/content/drive/MyDrive/aclImdb_v1.tar.gz"
extract_dir = os.path.dirname(file_path)
with tarfile.open(file_path, 'r:gz') as tar:
    tar.extractall(path=extract_dir)

# Fetch the NLTK resources used for tokenising and stop-word removal.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stop words; filled in on first use and reused for every document.
_STOP_WORDS = None


def _get_stop_words():
    """Return the cached English stop-word set, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # The text has its punctuation removed before it is compared with the
        # stop list, so the stop words are normalised the same way
        # ("don't" -> "dont"); otherwise those entries could never match.
        _STOP_WORDS = frozenset(
            word.translate(_PUNCTUATION_TABLE)
            for word in stopwords.words('english')
        )
    return _STOP_WORDS


def preprocess(input_text):
    """Normalise a raw review into a space-separated string of tokens.

    The text is lower-cased, stripped of punctuation, tokenised with
    ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        input_text: Raw text of one document.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Convert text to lowercase and remove punctuation
    processed_text = input_text.lower().translate(_PUNCTUATION_TABLE)
    # Tokenize the text
    tokens = word_tokenize(processed_text)
    # Remove stop words (the set is built once, not once per document)
    stop_words = _get_stop_words()
    tokens = [word for word in tokens if word not in stop_words]
    # Reconstruct the preprocessed text
    return ' '.join(tokens)

def load_and_preprocess(directory_path, max_files=None, preprocess_fn=None):
    """Read the text files of a directory and return their cleaned contents.

    Files are visited in sorted file-name order so that the subset selected
    by ``max_files`` is the same on every run; entries that are not regular
    files (for example sub-directories) are skipped.

    Args:
        directory_path: Directory whose files are read as UTF-8 text.
        max_files: Maximum number of files to read; ``None`` reads them all.
            Values below zero are treated as zero.
        preprocess_fn: Callable applied to each file's text. When ``None``
            the notebook's ``preprocess`` function is used.

    Returns:
        A list with one cleaned string per file that was read.
    """
    if preprocess_fn is None:
        # Looked up at call time so the default always follows the current
        # definition of preprocess in the notebook.
        preprocess_fn = preprocess

    with os.scandir(directory_path) as entries:
        file_names = sorted(entry.name for entry in entries if entry.is_file())
    if max_files is not None:
        file_names = file_names[:max(max_files, 0)]

    documents = []
    for file_name in file_names:
        with open(os.path.join(directory_path, file_name), 'r', encoding='utf-8') as file:
            documents.append(preprocess_fn(file.read()))
    return documents

# Folders holding the labelled IMDB training reviews.
positive_directory = "/content/drive/MyDrive/aclImdb/train/pos/"
negative_directory = "/content/drive/MyDrive/aclImdb/train/neg/"

# Upper bound on the number of reviews read per class.
max_files_to_load = 2000
positive_documents, negative_documents = [
    load_and_preprocess(review_directory, max_files_to_load)
    for review_directory in (positive_directory, negative_directory)
]

# Report how many reviews were actually loaded for each class.
for folder_label, folder_documents in (("pos", positive_documents), ("neg", negative_documents)):
    print(f"Number of files in {folder_label} folder:", len(folder_documents))


Number of files in pos folder: 1984
Number of files in neg folder: 2000


In [None]:
# Combine both classes; label 1 marks a positive review and 0 a negative one.
all_documents = positive_documents + negative_documents
all_labels = [1] * len(positive_documents) + [0] * len(negative_documents)

# Shuffle documents and labels together so every pair stays aligned. A
# dedicated, seeded generator makes the resulting order repeatable for the
# same input lists, instead of changing on every run.
SHUFFLE_SEED = 42
combined_data = list(zip(all_documents, all_labels))
random.Random(SHUFFLE_SEED).shuffle(combined_data)
all_documents, all_labels = zip(*combined_data)

In [None]:
# Peek at the first ten shuffled labels.
all_labels[:10]

(1, 0, 0, 1, 0, 0, 1, 1, 1, 0)

In [None]:
# Convert the shuffled labels to a NumPy array and display the result.
all_labels = np.array(all_labels)
all_labels

array([1, 0, 0, ..., 1, 0, 0])

## Sentiment analysis using deep learning (bag-of-words features)

In [None]:
# Splitting data: hold out 25% of the shuffled reviews for testing; the fixed
# random_state keeps the split repeatable for the same inputs.
train_data, test_data, train_labels, test_labels = train_test_split(
    all_documents,
    all_labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Bag of words using CountVectorizer: learn a vocabulary capped at 10000
# features from the training reviews only, then encode both splits with it.
word_vectorizer = CountVectorizer(max_features=10000)
word_vectorizer.fit(train_data)
train_bow = word_vectorizer.transform(train_data)
test_bow = word_vectorizer.transform(test_data)


In [None]:
max_sequence_length = 100  # Maximum sequence length

# NOTE(review): every row of the bag-of-words matrix is a vector of per-term
# counts, not an ordered sequence of token ids. With pad_sequences' default
# 'pre' truncation only the last `max_sequence_length` columns of each row
# are kept, so most of the vocabulary is dropped here -- confirm this is the
# intended input for the recurrent models.
train_sequences = pad_sequences(train_bow.toarray(), maxlen=max_sequence_length)
test_sequences = pad_sequences(test_bow.toarray(), maxlen=max_sequence_length)

# Display the first encoded training row as a sanity check.
train_sequences[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [None]:
# MODELS
def _build_recurrent_classifier(recurrent_layer):
    """Stack Embedding -> ``recurrent_layer`` -> single sigmoid unit.

    Shared by the four public builders below, which differ only in the
    recurrent layer they pass in.

    Args:
        recurrent_layer: A freshly constructed Keras recurrent layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()
    # NOTE(review): input_dim is taken from the padded row width
    # (train_sequences.shape[1]) rather than from the range of values stored
    # in the rows -- confirm this is the intended embedding table size.
    model.add(Embedding(input_dim=train_sequences.shape[1], output_dim=64, input_length=max_sequence_length))
    model.add(recurrent_layer)
    model.add(Dense(1, activation='sigmoid'))
    return model


# RNN:
def build_rnn_model():
    """Return an uncompiled classifier built around ``SimpleRNN(64)``."""
    return _build_recurrent_classifier(SimpleRNN(64))


# GRU:
def build_gru_model():
    """Return an uncompiled classifier built around ``GRU(64)``."""
    return _build_recurrent_classifier(GRU(64))


# LSTM:
def build_lstm_model():
    """Return an uncompiled classifier built around ``LSTM(64)``."""
    return _build_recurrent_classifier(LSTM(64))


# BILSTM:
def build_bilstm_model():
    """Return an uncompiled classifier built around ``Bidirectional(LSTM(64))``."""
    return _build_recurrent_classifier(Bidirectional(LSTM(64)))
# Splitting data: carve a 10% validation set out of the training sequences,
# again with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    train_sequences,
    train_labels,
    test_size=0.1,
    random_state=42,
)


In [None]:
#TRAINING AND EVALUATION
def train_and_evaluate(model_builder):
    """Build, train and score one model on the bag-of-words sequences.

    Args:
        model_builder: Zero-argument callable returning an uncompiled model.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed from the
        predictions for ``test_sequences`` against ``test_labels``.
    """
    network = model_builder()
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Stop when val_loss has not improved for 3 epochs and restore the best weights.
    stop_on_plateau = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    network.fit(
        X_train,
        y_train,
        epochs=100,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[stop_on_plateau],
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    predicted = np.round(network.predict(test_sequences))
    return (
        accuracy_score(test_labels, predicted),
        precision_score(test_labels, predicted, zero_division="warn"),
        recall_score(test_labels, predicted),
        f1_score(test_labels, predicted),
    )

# Builder for every architecture in the comparison, keyed by display name.
models = {
    'RNN': build_rnn_model,
    'GRU': build_gru_model,
    'LSTM': build_lstm_model,
    'BiLSTM': build_bilstm_model,
}

# Train each model in turn and store its four test-set metrics by name.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-score')
results = {}
for model_name, model_builder in models.items():
    results[model_name] = dict(zip(metric_labels, train_and_evaluate(model_builder)))


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [None]:
#printing results
# Collect the report line by line, then emit it in one go: a header, then for
# each model its name followed by one tab-indented line per metric.
report_lines = ["Results:"]
for architecture, scores in results.items():
    report_lines.append(f"{architecture}:")
    report_lines.extend(f"\t{label}: {score}" for label, score in scores.items())
print("\n".join(report_lines))

Results:
RNN: {'Accuracy': 0.5622489959839357, 'Precision': 0.5587761674718197, 'Recall': 0.6817288801571709, 'F1-score': 0.6141592920353982}
GRU: {'Accuracy': 0.5080321285140562, 'Precision': 0.5102040816326531, 'Recall': 0.9332023575638507, 'F1-score': 0.6597222222222222}
LSTM: {'Accuracy': 0.5190763052208835, 'Precision': 0.5231481481481481, 'Recall': 0.6660117878192534, 'F1-score': 0.5859982713915298}
BiLSTM: {'Accuracy': 0.6164658634538153, 'Precision': 0.6135957066189625, 'Recall': 0.6738703339882122, 'F1-score': 0.6423220973782771}


## Sentiment analysis using word embeddings

In [None]:
import numpy as np
import random
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense
#splitting data
# Note: this rebinds X_train / y_train, which the bag-of-words section used
# for its padded training arrays.
X_train, X_test, y_train, y_test = train_test_split(all_documents, all_labels, test_size=0.25, random_state=42)

#tokenizing
# The tokenizer is fitted on the training texts only and then used to encode
# both splits as integer sequences.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_sequence = tokenizer.texts_to_sequences(X_train)
X_test_sequence = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every review to max_sequence_length, the same length the
# embedding model builders pass as input_length.
X_train_pad = pad_sequences(X_train_sequence, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_sequence, maxlen=max_sequence_length)
X_train_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,  599, 1914, 1066,    1,   52,  504,
        463, 1565,  981,    9,    2,  440,   16,  402, 1565,  318,   70,
        233,    2,  169, 5647,  881,   33,    8, 3517, 1364,   16,    4,
          1,    1,   87, 2234,   11,  285,  529,    5,  422,   49, 1336,
        111], dtype=int32)

In [None]:
# Make sure the training inputs and labels are NumPy arrays before fitting.
X_train_pad, y_train = np.array(X_train_pad), np.array(y_train)

#MODELS WITH EMBEDDING
# Size of the embedding table; kept equal to the num_words=10000 limit given
# to the Tokenizer that produces the input sequences.
EMBEDDING_VOCAB_SIZE = 10000


def _build_embedding_classifier(recurrent_layer):
    """Stack Embedding -> ``recurrent_layer`` -> single sigmoid unit.

    Shared by the four public builders below, which differ only in the
    recurrent layer they pass in.

    Args:
        recurrent_layer: A freshly constructed Keras recurrent layer.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=EMBEDDING_VOCAB_SIZE, output_dim=64, input_length=max_sequence_length))
    model.add(recurrent_layer)
    model.add(Dense(1, activation='sigmoid'))
    return model


#RNN
def build_rnn_with_embedding():
    """Return an uncompiled classifier built around ``SimpleRNN(64)``."""
    return _build_embedding_classifier(SimpleRNN(64))


#GRU
def build_gru_with_embedding():
    """Return an uncompiled classifier built around ``GRU(64)``."""
    return _build_embedding_classifier(GRU(64))


#LSTM
def build_lstm_with_embedding():
    """Return an uncompiled classifier built around ``LSTM(64)``."""
    return _build_embedding_classifier(LSTM(64))


#BILSTM
def build_bilstm_with_embedding():
    """Return an uncompiled classifier built around ``Bidirectional(LSTM(64))``."""
    return _build_embedding_classifier(Bidirectional(LSTM(64)))

In [None]:
#TRAINING AND EVALUATION
def train_and_evaluate(model_builder):
    """Build, train and score one model on the tokenizer-encoded reviews.

    Defining this function rebinds the name ``train_and_evaluate`` that the
    bag-of-words section defined earlier in the notebook.

    Args:
        model_builder: Zero-argument callable returning an uncompiled model.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` computed from the
        predictions for ``X_test_pad`` against ``y_test``.
    """
    network = model_builder()
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Fixed 8-epoch run with validation_split=0.1 for monitoring.
    network.fit(
        X_train_pad,
        y_train,
        epochs=8,
        batch_size=32,
        validation_split=0.1,
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    predicted = np.round(network.predict(X_test_pad))
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
        recall_score(y_test, predicted),
        f1_score(y_test, predicted),
    )


# Builder for every embedding-based architecture, keyed by display name.
models = {
    'RNN with Embedding': build_rnn_with_embedding,
    'GRU with Embedding': build_gru_with_embedding,
    'LSTM with Embedding': build_lstm_with_embedding,
    'BiLSTM with Embedding': build_bilstm_with_embedding,
}

# Train each model in turn and store its four test-set metrics by name.
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-score')
results = {}
for model_name, model_builder in models.items():
    results[model_name] = dict(zip(metric_labels, train_and_evaluate(model_builder)))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Collect the report line by line, then emit it in one go: a header, then for
# each model its name followed by one tab-indented line per metric.
report_lines = ["Results:"]
for architecture, scores in results.items():
    report_lines.append(f"{architecture}:")
    report_lines.extend(f"\t{label}: {score}" for label, score in scores.items())
print("\n".join(report_lines))

Results:
RNN with Embedding: {'Accuracy': 0.7921686746987951, 'Precision': 0.7859848484848485, 'Recall': 0.8153241650294696, 'F1-score': 0.8003857280617165}
GRU with Embedding: {'Accuracy': 0.8544176706827309, 'Precision': 0.8775933609958506, 'Recall': 0.831041257367387, 'F1-score': 0.8536831483350151}
LSTM with Embedding: {'Accuracy': 0.8704819277108434, 'Precision': 0.8861788617886179, 'Recall': 0.8565815324165029, 'F1-score': 0.8711288711288712}
BiLSTM with Embedding: {'Accuracy': 0.8604417670682731, 'Precision': 0.8530534351145038, 'Recall': 0.8781925343811395, 'F1-score': 0.8654404646660214}
