In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Attention, Concatenate
from keras.models import Model
from keras.utils import to_categorical


In [4]:
data_dir = '/content/drive/My Drive/ML Assinment 2/assinment 2 part 2/Project/aclImdb/'

# Function to load dataset from a directory
def load_data_from_dir(data_dir, sentiment):
    sentences = []
    sentiments = []
    for file_name in os.listdir(data_dir):
        if file_name.endswith(".txt"):
            with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as file:
                sentences.append(file.read())
                sentiments.append(sentiment)
    return sentences, sentiments

# Load train and test data
train_pos_dir = os.path.join(data_dir, 'train/pos')
train_neg_dir = os.path.join(data_dir, 'train/neg')
test_pos_dir = os.path.join(data_dir, 'test/pos')
test_neg_dir = os.path.join(data_dir, 'test/neg')

train_pos_sentences, train_pos_sentiments = load_data_from_dir(train_pos_dir, 1)
train_neg_sentences, train_neg_sentiments = load_data_from_dir(train_neg_dir, 0)
test_pos_sentences, test_pos_sentiments = load_data_from_dir(test_pos_dir, 1)
test_neg_sentences, test_neg_sentiments = load_data_from_dir(test_neg_dir, 0)

# Combine the data
train_sentences = train_pos_sentences + train_neg_sentences
train_sentiments = train_pos_sentiments + train_neg_sentiments
test_sentences = test_pos_sentences + test_neg_sentences
test_sentiments = test_pos_sentiments + test_neg_sentiments

# Tokenize sentences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_sentences + test_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
word_index = tokenizer.word_index

MAX_SEQUENCE_LENGTH = 100
x_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Convert sentiments to numpy arrays
y_train = np.array(train_sentiments)
y_test = np.array(test_sentiments)

# Convert to categorical
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)


In [14]:
lab_dir = os.path.join(data_dir, 'lab')
sentences = []
aspects = []
sentiments = []

# Read labeled data
for file_name in os.listdir(lab_dir):
    if file_name.endswith(".txt"):
        with open(os.path.join(lab_dir, file_name), 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split('",')
                if len(parts) < 2:
                    continue  # Skip lines that do not have the expected format
                sentence = parts[0][1:]
                aspect_sentiment = parts[1].split(',')
                if len(aspect_sentiment) < 2:
                    continue  # Skip parts that do not have the expected format
                aspect = aspect_sentiment[0]
                sentiment = aspect_sentiment[1]

                sentences.append(sentence)
                aspects.append(aspect)
                sentiments.append(sentiment)

# Convert to DataFrame for easier manipulation
df = pd.DataFrame({
    'Sentence': sentences,
    'Aspect': aspects,
    'Sentiment': sentiments
})

# Preprocess the labels
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

# Tokenize sentences using Keras Tokenizer
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df['Sentence'])
labeled_sequences = tokenizer.texts_to_sequences(df['Sentence'])
MAX_SEQUENCE_LENGTH = 128
labeled_data = pad_sequences(labeled_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Aspect embedding
aspect_dict = {
    'Writing': [1, 0, 0, 0, 0, 0, 0, 0],
    'Acting': [0, 1, 0, 0, 0, 0, 0, 0],
    'Directing': [0, 0, 1, 0, 0, 0, 0, 0],
    'Cinematography': [0, 0, 0, 1, 0, 0, 0, 0],
    'Production': [0, 0, 0, 0, 1, 0, 0, 0],
    'Overall': [0, 0, 0, 0, 0, 1, 0, 0],
    'Music': [0, 0, 0, 0, 0, 0, 1, 0],
    'Actors': [0, 0, 0, 0, 0, 0, 0, 1],
    'Direction': [0, 0, 0, 0, 0, 0, 0, 0]
}

# Function to handle unknown aspects
def get_aspect_embedding(aspect):
    if aspect in aspect_dict:
        return aspect_dict[aspect]
    else:
        print(f"Unknown aspect: {aspect}")
        return [0] * len(aspect_dict['Writing'])  # Default embedding for unknown aspects

# Apply the function to get aspect embeddings
df['Aspect_Embedding'] = df['Aspect'].apply(lambda x: get_aspect_embedding(x))

# Prepare aspect embeddings
aspect_data = np.array(df['Aspect_Embedding'].tolist())

# Prepare labels
aspect_labels = to_categorical(df['Sentiment'], num_classes=3)

# Split the data
labeled_x_train, labeled_x_test, labeled_y_train, labeled_y_test, labeled_aspect_train, labeled_aspect_test = train_test_split(
    labeled_data, aspect_labels, aspect_data, test_size=0.2, random_state=42)

In [15]:
# Assuming 'word_index' is defined from the tokenizer and 'MAX_SEQUENCE_LENGTH' is already set

VOCAB_SIZE = len(tokenizer.word_index) + 1  # Ensure word_index is from the tokenizer used previously
EMBEDDING_DIM = 100
ASPECT_EMBEDDING_DIM = 8  # Updated to match the length of aspect embeddings defined previously
NUM_CLASSES = 3

# Input layers
review_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
aspect_input = Input(shape=(ASPECT_EMBEDDING_DIM,), dtype='float32')

# Embedding layer
embedded_sequences = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(review_input)

# Convolutional layer
conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedded_sequences)
pooled_output = GlobalMaxPooling1D()(conv_layer)

# Aspect attention layer
attention = Dense(EMBEDDING_DIM, activation='tanh')(aspect_input)
attention = Dense(1, activation='softmax')(attention)
attention_output = attention * pooled_output

# Concatenate the outputs
merged_output = Concatenate()([attention_output, aspect_input])

# Fully connected layers
dense_output = Dense(64, activation='relu')(merged_output)
predictions = Dense(NUM_CLASSES, activation='softmax')(dense_output)

# Build the model
model = Model(inputs=[review_input, aspect_input], outputs=predictions)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 8)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 128, 100)             33200     ['input_1[0][0]']             
                                                                                                  
 dense (Dense)               (None, 100)                  900       ['input_2[0][0]']             
                                                                                              

In [16]:
# Train the model
history = model.fit([labeled_x_train, labeled_aspect_train], labeled_y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
# Evaluate the model
loss, accuracy = model.evaluate([labeled_x_test, labeled_aspect_test], labeled_y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Test Accuracy: 42.86%
