In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Attention, Concatenate, Dropout
from keras.models import Model
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence



In [None]:
# Root folder of the aclImdb dataset on the mounted Google Drive.
data_dir = '/content/drive/My Drive/ML Assinment 2/assinment 2 part 2/Project/aclImdb/'

# Load data function
def load_data_from_dir(data_dir, sentiment):
    """Read every ``.txt`` file in a directory and label each one with ``sentiment``.

    Files are visited in sorted filename order so the returned lists are the
    same on every run; ``os.listdir`` alone makes no ordering guarantee.

    Args:
        data_dir: Directory to scan (sub-directories are not searched).
        sentiment: Label attached to every file read from ``data_dir``.

    Returns:
        A ``(sentences, sentiments)`` tuple of equal-length lists: the text of
        each file, and ``sentiment`` repeated once per file.
    """
    sentences = []
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".txt"):
            continue
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as file:
            sentences.append(file.read())
    sentiments = [sentiment] * len(sentences)
    return sentences, sentiments

# Load train and test data
# Positive and negative reviews of each split live in separate sub-directories.
train_pos_dir = os.path.join(data_dir, 'train/pos')
train_neg_dir = os.path.join(data_dir, 'train/neg')
test_pos_dir = os.path.join(data_dir, 'test/pos')
test_neg_dir = os.path.join(data_dir, 'test/neg')

# Read each directory; positive reviews are labelled 1 and negative reviews 0
train_pos_sentences, train_pos_sentiments = load_data_from_dir(data_dir=train_pos_dir, sentiment=1)
train_neg_sentences, train_neg_sentiments = load_data_from_dir(data_dir=train_neg_dir, sentiment=0)
test_pos_sentences, test_pos_sentiments = load_data_from_dir(data_dir=test_pos_dir, sentiment=1)
test_neg_sentences, test_neg_sentiments = load_data_from_dir(data_dir=test_neg_dir, sentiment=0)

# Merge per split: all positive samples first, followed by all negative samples
train_sentences = [*train_pos_sentences, *train_neg_sentences]
train_sentiments = [*train_pos_sentiments, *train_neg_sentiments]
test_sentences = [*test_pos_sentences, *test_neg_sentences]
test_sentiments = [*test_pos_sentiments, *test_neg_sentiments]

# Tokenization and padding
# The vocabulary is learned from the training reviews only. Fitting on the test
# reviews as well would let test-set word statistics shape the model's inputs.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
word_index = tokenizer.word_index

# Every review is padded/truncated to the same fixed number of token ids
MAX_SEQUENCE_LENGTH = 100
x_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Turn the integer sentiment labels into arrays, then into two-column categorical rows
y_train = to_categorical(np.asarray(train_sentiments), num_classes=2)
y_test = to_categorical(np.asarray(test_sentiments), num_classes=2)


In [None]:
# Download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# Load GloVe embeddings
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the weight matrix: row i holds the GloVe vector of the word whose index is i
EMBEDDING_DIM = 100
VOCAB_SIZE = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words without a GloVe vector (or with an index outside the matrix) keep a zero row
    if embedding_vector is None or i >= VOCAB_SIZE:
        continue
    embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised from the GloVe weight matrix
embedding_layer = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Data augmentation function
def augment_text(text):
    """Return ``text`` rebuilt from its word sequence, joined by single spaces.

    Placeholder for a real augmentation step (e.g. synonym replacement): the
    words produced by ``text_to_word_sequence`` are currently left unchanged.
    """
    return ' '.join(text_to_word_sequence(text))

# Run every training review through augment_text, then tokenize and pad the results
augmented_train_sentences = list(map(augment_text, train_sentences))
augmented_train_sequences = tokenizer.texts_to_sequences(augmented_train_sentences)
x_train_augmented = pad_sequences(augmented_train_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Stack the augmented rows under the original rows; the labels are repeated to match
x_train_combined = np.concatenate([x_train, x_train_augmented], axis=0)
y_train_combined = np.concatenate([y_train, y_train], axis=0)

In [None]:
# Aspect data handling
# Lines are expected to look like:  "<sentence>",<aspect>,<sentiment>
# (format inferred from the parsing below; lines that do not fit are skipped).
lab_dir = os.path.join(data_dir, 'lab')
sentences = []
aspects = []
sentiments = []

# Sorted so the row order, and therefore the later seeded train/test split, is
# the same on every run; os.listdir makes no ordering guarantee.
for file_name in sorted(os.listdir(lab_dir)):
    if not file_name.endswith(".txt"):
        continue
    with open(os.path.join(lab_dir, file_name), 'r', encoding='utf-8') as file:
        for line in file:
            # Split on the LAST '",' so a quote followed by a comma inside the
            # sentence does not cut the sentence short and lose the row.
            parts = line.strip().rsplit('",', 1)
            if len(parts) < 2:
                continue
            sentence = parts[0][1:]  # drop the opening quote
            aspect_sentiment = parts[1].split(',')
            if len(aspect_sentiment) < 2:
                continue
            # Strip padding so e.g. ' Acting' still matches the aspect table
            # and ' positive' / 'positive' become one sentiment class.
            aspect = aspect_sentiment[0].strip()
            sentiment = aspect_sentiment[1].strip()
            sentences.append(sentence)
            aspects.append(aspect)
            sentiments.append(sentiment)

df = pd.DataFrame({
    'Sentence': sentences,
    'Aspect': aspects,
    'Sentiment': sentiments
})

# Label encoding
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

# Tokenize sentences
# Reuse the tokenizer and sequence length of the review pipeline instead of
# creating new ones: the rows of `embedding_matrix` are indexed by that
# tokenizer's word_index, so ids from a freshly fitted tokenizer would look up
# the vectors of unrelated words, and `embedding_layer` was declared with that
# input length. Words outside the review tokenizer's vocabulary are dropped.
labeled_sequences = tokenizer.texts_to_sequences(df['Sentence'])
labeled_data = pad_sequences(labeled_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Aspect embedding
# One-hot code per known aspect, generated from the name list so each aspect
# gets its own position. The all-zero vector is reserved for unknown aspects,
# so every known aspect must contain exactly one 1.
ASPECT_NAMES = [
    'Writing', 'Acting', 'Directing', 'Cinematography', 'Production',
    'Overall', 'Music', 'Actors', 'Direction',
]
aspect_dict = {
    name: [1 if position == index else 0 for position in range(len(ASPECT_NAMES))]
    for index, name in enumerate(ASPECT_NAMES)
}


def get_aspect_embedding(aspect):
    """Return the one-hot vector for ``aspect``.

    Args:
        aspect: Aspect name, matched exactly against the keys of ``aspect_dict``.

    Returns:
        A new list of 0/1 values, one position per known aspect. Names that are
        not in ``aspect_dict`` yield an all-zero list of the same length.
    """
    if aspect in aspect_dict:
        # Copy so callers cannot modify the shared table entry
        return list(aspect_dict[aspect])
    return [0] * len(ASPECT_NAMES)

# Attach the aspect vector to every row
df['Aspect_Embedding'] = df['Aspect'].apply(get_aspect_embedding)

# Arrays for the model: aspect vectors and three-column categorical sentiment labels
aspect_data = np.array(df['Aspect_Embedding'].tolist())
aspect_labels = to_categorical(df['Sentiment'], num_classes=3)

# Hold out 20% of the rows; the three arrays are split together in one call
(labeled_x_train, labeled_x_test,
 labeled_y_train, labeled_y_test,
 labeled_aspect_train, labeled_aspect_test) = train_test_split(
    labeled_data, aspect_labels, aspect_data, test_size=0.2, random_state=42)

In [None]:
# Model architecture
# Width of one aspect vector, read from the prepared data so the input layer
# always matches what is fed to the model.
ASPECT_EMBEDDING_DIM = aspect_data.shape[1]
# Number of convolution filters; the aspect gate below must have the same width
NUM_FILTERS = 128

review_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
aspect_input = Input(shape=(ASPECT_EMBEDDING_DIM,), dtype='float32')

# Embedding layer
embedded_sequences = embedding_layer(review_input)

# Convolutional layer
conv_layer = Conv1D(filters=NUM_FILTERS, kernel_size=5, activation='relu')(embedded_sequences)
conv_layer = Dropout(0.5)(conv_layer)
pooled_output = GlobalMaxPooling1D()(conv_layer)

# Aspect attention layer
# The aspect vector produces one weight in (0, 1) per pooled feature, so the
# aspect decides which features are passed on. A softmax over a single unit
# would always be exactly 1.0 and leave the pooled features unchanged.
attention = Dense(EMBEDDING_DIM, activation='tanh')(aspect_input)
attention = Dense(NUM_FILTERS, activation='sigmoid')(attention)
attention_output = attention * pooled_output

# Concatenate the outputs
merged_output = Concatenate()([attention_output, aspect_input])

# Fully connected layers
dense_output = Dense(64, activation='relu')(merged_output)
dense_output = Dropout(0.5)(dense_output)
# Three output units, matching the three-column sentiment labels
predictions = Dense(3, activation='softmax')(dense_output)



In [None]:
# Assemble the two-input network and configure it for training
model = Model(inputs=[review_input, aspect_input], outputs=predictions)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview
model.summary()

In [None]:
# Train the model
# Both inputs and the targets come from the aspect-labelled split, so all three
# have the same number of rows and the 3-column targets match the 3-unit
# softmax output. The IMDB arrays cannot be paired with the aspect vectors:
# they have a different row count and 2-column targets.
history = model.fit([labeled_x_train, labeled_aspect_train], labeled_y_train,
                    validation_data=([labeled_x_test, labeled_aspect_test], labeled_y_test),
                    epochs=10, batch_size=32)

In [None]:
# Score the model on the held-out labelled rows and report accuracy as a percentage
test_metrics = model.evaluate([labeled_x_test, labeled_aspect_test], labeled_y_test)
loss, accuracy = test_metrics
print(f'Test Accuracy: {accuracy * 100:.2f}%')
