In [None]:
import os

import numpy as np
from keras.layers import Concatenate, Conv1D, Dense, Dropout, Embedding, Flatten, GlobalMaxPooling1D, Input
from keras.models import Model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive
# Requires the Colab runtime (google.colab). The dataset paths used later in
# this notebook live under the '/content/drive' mount point given here.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Function to load dataset from a directory
def load_data_from_dir(data_dir, sentiment):
    """Read every ``.txt`` file in ``data_dir`` and label it with ``sentiment``.

    Files are visited in sorted file-name order. ``os.listdir`` returns
    entries in arbitrary, filesystem-dependent order, so without sorting the
    order of the returned samples could differ between runs and machines.

    Args:
        data_dir: Directory whose ``.txt`` files are read (other entries are
            ignored).
        sentiment: Label attached to every file read from ``data_dir``.

    Returns:
        A ``(sentences, sentiments)`` pair of equal-length lists: the full
        text of each file (decoded as UTF-8) and one copy of ``sentiment``
        per file.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    sentences = []
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".txt"):
            continue
        with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as file:
            sentences.append(file.read())
    # Every file in the directory shares the same label.
    sentiments = [sentiment] * len(sentences)
    return sentences, sentiments

# Data directories
data_dir = '/content/drive/My Drive/ML Assinment 2/assinment 2 part 2/Project/aclImdb/'
# One sub-folder per (split, label) pair, all resolved against the dataset root.
train_pos_dir, train_neg_dir, test_pos_dir, test_neg_dir = [
    os.path.join(data_dir, subdir)
    for subdir in ('train/pos', 'train/neg', 'test/pos', 'test/neg')
]

# Load data (files from a 'pos' folder get label 1, files from a 'neg' folder get label 0)
train_pos_sentences, train_pos_sentiments = load_data_from_dir(train_pos_dir, 1)
train_neg_sentences, train_neg_sentiments = load_data_from_dir(train_neg_dir, 0)
test_pos_sentences, test_pos_sentiments = load_data_from_dir(test_pos_dir, 1)
test_neg_sentences, test_neg_sentiments = load_data_from_dir(test_neg_dir, 0)

# Combine data: positives first, then negatives, in the same order for texts
# and labels so that position i in *_sentences matches position i in *_sentiments.
train_sentences = train_pos_sentences + train_neg_sentences
train_sentiments = train_pos_sentiments + train_neg_sentiments
# The negative test *texts* go here. Appending test_neg_sentiments (the
# integer labels) instead would put ints into the text list, which the
# tokenizer cannot process and which would drop every negative test review.
test_sentences = test_pos_sentences + test_neg_sentences
test_sentiments = test_pos_sentiments + test_neg_sentiments

# Turn the 0/1 label lists into arrays and one-hot encode them over 2 classes
y_train = to_categorical(np.array(train_sentiments), num_classes=2)
y_test = to_categorical(np.array(test_sentiments), num_classes=2)

# Tokenization and padding
# The vocabulary is fitted on the training texts only. Fitting on the test
# texts as well would let test-set word statistics decide which words make
# the num_words cut-off, i.e. leak evaluation data into the features.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
word_index = tokenizer.word_index

# Every sequence is padded/truncated to the same fixed length so the reviews
# can be stacked into one array per split.
MAX_SEQUENCE_LENGTH = 100
x_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [None]:
# Sanity-check the prepared data.
# This cell used to be a verbatim copy of the data-preparation cell above
# (same function definition, paths, loading, label encoding and tokenization).
# Running it only re-read every file from Drive and rebuilt variables that
# already existed, so the copy is replaced by cheap invariants on the values
# that cell produced.
assert all(isinstance(text, str) for text in train_sentences + test_sentences), \
    "every entry of train_sentences/test_sentences must be a text string"
assert len(train_sentences) == len(train_sentiments), "train texts/labels are misaligned"
assert len(test_sentences) == len(test_sentiments), "test texts/labels are misaligned"
assert x_train.shape == (len(train_sentences), MAX_SEQUENCE_LENGTH)
assert x_test.shape == (len(test_sentences), MAX_SEQUENCE_LENGTH)
assert y_train.shape == (len(train_sentiments), 2)
assert y_test.shape == (len(test_sentiments), 2)


In [None]:
# Download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# Load GloVe embeddings
# Each line is split on whitespace: the first field is the word, the
# remaining fields are parsed as its float32 vector.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Create embedding matrix
# texts_to_sequences only emits indices below the tokenizer's num_words
# limit, so the matrix (and the Embedding layer) needs rows for those indices
# only. Sizing it by the full word_index allocates rows that are never looked up.
full_vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1
VOCAB_SIZE = min(tokenizer.num_words or full_vocab_size, full_vocab_size)
EMBEDDING_DIM = 100  # must match the dimensionality of the GloVe file loaded above
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= VOCAB_SIZE:
        # Beyond the num_words cut-off: this index never occurs in the sequences.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words without a GloVe vector keep their all-zero row.
        embedding_matrix[i] = embedding_vector

# Embedding layer with pre-trained GloVe embeddings
# trainable=False keeps the pre-trained vectors fixed during training.
embedding_layer = Embedding(input_dim=VOCAB_SIZE,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)



In [None]:
# Aspect embedding dictionary: every aspect name maps to a one-hot list whose
# single 1 sits at the aspect's position in the name list below.
_aspect_names = ['Writing', 'Acting', 'Directing', 'Cinematography',
                 'Production', 'Overall', 'Music', 'Actors']
aspect_dict = {
    name: [1 if col == row else 0 for col in range(len(_aspect_names))]
    for row, name in enumerate(_aspect_names)
}

def get_aspect_embedding(aspect):
    """Look up the vector stored for ``aspect`` in ``aspect_dict``.

    Unknown aspects yield an all-zero list with the same length as the
    'Writing' entry.
    """
    try:
        return aspect_dict[aspect]
    except KeyError:
        return [0] * len(aspect_dict['Writing'])

# Placeholder aspects for illustration: every training sentence is tagged
# 'Writing' (a real dataset would supply one aspect per sample)
aspects = ['Writing' for _ in train_sentences]

# Look up each aspect's vector and stack them into one array, one entry per sample
aspect_embeddings = np.array(list(map(get_aspect_embedding, aspects)))

# Define the model graph with the functional API. The network takes two
# inputs (token ids and an aspect vector), so it is not a Sequential model.
# Requires Input and Concatenate from keras.layers.
ASPECT_DIM = len(aspect_dict['Writing'])  # width of one aspect vector

input_review = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='input_review')
input_aspect = Input(shape=(ASPECT_DIM,), dtype='float32', name='input_aspect')

# Text branch: embedding lookup -> 1-D convolution -> max over the sequence axis
embedded_sequences = embedding_layer(input_review)

conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(embedded_sequences)
pooled_output = GlobalMaxPooling1D()(conv_layer)

# Join the pooled text features with the aspect vector before classification
merged_output = Concatenate()([pooled_output, input_aspect])

dense = Dense(64, activation='relu')(merged_output)
dropout = Dropout(0.5)(dense)
# Two softmax units, matching the 2-class one-hot labels
output = Dense(2, activation='softmax')(dropout)


In [None]:
# Wrap the two inputs and the softmax output into a trainable model
model = Model(inputs=[input_review, input_aspect], outputs=output)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Model summary
model.summary()



In [None]:
# Train the model
# The validation input needs exactly one aspect vector per test sample.
# Slicing the training aspect array only works while the test set is no
# larger than the training set, so a test-sized array is built here using the
# same placeholder aspect ('Writing') that the training samples use.
test_aspect_embeddings = np.array([get_aspect_embedding('Writing')] * len(x_test))

history = model.fit(
    [x_train, aspect_embeddings],
    y_train,
    validation_data=([x_test, test_aspect_embeddings], y_test),
    epochs=10,
    batch_size=32,
)