## Importing Essential Libraries


In [5]:
import numpy as np
import os 
import shutil
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix

## Function to Download and Read Labeled Sentences


In [53]:
def download_and_read(url):
    local_file = url.split('/')[-1]
    local_file = local_file.replace('%20', ' ')
    p = tf.keras.utils.get_file(local_file, url, extract=True, cache_dir = '.')
    local_folder = os.path.join('datasets', local_file.split('.')[0])
    labeled_sentences = []
    for labeled_filename in os.listdir(local_folder):
        if labeled_filename.endswith('labelled.txt'):
            with open(os.path.join(
                local_folder, labeled_filename), 'r') as f:
                for line in f:
                    sentence, label = line.strip().split('\t')
                    labeled_sentences.append((sentence, label))
    return labeled_sentences
labeled_sentences = download_and_read(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/" + 
    "00331/sentiment%20labelled%20sentences.zip") 
sentence = [s for (s, l) in labeled_sentences]
labels = [int(l) for (s,l) in labeled_sentences]

## Text Tokenization and Vocabulary Building

In [26]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentence)
vocab_size = len(tokenizer.word_counts)
print('vacab size: {:d}'.format(vocab_size))
word2inx = tokenizer.word_index
ind2word = {v:k for (k,v) in word2inx.items()}

vacab size: 5271


## Analyzing Sentence Length Distribution


In [29]:
seq_lengths = np.array([len(s.split()) for s in sentence])
print([(p, np.percentile(seq_lengths, p)) for p in [75,80, 90,95,99,100]])

[(75, 16.0), (80, 18.0), (90, 22.0), (95, 26.0), (99, 36.0), (100, 71.0)]


## Preparing Dataset with Padded Sequences


In [30]:
max_seqlen = 64
#create dataset
sentences_as_int = tokenizer.texts_to_sequences(sentence)
sentences_as_int = tf.keras.preprocessing.sequence.pad_sequences(
    sentences_as_int, maxlen = max_seqlen)
labels_as_int = np.array(labels)
dataset = tf.data.Dataset.from_tensor_slices((sentences_as_int, labels_as_int))

## Splitting Data into Training, Validation, and Test Sets


In [32]:
dataset.shuffle(10000)
test_size = len(sentence) // 3
val_size = (len(sentence) - test_size) // 10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)
batch_size = 64
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

## Defining, Compiling, and Training the Sentiment Analysis Model


In [38]:
class SentimentalAnalysisModel(tf.keras.Model):
    def __init__(self, vocab_size, max_seqlen, **kwargs):
        super(SentimentalAnalysisModel, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, max_seqlen)
        self.bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(max_seqlen))
        self.dense = tf.keras.layers.Dense(64, activation='relu')
        self.out = tf.keras.layers.Dense(1, activation = 'sigmoid')
    def call(self, x):
        x = self.embedding(x)
        x = self.bilstm(x)
        x = self.dense(x)
        x = self.out(x)
        return x

model = SentimentalAnalysisModel(vocab_size+1, max_seqlen)
model.build(input_shape=(batch_size, max_seqlen))
model.summary()

#compile
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics = ['accuracy'])

#train
data_dir = './data'
logs_dir = os.path.join('./logs')
best_model_file = os.path.join(data_dir, 'best_model.h5')
checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model_file,
                                                save_weights_only=True,
                                                save_best_only=  True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir = logs_dir)
num_epochs = 10
history = model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset,
                    callbacks = [checkpoint, tensorboard])

Model: "sentimental_analysis_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     multiple                  337408    
                                                                 
 bidirectional_1 (Bidirectio  multiple                 66048     
 nal)                                                            
                                                                 
 dense_2 (Dense)             multiple                  8256      
                                                                 
 dense_3 (Dense)             multiple                  65        
                                                                 
Total params: 411,777
Trainable params: 411,777
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

## Loading and Evaluating the Trained Model


In [39]:
best_model = SentimentalAnalysisModel(vocab_size+1, max_seqlen)
best_model.build(input_shape=(batch_size, max_seqlen))
best_model.load_weights(best_model_file)
best_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
test_loss, test_acc = best_model.evaluate(test_dataset)
print('test loss: {:3f}, test accuracy: {:3f}'.format(test_loss, test_acc))

test loss: 0.528709, test accuracy: 0.745000


## Generating Predictions and Evaluating the Model on Test Data


In [51]:
labels, predictions = [], []
ind2word[0] = 'PAD'
is_first_batch = True
total_processed = 0  # Добавляем счетчик для общего количества обработанных примеров

for test_batch in test_dataset:
    input_b, labels_b = test_batch
    pred_batch = best_model.predict(input_b)
    predictions.extend([(1 if p > .5 else 0) for p in pred_batch])
    labels.extend([l for l in labels_b])
    if is_first_batch:
        for rid in range(input_b.shape[0]):
            words = [ind2word[idx] for idx in input_b[rid].numpy()]
            words = [w for w in words if w != 'PAD']
            sentence = ' '.join(words)
            # Используем total_processed + rid для доступа к правильному индексу
            print('{:d}\t{:d}\t{:s}'.format(labels[total_processed + rid], predictions[total_processed + rid], sentence))
        is_first_batch = False
    total_processed += input_b.shape[0]  # Обновляем счетчик после обработки пакета

print('accuracy score: {:3f}'.format(accuracy_score(labels, predictions)))
print('confusion matrix')
print(confusion_matrix(labels, predictions))


0	1	so there is no way for me to plug it in here in the us unless i go by a converter
1	1	good case excellent value
1	1	great for the jawbone
0	0	tied to charger for conversations lasting more than 45 minutes major problems
1	1	the mic is great
0	0	i have to jiggle the plug to get it to line up right to get decent volume
0	0	if you have several dozen or several hundred contacts then imagine the fun of sending each of them one by one
1	1	if you are razr owner you must have this
0	0	needless to say i wasted my money
0	0	what a waste of money and time
1	1	and the sound quality is great
1	0	he was very impressed when going from the original battery to the extended battery
0	0	if the two were seperated by a mere 5 ft i started to notice excessive static and garbled sound from the headset
1	1	very good quality though
0	0	the design is very odd as the ear clip is not very comfortable at all
1	1	highly recommend for any one who has a blue tooth phone
0	0	i advise everyone do not be fooled
1	1	