In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Input, Dense, concatenate
from keras.models import Model

ImportError: cannot import name 'pad_sequences' from 'keras.preprocessing.sequence' (/Users/aninditakundu/pytorch-test/env/lib/python3.8/site-packages/keras/preprocessing/sequence.py)

In [None]:
# Load pre-trained word embeddings
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Load data
data = pd.read_csv('sentiment_data.csv')

In [None]:
# Initialize sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Calculate sentiment score and sentence length for each sentence
sentiment_scores = []
sentence_lengths = []
for sentence in data['text']:
    sentiment_scores.append(sid.polarity_scores(sentence)['compound'])
    sentence_lengths.append(len(sentence.split()))

# Normalize sentiment score and sentence length
sentiment_scores = (np.array(sentiment_scores) - np.mean(sentiment_scores)) / np.std(sentiment_scores)
sentence_lengths = (np.array(sentence_lengths) - np.mean(sentence_lengths)) / np.std(sentence_lengths)

In [None]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
word_index = tokenizer.word_index
max_sequence_length = max(len(s) for s in sequences)

# Pad sequences to a fixed length
data = pad_sequences(sequences, maxlen=max_sequence_length)

In [None]:
# Create embedding matrix
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Create input layers for word embeddings, sentiment score, and sentence length
embedding_layer = Embedding(num_words,
                            100,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)
sentiment_input = Input(shape=(1,))
sentence_length_input = Input(shape=(1,))

In [None]:
# Apply embedding layer to input data
embedded_sequences = embedding_layer(data)

# Flatten embedding output
embedded_sequences = Flatten()(embedded_sequences)

In [None]:
# Concatenate word embeddings, sentiment score, and sentence length features
x = concatenate([embedded_sequences, sentiment_input, sentence_length_input])

In [None]:
# Add fully connected layers
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

In [None]:
# Define model
model = Model(inputs=[embedding_layer.input, sentiment_input, sentence_length_input], outputs=output)

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit([data, sentiment_scores, sentence_lengths], data['label'], validation_split=0.2, epochs=10, batch_size=32)

# Evaluate model
loss, accuracy = model.evaluate([data, sentiment_scores, sentence_lengths], data['label'], verbose=0)
print('Accuracy: %f' % (accuracy*100))


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential

# Load pre-trained word embeddings
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Load data
data = pd.read_csv('text_data.csv')

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
word_index = tokenizer.word_index
max_sequence_length = max(len(s) for s in sequences)

# Pad sequences to a fixed length
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Create embedding matrix
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Initialize embedding layer
embedding_layer = Embedding(num_words,
                            100,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)

# Define model architecture
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(data, data['label'], validation_split=0.2, epochs=10, batch_size=32)

# Evaluate model
loss, accuracy = model.evaluate(data, data['label'], verbose=0)
print('Accuracy: %f' % (accuracy*100))
