In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset
# The CSV has no header row; its six columns are, in order:
# target, ids, date, flag, user, text
DATA_PATH = 'training.1600000.processed.noemoticon.csv'
CSV_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Parse only the two columns the model needs (target: 0 = negative, 4 = positive).
# Selecting them at read time avoids materialising the four unused columns and
# yields a fresh frame, so the assignment below is not a write into a slice of
# another DataFrame (which would raise SettingWithCopyWarning).
df = pd.read_csv(
    DATA_PATH,
    encoding='latin-1',
    header=None,
    names=CSV_COLUMNS,
    usecols=['target', 'text'],
)

# Convert target 4 to 1 (binary classification: 0 = negative, 1 = positive)
df['target'] = df['target'].replace(4, 1)

In [None]:
# Preprocessing
VOCAB_SIZE = 5000  # passed to Tokenizer(num_words=...): caps the token ids it emits
MAX_LEN = 100      # every tweet is padded/truncated to this many token ids

# Split the raw text *before* fitting the tokenizer so the vocabulary is learned
# from training tweets only; fitting on the full corpus would let test tweets
# influence which words are in-vocabulary (data leakage).
# test_size and random_state are unchanged from the original split.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

# Words outside the vocabulary are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(text_train)


def texts_to_padded(texts):
    """Turn an iterable of strings into a (len(texts), MAX_LEN) integer array.

    Each string is tokenized with the fitted ``tokenizer`` and then
    post-padded with zeros to ``MAX_LEN`` entries (longer sequences are
    truncated by ``pad_sequences``).
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, padding='post', maxlen=MAX_LEN)


X_train = texts_to_padded(text_train)
X_test = texts_to_padded(text_test)

In [None]:
# Build the model
# Sizes must agree with preprocessing: the tokenizer is created with
# num_words=5000 and sequences are padded to length 100.
model = tf.keras.Sequential([
    # mask_zero=True marks index 0 as padding. The inputs are zero-padded at the
    # end ('post'), so without a mask the LSTMs would keep stepping through the
    # trailing pad positions and the final state would be taken after them.
    tf.keras.layers.Embedding(input_dim=5000, output_dim=64, input_length=100, mask_zero=True),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: output is the predicted probability of class 1.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model
# Fit for five epochs; the held-out split is scored after every epoch and the
# per-epoch metrics are kept in `history`.
holdout = (X_test, y_test)
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=holdout,
    verbose=2,
)

In [None]:
# Evaluate the model
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy}")

In [None]:
# Save the model
model.save('sentiment_analysis_model.h5')

# Save the fitted tokenizer next to it. The network only understands the integer
# ids this tokenizer produces, so a reloaded model is unusable for prediction in
# a new session unless the same word index can be restored as well
# (tf.keras.preprocessing.text.tokenizer_from_json reads this file back).
with open('sentiment_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

In [None]:
# Example usage for prediction:
# Load the model
# model = tf.keras.models.load_model('sentiment_analysis_model.h5')

# Sample tweet for prediction
# sample_tweet = ["I love this! It's awesome."]
# sample_seq = tokenizer.texts_to_sequences(sample_tweet)
# sample_padded = pad_sequences(sample_seq, padding='post', maxlen=100)

# Predict
# prediction = model.predict(sample_padded)
# print(f"Sentiment: {'Positive' if prediction[0][0] > 0.5 else 'Negative'}")