#1. Load the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, SpatialDropout1D, Bidirectional, Dropout, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Location of the Excel workbook holding the headlines and their labels.
# NOTE(review): '.excel' is not a standard Excel extension; pandas may be
# unable to pick a reader engine from it — confirm the real file name.
file_path = '/content/2021-2024 final_New.excel'
df = pd.read_excel(file_path)

# Display the first few rows to understand the structure
print("Dataset Preview:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()


#2. Data Preprocessing

In [None]:
# Filter the raw frame in a single pipeline:
#   1. discard rows that lack a 'Title' or a 'Decision' value,
#   2. parse 'Date' into datetimes (unparseable entries become NaT),
#   3. discard rows whose date could not be parsed,
#   4. keep only the first row for every distinct 'Title'.
# The trailing copy() gives later cells an independent frame to add
# columns to.
df = (
    df.dropna(subset=['Title', 'Decision'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .dropna(subset=['Date'])
    .drop_duplicates(subset=['Title'])
    .copy()
)

# English stop words from NLTK, held in a set for fast membership tests
# during text cleaning.
stop_words = set(stopwords.words('english'))

# Compiled once at import time so they are not looked up again per title.
_URL_PATTERN = re.compile(r'http\S+')
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')


def clean_text(text, stopword_set=None):
    """Normalise a headline for vectorisation.

    Steps, in order: strip URLs, delete every character that is not an
    ASCII letter or whitespace, lowercase, then drop stop words.

    Args:
        text: The raw title. Non-string values (e.g. a numeric Excel cell)
            are converted with ``str``; ``None`` and float NaN are treated
            as an empty title.
        stopword_set: Optional collection of lowercase words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned, lowercase, single-space-separated string (possibly
        empty).
    """
    # Missing cells carry no text; str() would turn them into "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if stopword_set is None:
        stopword_set = stop_words

    text = _URL_PATTERN.sub('', text)  # Remove URLs
    text = _NON_LETTER_PATTERN.sub('', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    # Punctuation is deleted before this filter, so contractions such as
    # "don't" arrive here as "dont" and only match if the set contains
    # that apostrophe-free form.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Run every title through clean_text and store the result in a new column.
df['Cleaned_Title'] = df['Title'].map(clean_text)

# Show the first rows of the columns that matter for modelling.
preview_columns = ['Date', 'Cleaned_Title', 'Decision']
cleaned_preview = df[preview_columns].head()
print("\nCleaned Data Sample:")
print(cleaned_preview)


In [None]:
# Vectorize the cleaned text using TF-IDF, keeping at most 5000 terms.
# toarray() turns the sparse result into a dense NumPy array.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Title']).toarray()

# Encode the target variable ('Decision') as 1 = positive, 0 = anything else.
# The comparison is case- and whitespace-insensitive and accepts both the
# correct spelling and the 'Positve' misspelling the original check relied
# on, so a differently spelled positive value no longer silently becomes 0.
positive_spellings = {'positive', 'positve'}
df['Label'] = df['Decision'].apply(
    lambda x: 1 if str(x).strip().lower() in positive_spellings else 0
)
y = df['Label']

# Show the class balance so an all-zero (or all-one) target is spotted early.
print("\nLabel Distribution:")
print(y.value_counts())

# Split the dataset into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData Split Complete")
print(f"Training Samples: {len(X_train)}, Testing Samples: {len(X_test)}")


In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
# Declared first so that every later use refers to the same values instead
# of repeating the literals.
MAX_VOCAB_SIZE = 5000  # Number of words in the vocabulary
MAX_SEQUENCE_LENGTH = 100  # Length of each input sequence
EMBEDDING_DIM = 128  # Embedding output size

# Tokenize and pad sequences for LSTM input
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df['Cleaned_Title'])
X_seq = tokenizer.texts_to_sequences(df['Cleaned_Title'])
X_padded = pad_sequences(X_seq, maxlen=MAX_SEQUENCE_LENGTH)

# Split padded data into training and testing sets.
# This rebinds X_train/X_test/y_train/y_test to the sequence representation.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Build the LSTM model: embedding -> spatial dropout -> two stacked
# bidirectional LSTMs -> single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(SpatialDropout1D(0.2))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; 10% of the training data is held out for validation.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.2f}")


In [None]:
# Make predictions on the test set: threshold the sigmoid output at 0.5 and
# flatten to a 1-D vector so it has the same shape as y_test.
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()

# Print evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Plot accuracy and loss curves side by side. The training history holds
# both, and the 12x5 figure leaves room for two panels.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Persist the trained network to disk in HDF5 (.h5) format.
model_output_path = 'sentiment_model.h5'
model.save(model_output_path)
print("Model saved successfully as 'sentiment_model.h5'.")