In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import re
import os

# --- NEW: SETTING RANDOM SEEDS FOR REPRODUCIBILITY ---
# This is the most important step to ensure you get the same result every time.
os.environ['PYTHONHASHSEED'] = str(42)
tf.random.set_seed(42)
np.random.seed(42)

# Import the specific Keras components needed for our advanced model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
# Import Tokenizer and pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# --- 1. Data Loading and Preprocessing ---
try:
    df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks(DL)/datasets/Data.csv", encoding="ISO-8859-1")
    print("Successfully loaded Data.csv")
except FileNotFoundError:
    print("Error: 'Data.csv' not found. Please make sure you have uploaded the file to your Colab session.")
    exit()

text_columns = [f'Top{i}' for i in range(1, 26)]
df['Combined_Text'] = df[text_columns].astype(str).agg(' '.join, axis=1)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['Combined_Text'] = df['Combined_Text'].apply(clean_text)
df = df[['Combined_Text', 'Label']].dropna()

X = df['Combined_Text'].values
y = df['Label'].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- 2. Handle Class Imbalance ---
class_weights_values = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_encoded), y=y_encoded
)
class_weights = dict(enumerate(class_weights_values))
print(f"Calculated Class Weights: {class_weights}")

# --- 3. Tokenization and Padding ---
MAX_WORDS = 10000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# --- 4. Build the Advanced Multi-Branch 1D CNN Model ---
inputs = Input(shape=(MAX_LEN,))
embedding_layer = Embedding(input_dim=MAX_WORDS, output_dim=128)(inputs)

convs = []
kernel_sizes = [3, 4, 5]

for kernel_size in kernel_sizes:
    conv = Conv1D(filters=128, kernel_size=kernel_size, activation='relu')(embedding_layer)
    pool = GlobalMaxPooling1D()(conv)
    convs.append(pool)

merged = concatenate(convs)
dense1 = Dense(128, activation='relu')(merged)
dropout1 = Dropout(0.5)(dense1)
outputs = Dense(1, activation='sigmoid')(dropout1)

model = Model(inputs=inputs, outputs=outputs)

# --- 5. Compile the Model ---
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
model.summary()

# --- 6. Train the Model ---
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_data=(X_test, y_test),
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop]
)

# --- 7. Evaluation and Prediction ---
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nFinal Test Accuracy: {accuracy * 100:.2f}%")

def predict_sentiment(text):
    cleaned_text = clean_text(text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(sequence, maxlen=MAX_LEN, padding='post', truncating='post')
    prediction = model.predict(padded, verbose=0)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    confidence = prediction if sentiment == "Positive" else 1 - prediction
    print(f"Text: \"{text}\" --> Sentiment: {sentiment} (Confidence: {confidence * 100:.2f}%)")

print("\n--- Making Final Predictions ---")
predict_sentiment("A 'hindrance to operations': extracts from the leaked reports")
predict_sentiment("Stock prices soared after strong earnings report.")
predict_sentiment("Lessons of law's hard heart")
predict_sentiment("Victory and celebration in the city")
predict_sentiment("you are fired .")

Successfully loaded Data.csv
Calculated Class Weights: {0: np.float64(1.0596899224806202), 1: np.float64(0.9466759002770083)}


Epoch 1/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 246ms/step - accuracy: 0.4764 - loss: 0.7000 - val_accuracy: 0.4994 - val_loss: 0.6935
Epoch 2/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 240ms/step - accuracy: 0.6087 - loss: 0.6755 - val_accuracy: 0.4641 - val_loss: 0.6977
Epoch 3/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 233ms/step - accuracy: 0.8035 - loss: 0.5828 - val_accuracy: 0.4970 - val_loss: 0.7198
Epoch 4/20
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 238ms/step - accuracy: 0.9853 - loss: 0.2118 - val_accuracy: 0.5189 - val_loss: 0.7906

Final Test Accuracy: 49.94%

--- Making Final Predictions ---
Text: "A 'hindrance to operations': extracts from the leaked reports" --> Sentiment: Negative (Confidence: 50.82%)
Text: "Stock prices soared after strong earnings report." --> Sentiment: Negative (Confidence: 50.06%)
Text: "Lessons of law's hard heart" --> Sentiment: Positive

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import re
import os
import random

# --- Reproducibility ---
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# --- Preprocessing function ---
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove urls
    text = re.sub(r"[^a-zA-Z]", " ", text)  # keep only letters
    text = text.lower()
    return text

# Example dataset load
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks(DL)/datasets/Data.csv", encoding="ISO-8859-1")  # <-- replace with your dataset

# Combine 'Top' columns into 'Combined_Text'
text_columns = [f'Top{i}' for i in range(1, 26)]
df['Combined_Text'] = df[text_columns].astype(str).agg(' '.join, axis=1)

df['Combined_Text'] = df['Combined_Text'].apply(clean_text)

# Encode labels
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])  # 0=Negative, 1=Positive

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['Combined_Text'], df['Label'], test_size=0.2, random_state=seed, stratify=df['Label']
)

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB = 20000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# Class weights (handles imbalance)
weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(weights))

# --- Model Architecture with GRU ---
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout

EMBED_DIM = 100

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Bidirectional(GRU(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(GRU(64)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy']
)

# --- Callbacks ---
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=2
)

# --- Training ---
history = model.fit(
    X_train_seq, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# --- Evaluation ---
loss, acc = model.evaluate(X_test_seq, y_test, verbose=0)
print(f"\nFinal Test Accuracy: {acc * 100:.2f}%")

# --- Prediction function ---
def predict_sentiment(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    prediction = model.predict(padded)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    confidence = prediction if sentiment == "Positive" else 1 - prediction
    print(f"Text: \"{text}\" --> Sentiment: {sentiment} (Confidence: {confidence * 100:.2f}%)")

# --- Test Predictions ---
print("\n--- Making Final Predictions ---")
predict_sentiment("A 'hindrance to operations': extracts from the leaked reports")
predict_sentiment("Stock prices soared after strong earnings report.")
predict_sentiment("Lessons of law's hard heart")
predict_sentiment("Victory and celebration in the city")
predict_sentiment("you are fired .")

Epoch 1/20




[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 871ms/step - accuracy: 0.5479 - loss: 0.6921 - val_accuracy: 0.4802 - val_loss: 0.6935 - learning_rate: 0.0010
Epoch 2/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 823ms/step - accuracy: 0.5519 - loss: 0.6789 - val_accuracy: 0.4924 - val_loss: 0.8230 - learning_rate: 0.0010
Epoch 3/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 938ms/step - accuracy: 0.8456 - loss: 0.3688 - val_accuracy: 0.5137 - val_loss: 1.6114 - learning_rate: 0.0010
Epoch 4/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 833ms/step - accuracy: 0.9839 - loss: 0.0385 - val_accuracy: 0.5198 - val_loss: 1.7073 - learning_rate: 5.0000e-04

Final Test Accuracy: 46.77%

--- Making Final Predictions ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Text: "a  hindrance to operations   extracts from the leaked reports" --> Sentiment: Positive (Confidence: 50.24%)
[1m1/1[0m

In [5]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-09-28 15:54:47--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-09-28 15:54:47--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-09-28 15:54:47--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import re
import os
import random

# --- Reproducibility ---
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# --- Preprocessing function ---
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove urls
    text = re.sub(r"[^a-zA-Z]", " ", text)  # keep only letters
    text = text.lower()
    return text

# --- Load dataset ---
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks(DL)/datasets/Data.csv", encoding="ISO-8859-1")

# Combine 'Top' columns into one text column
text_columns = [f'Top{i}' for i in range(1, 26)]
df['Combined_Text'] = df[text_columns].astype(str).agg(' '.join, axis=1)
df['Combined_Text'] = df['Combined_Text'].apply(clean_text)

# Encode labels
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])  # 0=Negative, 1=Positive

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['Combined_Text'], df['Label'], test_size=0.2, random_state=seed, stratify=df['Label']
)

# --- Tokenization ---
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB = 20000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# --- Class weights (handles imbalance) ---
weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(weights))

# --- Download & Load GloVe embeddings ---

embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector

embedding_matrix = np.zeros((MAX_VOCAB, 100))
for word, i in tokenizer.word_index.items():
    if i < MAX_VOCAB:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# --- Model Architecture with GRU + GloVe ---
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.regularizers import l2

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=100, weights=[embedding_matrix],
              input_length=MAX_LEN, trainable=False),
    Dropout(0.3),
    Bidirectional(GRU(64, return_sequences=True, kernel_regularizer=l2(1e-4))),
    Dropout(0.4),
    Bidirectional(GRU(32, kernel_regularizer=l2(1e-4))),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy']
)

# --- Callbacks ---
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3
)

# --- Training ---
history = model.fit(
    X_train_seq, y_train,
    validation_split=0.2,
    epochs=25,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# --- Evaluation ---
loss, acc = model.evaluate(X_test_seq, y_test, verbose=0)
print(f"\nFinal Test Accuracy: {acc * 100:.2f}%")

# --- Prediction function ---
def predict_sentiment(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    prediction = model.predict(padded)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    confidence = prediction if sentiment == "Positive" else 1 - prediction
    print(f"Text: \"{text}\" --> Sentiment: {sentiment} (Confidence: {confidence * 100:.2f}%)")

# --- Test Predictions ---
print("\n--- Making Final Predictions ---")
predict_sentiment("A 'hindrance to operations': extracts from the leaked reports")
predict_sentiment("Stock prices soared after strong earnings report.")
predict_sentiment("Lessons of law's hard heart")
predict_sentiment("Victory and celebration in the city")
predict_sentiment("you are fired .")


Epoch 1/25




[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 225ms/step - accuracy: 0.4758 - loss: 0.7530 - val_accuracy: 0.5198 - val_loss: 0.7404 - learning_rate: 1.0000e-04
Epoch 2/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 206ms/step - accuracy: 0.5082 - loss: 0.7397 - val_accuracy: 0.5107 - val_loss: 0.7396 - learning_rate: 1.0000e-04
Epoch 3/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 217ms/step - accuracy: 0.4892 - loss: 0.7405 - val_accuracy: 0.5091 - val_loss: 0.7386 - learning_rate: 1.0000e-04
Epoch 4/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 204ms/step - accuracy: 0.4873 - loss: 0.7401 - val_accuracy: 0.5061 - val_loss: 0.7377 - learning_rate: 1.0000e-04
Epoch 5/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 200ms/step - accuracy: 0.5071 - loss: 0.7379 - val_accuracy: 0.5000 - val_loss: 0.7370 - learning_rate: 1.0000e-04
Epoch 6/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import re
import os
import random

# --- Reproducibility ---
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# --- Preprocessing function ---
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove urls
    text = re.sub(r"[^a-zA-Z]", " ", text)  # keep only letters
    text = text.lower()
    return text

# --- Load dataset ---
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks(DL)/datasets/Data.csv", encoding="ISO-8859-1")

# Combine 'Top' columns into one text column
text_columns = [f'Top{i}' for i in range(1, 26)]
df['Combined_Text'] = df[text_columns].astype(str).agg(' '.join, axis=1)
df['Combined_Text'] = df['Combined_Text'].apply(clean_text)

# Encode labels
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])  # 0=Negative, 1=Positive

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['Combined_Text'], df['Label'], test_size=0.2, random_state=seed, stratify=df['Label']
)

# --- Tokenization ---
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB = 50000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# --- Class weights (handles imbalance) ---
weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(weights))

# --- Download & Load GloVe embeddings ---

embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector

embedding_matrix = np.zeros((MAX_VOCAB, 100))
for word, i in tokenizer.word_index.items():
    if i < MAX_VOCAB:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# --- Model Architecture with GRU + GloVe ---
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.regularizers import l2

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=100, weights=[embedding_matrix],
              input_length=MAX_LEN, trainable=True),
    Dropout(0.3),
    Bidirectional(GRU(64, return_sequences=True, kernel_regularizer=l2(1e-4))),
    Dropout(0.4),
    Bidirectional(GRU(32, kernel_regularizer=l2(1e-4))),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy']
)

# --- Callbacks ---
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3
)

# --- Training ---
history = model.fit(
    X_train_seq, y_train,
    validation_split=0.2,
    epochs=25,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# --- Evaluation ---
loss, acc = model.evaluate(X_test_seq, y_test, verbose=0)
print(f"\nFinal Test Accuracy: {acc * 100:.2f}%")

# --- Prediction function ---
def predict_sentiment(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    prediction = model.predict(padded)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    confidence = prediction if sentiment == "Positive" else 1 - prediction
    print(f"Text: \"{text}\" --> Sentiment: {sentiment} (Confidence: {confidence * 100:.2f}%)")

# --- Test Predictions ---
print("\n--- Making Final Predictions ---")
predict_sentiment("A 'hindrance to operations': extracts from the leaked reports")
predict_sentiment("Stock prices soared after strong earnings report.")
predict_sentiment("Lessons of law's hard heart")
predict_sentiment("Victory and celebration in the city")
predict_sentiment("you are fired .")



Epoch 1/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 473ms/step - accuracy: 0.4920 - loss: 0.7482 - val_accuracy: 0.4924 - val_loss: 0.7429 - learning_rate: 1.0000e-04
Epoch 2/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 466ms/step - accuracy: 0.5008 - loss: 0.7415 - val_accuracy: 0.4924 - val_loss: 0.7399 - learning_rate: 1.0000e-04
Epoch 3/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 465ms/step - accuracy: 0.5103 - loss: 0.7394 - val_accuracy: 0.5091 - val_loss: 0.7382 - learning_rate: 1.0000e-04
Epoch 4/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 464ms/step - accuracy: 0.5029 - loss: 0.7359 - val_accuracy: 0.5030 - val_loss: 0.7375 - learning_rate: 1.0000e-04
Epoch 5/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 462ms/step - accuracy: 0.5035 - loss: 0.7377 - val_accuracy: 0.5137 - val_loss: 0.7371 - learning_rate: 1.0000e-04
Epoch 6/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import re
import os
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.regularizers import l2

# --- Reproducibility ---
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# --- Preprocessing function ---
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove urls
    text = re.sub(r"[^a-zA-Z]", " ", text)  # keep only letters
    text = text.lower()
    return text

# --- Load dataset ---
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks(DL)/datasets/Data.csv", encoding="ISO-8859-1")

# Combine 'Top' columns into one text column
text_columns = [f'Top{i}' for i in range(1, 26)]
df['Combined_Text'] = df[text_columns].astype(str).agg(' '.join, axis=1)
df['Combined_Text'] = df['Combined_Text'].apply(clean_text)

# Encode labels
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])  # 0=Negative, 1=Positive

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['Combined_Text'], df['Label'], test_size=0.2, random_state=seed, stratify=df['Label']
)

# --- Tokenization ---
MAX_VOCAB = 40000  # Increased vocabulary size
MAX_LEN = 150      # Increased max sequence length

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_LEN)

# --- Class weights (handles imbalance) ---
weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(weights))

# --- Download & Load GloVe embeddings ---
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector

embedding_matrix = np.zeros((MAX_VOCAB, 100))
for word, i in tokenizer.word_index.items():
    if i < MAX_VOCAB:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# --- Model Architecture with GRU + GloVe ---
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=100, weights=[embedding_matrix],
              input_length=MAX_LEN, trainable=True), # Changed trainable to True
    Bidirectional(GRU(64, return_sequences=True, kernel_regularizer=l2(1e-4))),
    Dropout(0.4),
    Bidirectional(GRU(32, kernel_regularizer=l2(1e-4))),
    Dropout(0.4), # Increased Dropout
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4), # Increased learning rate
    metrics=['accuracy']
)

# --- Callbacks ---
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3
)

# --- Training ---
history = model.fit(
    X_train_seq, y_train,
    validation_split=0.2,
    epochs=25,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# --- Evaluation ---
loss, acc = model.evaluate(X_test_seq, y_test, verbose=0)
print(f"\nFinal Test Accuracy: {acc * 100:.2f}%")

# --- Prediction function ---
def predict_sentiment(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    prediction = model.predict(padded)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    confidence = prediction if sentiment == "Positive" else 1 - prediction
    print(f"Text: \"{text}\" --> Sentiment: {sentiment} (Confidence: {confidence * 100:.2f}%)")

# --- Test Predictions ---
print("\n--- Making Final Predictions ---")
predict_sentiment("A 'hindrance to operations': extracts from the leaked reports")
predict_sentiment("Stock prices soared after strong earnings report.")
predict_sentiment("Lessons of law's hard heart")
predict_sentiment("Victory and celebration in the city")
predict_sentiment("you are fired .")

--2025-09-19 08:21:05--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-09-19 08:21:05--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-09-19 08:21:05--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202



Epoch 1/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 411ms/step - accuracy: 0.4922 - loss: 0.7559 - val_accuracy: 0.4482 - val_loss: 0.7407 - learning_rate: 5.0000e-04
Epoch 2/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 393ms/step - accuracy: 0.4890 - loss: 0.7400 - val_accuracy: 0.4817 - val_loss: 0.7361 - learning_rate: 5.0000e-04
Epoch 3/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 368ms/step - accuracy: 0.5120 - loss: 0.7327 - val_accuracy: 0.5198 - val_loss: 0.7304 - learning_rate: 5.0000e-04
Epoch 4/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 363ms/step - accuracy: 0.5427 - loss: 0.7252 - val_accuracy: 0.4802 - val_loss: 0.7387 - learning_rate: 5.0000e-04
Epoch 5/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 379ms/step - accuracy: 0.5427 - loss: 0.7165 - val_accuracy: 0.4787 - val_loss: 0.7360 - learning_rate: 5.0000e-04
Epoch 6/25
[1m82/82[0m [32m━━━━━━━━━━━━━━━