<a href="https://colab.research.google.com/github/AKHIRANANDHINI/NLP/blob/main/Lab%205.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import re
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, optimizers, callbacks
import nltk
from nltk.corpus import stopwords
import zipfile
nltk.download('stopwords')

# --- Hyperparameters shared by the tokenizer and the Keras models ---
MAX_NUM_WORDS = 30_000       # vocabulary cap handed to the Keras Tokenizer
MAX_SEQUENCE_LENGTH = 40     # every token sequence is padded/truncated to this length
EMBEDDING_DIM = 100          # width of the learned word embeddings
BATCH_SIZE = 64
EPOCHS = 8                   # upper bound; early stopping may end training sooner
VALIDATION_SPLIT = 0.1       # fraction of the training data held out during fit()

# --- Reproducibility: seed every RNG the script touches ---
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Read 'Tweets.csv' directly from the downloaded archive (no extraction to disk).
with zipfile.ZipFile("/content/archive (1).zip") as archive, archive.open("Tweets.csv") as csv_file:
    df = pd.read_csv(csv_file)

# Keep only the tweet body and its sentiment label (renamed to 'target'),
# discarding rows where either value is missing.
df = (
    df.rename(columns={'airline_sentiment': 'target'})[['text', 'target']]
    .dropna()
    .reset_index(drop=True)
)

# NLTK's English stopword list, held as a set for fast membership checks.
STOPWORDS = set(stopwords.words('english'))
# Substitution passes applied, in order, to the lower-cased tweet. Compiled once
# because clean_tweet is applied to every row of the dataset.
_TWEET_NOISE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'@\w+',               # user mentions
        r'http\S+|www\.\S+',   # URLs; the dot is escaped so words like "awww" are not treated as a URL
        r'#\w+',               # hashtags (the tag text is removed as well)
        r'\d+',                # digit runs
        r'[^\w\s]',            # any remaining punctuation / symbols
    )
)


def clean_tweet(text, stop_words=None):
    """Normalise a raw tweet into a space-separated string of content tokens.

    The text is lower-cased; mentions, URLs, hashtags, digits and punctuation
    are replaced by spaces; the result is split on whitespace, and stopwords
    and single-character tokens are dropped.

    Args:
        text: The raw tweet. Anything that is not a ``str`` yields ``''``.
        stop_words: Optional container of tokens to discard. Defaults to the
            module-level ``STOPWORDS`` set.

    Returns:
        The cleaned tweet as a single string (possibly empty).
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = STOPWORDS

    cleaned = text.lower()
    for pattern in _TWEET_NOISE_PATTERNS:
        cleaned = pattern.sub(' ', cleaned)

    return ' '.join(
        token for token in cleaned.split()
        if len(token) > 1 and token not in stop_words
    )

# Clean every tweet once; progress_apply is the tqdm-instrumented version of apply.
tqdm.pandas()
df['clean_text'] = df['text'].progress_apply(clean_tweet)

# The metrics below use average='binary' and the Keras models end in a single
# sigmoid unit, so the task is restricted to positive (1) vs. negative (0);
# every other label (e.g. 'neutral') is dropped. Multi-class would need changes
# to the label mapping, the metrics and the output layers.
df_binary = df[df['target'].isin(['positive', 'negative'])].copy()
df_binary['target'] = df_binary['target'].map({'positive': 1, 'negative': 0})

X = df_binary['clean_text'].values
y = df_binary['target'].values

# 80/20 split, stratified so both sides keep the same class balance.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

# Unigram + bigram count and TF-IDF features; both vectorizers are fitted on
# the training texts only and then applied to the test texts.
cv = CountVectorizer(max_features=20000, ngram_range=(1,2))
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
X_train_cv = cv.fit_transform(X_train_raw)
X_test_cv = cv.transform(X_test_raw)
X_train_tfidf = tfidf.fit_transform(X_train_raw)
X_test_tfidf = tfidf.transform(X_test_raw)

def eval_and_print(model, X_test, y_test, name):
    """Score a fitted classifier on the test set, print a one-line report, return the metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features accepted by ``model.predict``.
        y_test: True binary labels for ``X_test``.
        name: Label used in the printed line and stored under ``'model'``.

    Returns:
        Dict with keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    predictions = model.predict(X_test)
    # Precision/recall/F1 are computed for the positive class only (average='binary').
    p, r, f1, _support = precision_recall_fscore_support(
        y_test, predictions, average='binary', zero_division=0
    )
    acc = accuracy_score(y_test, predictions)
    print(f"{name} -> Acc: {acc:.4f}, Precision: {p:.4f}, Recall: {r:.4f}, F1: {f1:.4f}")
    return dict(model=name, accuracy=acc, precision=p, recall=r, f1=f1)

# Classical baselines on the TF-IDF features; both re-weight classes to offset imbalance.
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
svm = LinearSVC(max_iter=2000, class_weight='balanced', random_state=RANDOM_SEED)

# One metrics dict per model is collected here, in training order.
results = []
for baseline_name, baseline in (("LogisticRegression-TFIDF", lr), ("LinearSVC-TFIDF", svm)):
    baseline.fit(X_train_tfidf, y_train)
    results.append(eval_and_print(baseline, X_test_tfidf, y_test, baseline_name))

# Integer-encode the tweets for the Keras models. The tokenizer is fitted on the
# training texts only.
# NOTE(review): the empty-string OOV token looks like a '<OOV>' placeholder that
# was lost when the notebook was exported — confirm.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='')
tokenizer.fit_on_texts(X_train_raw)
X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_test_seq = tokenizer.texts_to_sequences(X_test_raw)
# Pad and truncate at the end so every sequence has exactly MAX_SEQUENCE_LENGTH ids.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Number of embedding rows handed to the model builders, capped at MAX_NUM_WORDS.
word_index = tokenizer.word_index
vocab_size = min(MAX_NUM_WORDS, len(word_index) + 1)

from sklearn.utils import class_weight
# 'balanced' weights for the Keras models. Each weight is keyed by its actual
# class label (not by its position in the array), so the mapping stays correct
# even if the labels are not exactly 0..k-1.
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}

def compute_metrics_from_probs(probs, y_true, threshold=0.5):
    """Turn positive-class probabilities into hard labels and score them.

    Args:
        probs: Array of probabilities; must support ``>=`` and ``.astype``.
        y_true: True binary labels aligned with ``probs``.
        threshold: Probabilities at or above this value are labelled 1.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; the last three are for the
        positive class (``average='binary'``).
    """
    hard_labels = (probs >= threshold).astype(int)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, hard_labels, average='binary', zero_division=0
    )
    return accuracy_score(y_true, hard_labels), precision, recall, f_score

def train_and_evaluate_keras(model, X_tr, y_tr, X_te, y_te, name, epochs=EPOCHS, batch_size=BATCH_SIZE):
    """Fit a compiled Keras model with early stopping, then score it on the test set.

    Args:
        model: Compiled Keras model whose output flattens to one probability per sample.
        X_tr, y_tr: Training inputs and binary labels.
        X_te, y_te: Test inputs and binary labels.
        name: Label used in the printed line and stored under ``'model'``.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size used for training.

    Returns:
        Dict with keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``history`` (the object returned by ``model.fit``).
    """
    # Stop once val_loss has not improved for 2 epochs and roll back to the best weights.
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    fit_options = dict(
        validation_split=VALIDATION_SPLIT,
        epochs=epochs,
        batch_size=batch_size,
        class_weight=class_weight_dict,  # module-level balanced class weights
        callbacks=[early_stop],
        verbose=1,
    )
    history = model.fit(X_tr, y_tr, **fit_options)

    # Flatten the prediction array to one probability per test sample.
    probs = model.predict(X_te, batch_size=128).ravel()
    acc, p, r, f1 = compute_metrics_from_probs(probs, y_te)
    print(f"{name} -> Acc: {acc:.4f}, Precision: {p:.4f}, Recall: {r:.4f}, F1: {f1:.4f}")

    summary = {'model': name, 'accuracy': acc, 'precision': p, 'recall': r, 'f1': f1}
    summary['history'] = history
    return summary

def build_mlp_avg(vocab_size, embedding_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH):
    """Build and compile an averaged-embedding MLP binary classifier.

    Token embeddings are averaged over the sequence axis and passed through two
    ReLU dense layers with dropout, ending in a single sigmoid unit.

    Args:
        vocab_size: Number of rows in the embedding matrix.
        embedding_dim: Size of each embedding vector.
        input_length: Length of the padded input sequences.

    Returns:
        A compiled Keras model (Adam, binary cross-entropy, accuracy metric).
    """
    inp = layers.Input(shape=(input_length,))
    # The sequence length is already fixed by the Input layer; Embedding's
    # `input_length` argument is deprecated in current Keras, so it is not passed.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inp)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

mlp_model = build_mlp_avg(vocab_size=vocab_size)
results.append(train_and_evaluate_keras(mlp_model, X_train_pad, y_train, X_test_pad, y_test, "MLP-AverageEmb"))

def build_cnn1d(vocab_size, embedding_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH):
    """Build and compile a multi-kernel 1D-CNN binary classifier.

    Three parallel Conv1D branches (kernel sizes 2, 3 and 4, 128 filters each)
    run over the embedded sequence; each is global-max-pooled, the results are
    concatenated and fed through a dense layer to a single sigmoid unit.

    Args:
        vocab_size: Number of rows in the embedding matrix.
        embedding_dim: Size of each embedding vector.
        input_length: Length of the padded input sequences.

    Returns:
        A compiled Keras model (Adam, binary cross-entropy, accuracy metric).
    """
    inp = layers.Input(shape=(input_length,))
    # The sequence length is already fixed by the Input layer; Embedding's
    # `input_length` argument is deprecated in current Keras, so it is not passed.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inp)
    convs = []
    for fsz in [2, 3, 4]:
        c = layers.Conv1D(filters=128, kernel_size=fsz, activation='relu')(x)
        c = layers.GlobalMaxPooling1D()(c)
        convs.append(c)
    x = layers.concatenate(convs)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

cnn_model = build_cnn1d(vocab_size=vocab_size)
results.append(train_and_evaluate_keras(cnn_model, X_train_pad, y_train, X_test_pad, y_test, "CNN1D"))

def build_lstm(vocab_size, embedding_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH):
    """Build and compile a bidirectional-LSTM binary classifier.

    Embeddings pass through spatial dropout and a 128-unit bidirectional LSTM
    that returns only its final state, followed by a dense layer and a single
    sigmoid unit.

    Args:
        vocab_size: Number of rows in the embedding matrix.
        embedding_dim: Size of each embedding vector.
        input_length: Length of the padded input sequences.

    Returns:
        A compiled Keras model (Adam, binary cross-entropy, accuracy metric).
    """
    inp = layers.Input(shape=(input_length,))
    # The sequence length is already fixed by the Input layer; Embedding's
    # `input_length` argument is deprecated in current Keras, so it is not passed.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inp)
    x = layers.SpatialDropout1D(0.2)(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.25)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

lstm_model = build_lstm(vocab_size=vocab_size)
results.append(train_and_evaluate_keras(lstm_model, X_train_pad, y_train, X_test_pad, y_test, "BiLSTM"))

# Print all models ranked by F1 and persist the same metrics to CSV.
print("\n=== RESULTS SUMMARY ==")
res_df = pd.DataFrame(results)
metric_cols = ['model', 'accuracy', 'precision', 'recall', 'f1']
print(res_df[metric_cols].sort_values(by='f1', ascending=False).to_string(index=False))
# The Keras result dicts also carry a 'history' object, which would be written
# to the CSV as an opaque object repr (and as NaN for the sklearn rows), so
# only the metric columns are saved.
res_df.to_csv('model_comparison_results.csv', index=False, columns=metric_cols)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 14640/14640 [00:00<00:00, 64845.85it/s]


LogisticRegression-TFIDF -> Acc: 0.8982, Precision: 0.7439, Recall: 0.7674, F1: 0.7555
LinearSVC-TFIDF -> Acc: 0.9078, Precision: 0.7826, Recall: 0.7611, F1: 0.7717
Epoch 1/8




[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.4780 - loss: 0.6796 - val_accuracy: 0.7846 - val_loss: 0.5406
Epoch 2/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.7996 - loss: 0.4347 - val_accuracy: 0.8214 - val_loss: 0.4131
Epoch 3/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.8854 - loss: 0.2652 - val_accuracy: 0.8636 - val_loss: 0.3212
Epoch 4/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.9073 - loss: 0.2148 - val_accuracy: 0.8939 - val_loss: 0.2587
Epoch 5/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.9337 - loss: 0.1637 - val_accuracy: 0.8896 - val_loss: 0.2747
Epoch 6/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9440 - loss: 0.1456 - val_accuracy: 0.8864 - val_loss: 0.2941
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━



[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 52ms/step - accuracy: 0.7691 - loss: 0.5795 - val_accuracy: 0.9004 - val_loss: 0.2693
Epoch 2/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 61ms/step - accuracy: 0.9083 - loss: 0.2376 - val_accuracy: 0.9004 - val_loss: 0.2660
Epoch 3/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 55ms/step - accuracy: 0.9549 - loss: 0.1176 - val_accuracy: 0.9037 - val_loss: 0.3017
Epoch 4/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 54ms/step - accuracy: 0.9774 - loss: 0.0602 - val_accuracy: 0.9015 - val_loss: 0.3609
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step
CNN1D -> Acc: 0.8870, Precision: 0.6956, Recall: 0.7970, F1: 0.7429
Epoch 1/8




[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 178ms/step - accuracy: 0.7514 - loss: 0.5650 - val_accuracy: 0.8831 - val_loss: 0.2939
Epoch 2/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 172ms/step - accuracy: 0.9067 - loss: 0.2282 - val_accuracy: 0.8918 - val_loss: 0.2870
Epoch 3/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 184ms/step - accuracy: 0.9481 - loss: 0.1410 - val_accuracy: 0.8929 - val_loss: 0.3143
Epoch 4/8
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 171ms/step - accuracy: 0.9648 - loss: 0.1011 - val_accuracy: 0.8950 - val_loss: 0.3847
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 135ms/step
BiLSTM -> Acc: 0.8670, Precision: 0.6277, Recall: 0.8626, F1: 0.7266

=== RESULTS SUMMARY ==
                   model  accuracy  precision   recall       f1
         LinearSVC-TFIDF  0.907752   0.782609 0.761099 0.771704
LogisticRegression-TFIDF  0.898224   0.743852 0.767442 0.7