<a href="https://colab.research.google.com/github/2303A51786/nlp/blob/main/ASS6_NLP_1786.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv1D, GlobalMaxPooling1D, LSTM, Dropout

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load dataset
df = pd.read_csv("/content/tweets.csv")

# ================== 1. PREPROCESSING ==================
def clean_text(text):
    text = text.lower()
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove mentions
    text = re.sub(r'#', '', text)               # remove hashtags symbol
    text = re.sub(r'http\S+|www.\S+', '', text) # remove URLs
    text = re.sub(r'[^a-z\s]', '', text)       # remove punctuation & numbers
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(clean_text)

X = df['clean_text'].values
y = df['target'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ================== 2. FEATURE EXTRACTION ==================
# CountVectorizer
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Tokenizer + padding
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# ================== 3. MODELS ==================
# --- MLP on averaged embeddings ---
mlp_model = Sequential([
    Embedding(input_dim=max_words, output_dim=50, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
mlp_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
mlp_model.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_split=0.1, verbose=1)

# --- CNN ---
cnn_model = Sequential([
    Embedding(input_dim=max_words, output_dim=50, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_split=0.1, verbose=1)

# --- LSTM ---
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=50, input_length=max_len),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_split=0.1, verbose=1)

# ================== 4. EVALUATION ==================
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    print(f"===== {model_name} =====")
    print(classification_report(y_test, y_pred))

evaluate_model(mlp_model, X_test_pad, y_test, "MLP")
evaluate_model(cnn_model, X_test_pad, y_test, "CNN")
evaluate_model(lstm_model, X_test_pad, y_test, "LSTM")

# Logistic Regression with TF-IDF
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)
print("===== Logistic Regression (TF-IDF) =====")
print(classification_report(y_test, y_pred_lr))

# SVM with TF-IDF
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("===== SVM (TF-IDF) =====")
print(classification_report(y_test, y_pred_svm))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/3




[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7922 - loss: 0.4952 - val_accuracy: 0.7912 - val_loss: 0.5099
Epoch 2/3
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.8195 - loss: 0.4692 - val_accuracy: 0.7912 - val_loss: 0.5073
Epoch 3/3
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8140 - loss: 0.4738 - val_accuracy: 0.7912 - val_loss: 0.5053
Epoch 1/3
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 52ms/step - accuracy: 0.8023 - loss: 0.5127 - val_accuracy: 0.8505 - val_loss: 0.3644
Epoch 2/3
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.9079 - loss: 0.2434 - val_accuracy: 0.8923 - val_loss: 0.3043
Epoch 3/3
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.9707 - loss: 0.0937 - val_accuracy: 0.8890 - val_loss: 0.4007
Epoch 1/3
[1m128/128[0m [32m━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
===== CNN =====
              precision    recall  f1-score   support

           0       0.90      0.95      0.93      1851
           1       0.73      0.55      0.63       423

    accuracy                           0.88      2274
   macro avg       0.82      0.75      0.78      2274
weighted avg       0.87      0.88      0.87      2274

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
===== LSTM =====
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      1851
           1       0.00      0.00      0.00       423

    accuracy                           0.81      2274
   macro avg       0.41      0.50      0.45      2274
weighted avg       0.66      0.81      0.73      2274

===== Logistic Regression (TF-IDF) =====
              precision    recall  f1-score   support

           0       0.87      0.99      0.93      1851
           1   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


===== SVM (TF-IDF) =====
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      1851
           1       0.79      0.48      0.60       423

    accuracy                           0.88      2274
   macro avg       0.84      0.73      0.77      2274
weighted avg       0.87      0.88      0.87      2274

