<a href="https://colab.research.google.com/github/2303A51526/NLP/blob/main/ASSIGNMENT_12_09_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import re
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, optimizers, callbacks
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that the tweet cleaner relies on.
nltk.download('stopwords')

# Tunable hyperparameters shared by every model below.
MAX_NUM_WORDS = 30000        # vocabulary cap handed to the Keras Tokenizer
MAX_SEQUENCE_LENGTH = 40     # sequence length after padding/truncation
EMBEDDING_DIM = 100          # width of the learned word embeddings
BATCH_SIZE = 64
EPOCHS = 8
VALIDATION_SPLIT = 0.1       # fraction of the training data held out by model.fit

# Seed NumPy, Python's `random` and TensorFlow with one shared value.
RANDOM_SEED = 42
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_SEED)

# Load the tweets and keep only rows where both the text and the label are present.
df = (
    pd.read_csv('/content/archive (1).zip')
    .loc[:, ['text', 'target']]
    .dropna()
    .reset_index(drop=True)
)

# English stop words, held in a set for fast membership tests while cleaning.
STOPWORDS = set(stopwords.words('english'))
def clean_tweet(text, stop_words=None):
    """Normalise a raw tweet into a space-separated string of content tokens.

    Steps, in order: lower-case; drop @mentions, URLs, #hashtags, digit runs
    and punctuation; split on whitespace; discard stop words and
    single-character tokens.

    Args:
        text: Raw tweet. Anything that is not a ``str`` (e.g. NaN) yields ''.
        stop_words: Optional collection of lower-case tokens to drop.
            Defaults to the module-level ``STOPWORDS`` set.

    Returns:
        The cleaned tweet; '' when nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = STOPWORDS
    text = text.lower()
    text = re.sub(r'@\w+', ' ', text)
    # The dot after "www" is escaped so that only real "www." prefixes count
    # as URLs; unescaped, it also swallowed words such as "awwwww".
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    return ' '.join(
        token for token in text.split()
        if len(token) > 1 and token not in stop_words
    )

# Register `progress_apply` on pandas objects, then clean every tweet.
tqdm.pandas()
df['clean_text'] = df['text'].progress_apply(clean_tweet)

# Stratified 80/20 hold-out split of the cleaned texts and their labels.
X, y = df['clean_text'].values, df['target'].values
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)

# Unigram + bigram count features, vocabulary capped at 20,000 terms.
# The vectorizer is fitted on the training split only, then applied to the test split.
cv = CountVectorizer(ngram_range=(1, 2), max_features=20000)
X_train_cv = cv.fit_transform(X_train_raw)
X_test_cv = cv.transform(X_test_raw)

# Unigram + bigram TF-IDF features, vocabulary capped at 30,000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)
X_train_tfidf = tfidf.fit_transform(X_train_raw)
X_test_tfidf = tfidf.transform(X_test_raw)

def eval_and_print(model, X_test, y_test, name):
    """Score a fitted classifier on held-out data, then print and return the metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features accepted by ``model.predict``.
        y_test: True labels for ``X_test``.
        name: Label used in the printed line and stored under ``'model'``.

    Returns:
        dict with keys 'model', 'accuracy', 'precision', 'recall' and 'f1'.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_test, predictions, average='binary', zero_division=0
    )
    print(f"{name} -> Acc: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f_score:.4f}")
    return dict(model=name, accuracy=accuracy, precision=precision, recall=recall, f1=f_score)

# Classical baselines: two linear models trained on the TF-IDF features, both
# with balanced class weights. Each model's metrics are appended to `results`.
results = []
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
svm = LinearSVC(max_iter=2000, class_weight='balanced', random_state=RANDOM_SEED)
for _label, _clf in (("LogisticRegression-TFIDF", lr), ("LinearSVC-TFIDF", svm)):
    _clf.fit(X_train_tfidf, y_train)
    results.append(eval_and_print(_clf, X_test_tfidf, y_test, _label))

# Turn the cleaned tweets into fixed-length integer sequences for the Keras models.
# The tokenizer is fitted on the training split only. Words it does not know are
# mapped to an explicit out-of-vocabulary token; the previous empty-string token
# made such words disappear whenever sequences were decoded back to text.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train_raw)
X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_test_seq = tokenizer.texts_to_sequences(X_test_raw)

# Pad/truncate at the end of each sequence so every row has MAX_SEQUENCE_LENGTH ids.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

word_index = tokenizer.word_index
# Number of embedding rows: one per known word plus one for index 0, capped at
# MAX_NUM_WORDS because the tokenizer was built with num_words=MAX_NUM_WORDS.
vocab_size = min(MAX_NUM_WORDS, len(word_index) + 1)

from sklearn.utils import class_weight

# Balanced per-class loss weights, later passed to Keras `model.fit`.
_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=_classes, y=y_train)
# Key each weight by the class label it was computed for rather than by its
# position in `class_weights`; the two only coincide when labels are 0..n-1.
class_weight_dict = {int(label): float(weight) for label, weight in zip(_classes, class_weights)}

def compute_metrics_from_probs(probs, y_true, threshold=0.5):
    """Threshold predicted probabilities and score the resulting binary labels.

    Args:
        probs: Array of predicted probabilities; must support ``>=`` and ``astype``.
        y_true: True binary labels aligned with ``probs``.
        threshold: Probabilities at or above this value are labelled 1.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    hard_labels = (probs >= threshold).astype(int)
    accuracy = accuracy_score(y_true, hard_labels)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_true, hard_labels, average='binary', zero_division=0
    )
    return accuracy, precision, recall, f_score

def train_and_evaluate_keras(model, X_tr, y_tr, X_te, y_te, name, epochs=EPOCHS, batch_size=BATCH_SIZE):
    """Fit a compiled Keras model with early stopping and score it on the test set.

    Training uses the module-level ``VALIDATION_SPLIT`` and ``class_weight_dict``.
    Early stopping watches ``val_loss`` with a patience of 2 and restores the
    best weights before the test predictions are made.

    Args:
        model: Compiled Keras model whose output is a single probability per row.
        X_tr, y_tr: Training inputs and labels.
        X_te, y_te: Test inputs and labels.
        name: Label used in the printed line and stored under ``'model'``.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size used during training.

    Returns:
        dict with keys 'model', 'accuracy', 'precision', 'recall', 'f1' and
        'history' (the object returned by ``model.fit``).
    """
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    fit_options = dict(
        validation_split=VALIDATION_SPLIT,
        epochs=epochs,
        batch_size=batch_size,
        class_weight=class_weight_dict,
        callbacks=[early_stop],
        verbose=1,
    )
    history = model.fit(X_tr, y_tr, **fit_options)

    # Flatten the (n, 1) sigmoid output into a 1-D probability vector.
    test_probs = model.predict(X_te, batch_size=128).ravel()
    accuracy, precision, recall, f_score = compute_metrics_from_probs(test_probs, y_te)
    print(f"{name} -> Acc: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f_score:.4f}")
    return {
        'model': name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f_score,
        'history': history,
    }

def build_mlp_avg(vocab_size, embedding_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH):
    """Build and compile an MLP over the average of learned word embeddings.

    Architecture: Embedding -> GlobalAveragePooling1D -> Dense(128, relu) ->
    Dropout(0.3) -> Dense(64, relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        input_length: Number of token ids per input sequence.

    Returns:
        A Keras model compiled with Adam (lr=1e-3), binary cross-entropy loss
        and an accuracy metric.
    """
    inp = layers.Input(shape=(input_length,))
    # The sequence length comes from the Input layer; Embedding's own
    # `input_length` argument is deprecated and is no longer passed.
    # NOTE(review): the embedding is not masked, so padded positions presumably
    # take part in the average -- consider mask_zero=True; confirm before changing results.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inp)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the averaged-embedding MLP and record its test metrics.
mlp_model = build_mlp_avg(vocab_size)
mlp_result = train_and_evaluate_keras(
    model=mlp_model,
    X_tr=X_train_pad,
    y_tr=y_train,
    X_te=X_test_pad,
    y_te=y_test,
    name="MLP-AverageEmb",
)
results.append(mlp_result)

def build_cnn1d(vocab_size, embedding_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH):
    """Build and compile a multi-kernel 1-D CNN text classifier.

    Three parallel Conv1D branches (kernel sizes 2, 3 and 4, 128 filters each)
    read the same embedded sequence. Each branch is max-pooled over time, the
    branches are concatenated, and the result goes through Dropout(0.4) ->
    Dense(128, relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        input_length: Number of token ids per input sequence.

    Returns:
        A Keras model compiled with Adam (lr=1e-3), binary cross-entropy loss
        and an accuracy metric.
    """
    inp = layers.Input(shape=(input_length,))
    # The sequence length comes from the Input layer; Embedding's own
    # `input_length` argument is deprecated and is no longer passed.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inp)
    convs = []
    for fsz in [2, 3, 4]:
        c = layers.Conv1D(filters=128, kernel_size=fsz, activation='relu')(x)
        c = layers.GlobalMaxPooling1D()(c)
        convs.append(c)
    x = layers.concatenate(convs)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the multi-kernel 1-D CNN and record its test metrics.
cnn_model = build_cnn1d(vocab_size)
cnn_result = train_and_evaluate_keras(
    model=cnn_model,
    X_tr=X_train_pad,
    y_tr=y_train,
    X_te=X_test_pad,
    y_te=y_test,
    name="CNN1D",
)
results.append(cnn_result)

def build_lstm(vocab_size, embedding_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH):
    """Build and compile a bidirectional-LSTM text classifier.

    Architecture: Embedding -> SpatialDropout1D(0.2) -> Bidirectional(LSTM(128))
    -> Dropout(0.3) -> Dense(64, relu) -> Dropout(0.25) -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        input_length: Number of token ids per input sequence.

    Returns:
        A Keras model compiled with Adam (lr=1e-3), binary cross-entropy loss
        and an accuracy metric.
    """
    inp = layers.Input(shape=(input_length,))
    # The sequence length comes from the Input layer; Embedding's own
    # `input_length` argument is deprecated and is no longer passed.
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inp)
    x = layers.SpatialDropout1D(0.2)(x)
    # return_sequences=False: only the final state of each direction is kept.
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.25)(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train the bidirectional LSTM and record its test metrics.
lstm_model = build_lstm(vocab_size)
lstm_result = train_and_evaluate_keras(
    model=lstm_model,
    X_tr=X_train_pad,
    y_tr=y_train,
    X_te=X_test_pad,
    y_te=y_test,
    name="BiLSTM",
)
results.append(lstm_result)

# Compare all models: print the metric table (best F1 first) and save it.
print("\n=== RESULTS SUMMARY ===")
res_df = pd.DataFrame(results)
summary_cols = ['model', 'accuracy', 'precision', 'recall', 'f1']
print(res_df[summary_cols].sort_values(by='f1', ascending=False).to_string(index=False))
# Write only the metric columns. The Keras rows also carry a 'history' entry
# (the object returned by model.fit), which would otherwise be dumped into the
# CSV as an object repr, with empty cells for the scikit-learn rows.
res_df[summary_cols].to_csv('model_comparison_results.csv', index=False)

Q1: First 3 resumes (raw):
                                              resume
0  John Doe\n• Skilled in Python, Java, SQL.\nWor...
1  Jane Smith - Experienced in project management...
2  Michael Brown\nProficient in C++, JavaScript •...

Check for noisy characters like \n, •, & symbols:
John Doe
• Skilled in Python, Java, SQL.
Worked on data analysis & ML projects.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [18]:
# Download the small English spaCy pipeline (en_core_web_sm) into this runtime.
# NOTE: the installer output advises restarting the runtime before the package
# can be loaded with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
