<a href="https://colab.research.google.com/github/BhanuDanda/NLP/blob/main/08-09-2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import string

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/tweets.csv")

X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])

print("\nSample training tweets:\n", X_train.head())
print("\nSample test tweets:\n", X_test.head())

Training set size: 9096
Test set size: 2274

Sample training tweets:
 10497    Had a dream (nightmare ) last night that I wor...
1462     There it is, on cue! Body Bags Burrow with the...
997      it's weird how ppl w vaginas literally spent d...
6550     17 This short urgent thread bcoz ALTIF BUKHARI...
11165    3. Tens of thousands of American soldiers died...
Name: text, dtype: object

Sample test tweets:
 611     #USASupportsTerrorist 15 out of the 19 terrori...
4178    V commented on a post on WeVerse about the eru...
6561    Listen to Danrell x Småland - Hostage by HIGH ...
3125    Light Yagami Light Yagami at the start by the ...
4186    Safe to the evil tyranny, but torture to the c...
Name: text, dtype: object


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("/content/tweets.csv")

X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

lr = LogisticRegression(max_iter=200)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8632365875109939
              precision    recall  f1-score   support

           0       0.87      0.99      0.92      1851
           1       0.84      0.33      0.47       423

    accuracy                           0.86      2274
   macro avg       0.85      0.66      0.70      2274
weighted avg       0.86      0.86      0.84      2274



In [6]:
svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)

print("\nSVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))



SVM Accuracy: 0.8773087071240105
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      1851
           1       0.72      0.57      0.63       423

    accuracy                           0.88      2274
   macro avg       0.81      0.76      0.78      2274
weighted avg       0.87      0.88      0.87      2274



In [10]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)

print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))



Random Forest Accuracy: 0.876429199648197
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      1851
           1       0.78      0.47      0.59       423

    accuracy                           0.88      2274
   macro avg       0.83      0.72      0.76      2274
weighted avg       0.87      0.88      0.86      2274



In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv("/content/tweets.csv")

X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "SVM (LinearSVC)": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

results = []

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append([name, acc, prec, rec, f1])

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score"])
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score
0  Logistic Regression  0.863237   0.837349  0.328605  0.471986
1      SVM (LinearSVC)  0.877309   0.715569  0.565012  0.631440
2        Random Forest  0.876429   0.775194  0.472813  0.587372


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("/content/tweets.csv")

X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)

lr = LogisticRegression(max_iter=200)
lr.fit(X_train_tfidf, y_train)

feature_names = vectorizer.get_feature_names_out()
coefficients = lr.coef_[0]

top10_idx = coefficients.argsort()[-10:][::-1]
top10_features = [(feature_names[i], coefficients[i]) for i in top10_idx]

print("Top 10 features (words) most indicative of disaster tweets:")
for word, coef in top10_features:
    print(f"{word:15} {coef:.4f}")


Top 10 features (words) most indicative of disaster tweets:
thunderstorm    4.0384
collision       3.3163
killed          3.1023
train           3.0805
volcano         2.8850
died            2.6671
road            2.6268
sinkhole        2.6088
windstorm       2.5516


In [7]:
cnn=Sequential([
                Embedding(input_dim=10000,output_dim=128,input_length=100),
                Conv1D(128,5,activation='relu'),
                GlobalMaxPooling1D(),
                Dense(64,activation='relu'),
                Dropout(0.3),
                Dense(1,activation='sigmoid')])
cnn.summary()




In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and split data (assuming df, X, y are already loaded)
df = pd.read_csv("/content/tweets.csv")
X = df['text']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Tokenize and pad sequences (code from cell iu9kp7VlvVfe)
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train) # Fit only on training data
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test) # Tokenize test data
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100) # Pad test data
vocab_size = len(tokenizer.word_index) + 1

# Define and train the LSTM model
lstm = Sequential([
    Embedding(vocab_size, 128, input_length=100),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=64)

Epoch 1/5




[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 239ms/step - accuracy: 0.8213 - loss: 0.4571 - val_accuracy: 0.8813 - val_loss: 0.2954
Epoch 2/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 241ms/step - accuracy: 0.9303 - loss: 0.2052 - val_accuracy: 0.8839 - val_loss: 0.3083
Epoch 3/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 248ms/step - accuracy: 0.9620 - loss: 0.1134 - val_accuracy: 0.8804 - val_loss: 0.3283
Epoch 4/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 228ms/step - accuracy: 0.9790 - loss: 0.0697 - val_accuracy: 0.8835 - val_loss: 0.4715
Epoch 5/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 249ms/step - accuracy: 0.9859 - loss: 0.0441 - val_accuracy: 0.8751 - val_loss: 0.5203


<keras.src.callbacks.history.History at 0x7e49a4287d40>