<a href="https://colab.research.google.com/github/BhanuDanda/NLP/blob/main/01-09-2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
df = pd.read_csv("/content/tweets.csv")
print(df.head())


   id keyword        location  \
0   0  ablaze             NaN   
1   1  ablaze             NaN   
2   2  ablaze   New York City   
3   3  ablaze  Morgantown, WV   
4   4  ablaze             NaN   

                                                text  target  
0  Communal violence in Bhainsa, Telangana. "Ston...       1  
1  Telangana: Section 144 has been imposed in Bha...       1  
2  Arsonist sets cars ablaze at dealership https:...       1  
3  Arsonist sets cars ablaze at dealership https:...       1  
4  "Lord Jesus, your love brings freedom and pard...       0  


In [14]:
import re
import string

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\\S+|www\\S+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

df["clean_text"] = df["text"].apply(clean_text)
print(df["clean_text"].head())


0    communal violence in bhainsa telangana stones ...
1    telangana section 144 has been imposed in bhai...
2    arsonist sets cars ablaze at dealership httpst...
3    arsonist sets cars ablaze at dealership httpst...
4    lord jesus your love brings freedom and pardon...
Name: clean_text, dtype: object


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

df["clean_text"] = df["clean_text"].fillna("")

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df["clean_text"].astype(str))
y = df["target"]

print("TF-IDF shape:", X.shape)


TF-IDF shape: (11370, 5000)


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape)
print( "Test shape:", X_test.shape)

Train shape: (9096, 5000)
Test shape: (2274, 5000)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\n{name}")
    print("Accuracy :", round(accuracy_score(y_test, preds), 3))
    print("Precision:", round(precision_score(y_test, preds), 3))
    print("Recall   :", round(recall_score(y_test, preds), 3))
    print("F1 Score :", round(f1_score(y_test, preds), 3))
    print(classification_report(y_test, preds, digits=3))



Logistic Regression
Accuracy : 0.885
Precision: 0.894
Recall   : 0.384
F1 Score : 0.537
              precision    recall  f1-score   support

           0      0.884     0.990     0.934      1878
           1      0.894     0.384     0.537       396

    accuracy                          0.885      2274
   macro avg      0.889     0.687     0.736      2274
weighted avg      0.886     0.885     0.865      2274


SVM
Accuracy : 0.895
Precision: 0.751
Recall   : 0.593
F1 Score : 0.663
              precision    recall  f1-score   support

           0      0.918     0.958     0.938      1878
           1      0.751     0.593     0.663       396

    accuracy                          0.895      2274
   macro avg      0.834     0.776     0.800      2274
weighted avg      0.889     0.895     0.890      2274


Random Forest
Accuracy : 0.892
Precision: 0.802
Recall   : 0.503
F1 Score : 0.618
              precision    recall  f1-score   support

           0      0.903     0.974     0.937   

In [23]:
feature_names = vectorizer.get_feature_names_out()

coefs = models["Logistic Regression"].coef_[0]

top10_idx = coefs.argsort()[-10:][::-1]

print("\nTop 10 words indicating disaster tweets:")
for i in top10_idx:
    print(feature_names[i], "->", round(coefs[i], 4))



Top 10 words indicating disaster tweets:
thunderstorm -> 3.9691
train -> 3.1854
killed -> 3.167
collision -> 3.0401
earthquake -> 2.7903
severe -> 2.7067
died -> 2.6482
sinkhole -> 2.5899
windstorm -> 2.5896
