# Importing Libraries

In [70]:
import pandas as pd
import numpy as np
import os
import re
!pip install contractions
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



# Loading Data 

In [71]:
os.chdir('C:\\Users\\Avita\\OneDrive\\Desktop\\Capstone Project\\Disaster Tweets (NLP)')

In [72]:
df=pd.read_csv('tweets.csv')

In [73]:
df

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
...,...,...,...,...,...
11365,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0
11366,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0
11367,11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0
11368,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0


# EDA

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11370 entries, 0 to 11369
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        11370 non-null  int64 
 1   keyword   11370 non-null  object
 2   location  7952 non-null   object
 3   text      11370 non-null  object
 4   target    11370 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 444.3+ KB


In [75]:
df.dtypes

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

In [76]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [77]:
df['keyword'].value_counts()

keyword
thunderstorm     93
flattened        88
mass%20murder    86
stretcher        86
drown            83
                 ..
electrocuted     16
rainstorm        11
siren            10
deluged          10
tsunami           6
Name: count, Length: 219, dtype: int64

In [78]:
df['target'].value_counts()

target
0    9256
1    2114
Name: count, dtype: int64

In [79]:
df['target'].value_counts()/len(df)*100

target
0    81.407212
1    18.592788
Name: count, dtype: float64

In [80]:
### Since dataset is highly unbalanced so lets do downsampling of majority class

In [81]:
class_0=df[df['target']==0]

In [82]:
class_1=df[df['target']==1]

In [83]:
from sklearn.utils import resample

In [84]:
class_0_downsampled = resample(class_0,replace=False,n_samples=2500,random_state=65)

In [85]:
df_balanced = pd.concat([class_0_downsampled, class_1])

In [86]:
df=df_balanced

In [87]:
df

Unnamed: 0,id,keyword,location,text,target
2667,2667,crash,"London, England",Canada's Trudeau: Iran plane crash victims wou...,0
5002,5002,epicentre,"Penrith, Sydney",Just say it. Australia is run by a propaganda ...,0
3275,3275,debris,"Muirkirk, Scotland",Watch out for debris on the roads today 🍃🍃 htt...,0
7491,7491,military,,"Why is this more important, you ask? Because h...",0
3770,3770,derailment,"Lowca, Cumbria (in spirit)",Deliberately got an early train this morning o...,0
...,...,...,...,...,...
11338,11338,wrecked,Puchong,Kesian ular. We have wrecked their natural hab...,1
11354,11354,wrecked,,"Yeah, proper Liverpool fans wrecked Man City’s...",1
11355,11355,wrecked,Recife,"""Trump and Sisi 'rejected foreign exploitation...",1
11359,11359,wrecked,"Washington, DC","""Trump and Sisi 'rejected foreign exploitation...",1


In [88]:
df=df[['text','target']]

In [89]:
df.head()

Unnamed: 0,text,target
2667,Canada's Trudeau: Iran plane crash victims wou...,0
5002,Just say it. Australia is run by a propaganda ...,0
3275,Watch out for debris on the roads today 🍃🍃 htt...,0
7491,"Why is this more important, you ask? Because h...",0
3770,Deliberately got an early train this morning o...,0


# Data Preprocessing 

In [90]:
# Initialize NLP tools
ps = PorterStemmer()
lm = WordNetLemmatizer()
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('down')
stop_words.remove('not')


# 1. Convert to lowercase and strip whitespaces
def convert_lower(text):
    return text.lower().strip()

# 2. Expand contractions (like can't → cannot)
def expand_contractions(text):
    return contractions.fix(text)

# 3. Remove special characters, usernames, URLs, numbers
def remove_special_characters_and_numbers(text):
    text = re.sub(r"http\S+|www\S+|https\S+", ' link ', text)      # Replace links
    text = re.sub(r"@\w+", "", text)                               # Remove @mentions
    text = re.sub(r"[^a-zA-Z\s]", "", text)                        # Keep only alphabets
    return text

# 4. Remove extra white spaces
def remove_extra_spaces(text):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# 5. Stemming
def stemming(text):
    return ' '.join([ps.stem(word) for word in text.split()])

# 6. Lemmatization 
def lemmatize(text):
    return ' '.join([lm.lemmatize(word, pos='v') for word in text.split()])

# 7. Stopword Removal
def stopword_removal(text):
    return ' '.join([word for word in text.split() if word not in stop_words])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Avita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
df.head()

Unnamed: 0,text,target
2667,Canada's Trudeau: Iran plane crash victims wou...,0
5002,Just say it. Australia is run by a propaganda ...,0
3275,Watch out for debris on the roads today 🍃🍃 htt...,0
7491,"Why is this more important, you ask? Because h...",0
3770,Deliberately got an early train this morning o...,0


In [92]:
def clean_text_pipeline(text):
    text = convert_lower(text)
    text = expand_contractions(text)
    text = remove_special_characters_and_numbers(text)
    text = remove_extra_spaces(text)
    text = lemmatize(text)           
    text = stopword_removal(text)
    return text

In [93]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Avita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
df['clean_text'] = df['text'].apply(clean_text_pipeline)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['text'].apply(clean_text_pipeline)


In [95]:
df

Unnamed: 0,text,target,clean_text
2667,Canada's Trudeau: Iran plane crash victims wou...,0,canadas trudeau iran plane crash victims would...
5002,Just say it. Australia is run by a propaganda ...,0,say australia run propaganda mill murdoch lead...
3275,Watch out for debris on the roads today 🍃🍃 htt...,0,watch debris roads today link
7491,"Why is this more important, you ask? Because h...",0,important ask need military side selfinterest ...
3770,Deliberately got an early train this morning o...,0,deliberately get early train morning derailmen...
...,...,...,...
11338,Kesian ular. We have wrecked their natural hab...,1,kesian ular wreck natural habitat
11354,"Yeah, proper Liverpool fans wrecked Man City’s...",1,yeah proper liverpool fan wreck man citys bus ...
11355,"""Trump and Sisi 'rejected foreign exploitation...",1,trump sisi reject foreign exploitation agree p...
11359,"""Trump and Sisi 'rejected foreign exploitation...",1,trump sisi reject foreign exploitation agree p...


In [96]:
df['text'].values[:15]

array(["Canada's Trudeau: Iran plane crash victims would be alive had there been no tension in region https://t.co/RaQoyLebSI https://…",
       'Just say it. Australia is run by a propaganda mill in Murdoch. All 3 ‘leaders’ of the UK, US &amp; Aus were…',
       'Watch out for debris on the roads today 🍃🍃 https://t.co/kePfDBWx7w',
       'Why is this more important, you ask? Because he needs the military to be on his side. Self-interest above all nga eh. Is…',
       'Deliberately got an early train this morning only for there to be a derailment and significant delays https://t.co/i1THHBc1Tq',
       'DM us for more informations, also we encourage everyone who are going to attend ASAP this sunday to donate in any kinds. L…',
       '“Family. Back together again.” Watch this brand-new special look at Marvel Studios’ #BlackWidow, in theaters May 1. http…',
       'This guy is going to steal your girlfriend and obliterate you in the toplane. https://t.co/1KKgxjQg53',
       'The IMF obli

In [97]:
df['clean_text'].values[:15]

array(['canadas trudeau iran plane crash victims would alive tension region link link',
       'say australia run propaganda mill murdoch leaders uk us amp aus',
       'watch debris roads today link',
       'important ask need military side selfinterest nga eh',
       'deliberately get early train morning derailment significant delay link',
       'dm us informations also encourage everyone go attend soon possible sunday donate kinds l',
       'family back together watch brandnew special look marvel studios blackwidow theaters may link',
       'guy go steal girlfriend obliterate toplane link',
       'imf obliterate caribbean economies include put many farm relatives jamaica streets',
       'quarantine bite excessive',
       'flotus wear rain coat close stadium code storm upon us link',
       'explain crz rule demolish maradu flats violate explain newsthe indian express link',
       'mean least emergency alert say radiat',
       'thank kelly rescuers harrow task',
       'mak

In [98]:
### text cleaned 

# Word Embedding 

### TF-IDF + Logistic Regression -  For baseline

In [99]:
X = df['clean_text']
y = df['target']

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

In [101]:
# Initialize TF-IDF
tfidf = TfidfVectorizer(max_features=5000) 

In [102]:
# Fit and transform
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [103]:
# Train a Logistic Regression model
lg_model = LogisticRegression()
lg_model.fit(X_train_tfidf, y_train)

In [104]:
y_pred = lg_model.predict(X_test_tfidf)

In [105]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[422  64]
 [109 328]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83       486
           1       0.84      0.75      0.79       437

    accuracy                           0.81       923
   macro avg       0.82      0.81      0.81       923
weighted avg       0.81      0.81      0.81       923


Accuracy: 0.8125677139761647


### Fasttext Embedding + Logistic Regression - For improvement

In [108]:
import gensim
from gensim.models.fasttext import FastText

In [109]:
corpus = [text.split() for text in df['clean_text']]

In [110]:
ftm = FastText(corpus,vector_size=100)

In [111]:
def average_word_embedding(doc, feature_size):
    vector = np.zeros((feature_size,))
    word_list = doc.split(' ')
    total_words = 0
    
    for word in word_list:
        if word in ftm.wv:  #  Avoid KeyError
            vector += ftm.wv[word]
            total_words += 1

    if total_words > 0:
        vector = vector / total_words
    return vector

In [112]:
features_fasttext = []
for sent in df['clean_text']:
    embedding = average_word_embedding(sent, 100)  # match vector size
    features_fasttext.append(embedding)

In [113]:
features_df = pd.DataFrame(features_fasttext)

In [114]:
features_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.146876,0.01013,0.011063,0.138771,0.167831,0.31941,-0.104111,0.428848,0.498322,-0.443198,...,-0.633171,-0.351739,-0.367092,-0.141025,-0.528051,0.500689,0.494767,-0.262855,0.029571,0.601766
1,-0.135284,0.008123,0.010331,0.128102,0.154901,0.293633,-0.096546,0.395287,0.458326,-0.407281,...,-0.581237,-0.322548,-0.338518,-0.128952,-0.485461,0.460895,0.454226,-0.241602,0.026363,0.552298
2,-0.135105,0.008551,0.010425,0.12972,0.156561,0.294919,-0.097047,0.395481,0.460403,-0.408751,...,-0.584347,-0.325169,-0.338616,-0.130166,-0.488876,0.463281,0.457222,-0.241921,0.026979,0.55699
3,-0.09838,0.006558,0.006708,0.093248,0.113268,0.213956,-0.070765,0.287516,0.335072,-0.297401,...,-0.424952,-0.235578,-0.245804,-0.095488,-0.354933,0.335329,0.33059,-0.175936,0.019405,0.403925
4,-0.150398,0.011704,0.012381,0.143459,0.170835,0.324938,-0.107487,0.437896,0.508176,-0.450834,...,-0.644287,-0.358245,-0.3753,-0.142859,-0.539689,0.512344,0.503145,-0.269307,0.029429,0.610957


In [115]:
features_df.shape

(4614, 100)

In [116]:
X_train, X_test, y_train, y_test = train_test_split(features_df, df['target'], test_size=0.2, random_state=32)

In [117]:
lg_model = LogisticRegression(max_iter=1000,class_weight='balanced')
lg_model.fit(X_train, y_train)

In [118]:
y_pred = lg_model.predict(X_test)

In [119]:
print("Confusion Matrix:\n",confusion_matrix(y_test, y_pred))
print('\nClassification Report:\n',classification_report(y_test, y_pred))
print('\nAccuracy:',accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[265 234]
 [185 239]]

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.53      0.56       499
           1       0.51      0.56      0.53       424

    accuracy                           0.55       923
   macro avg       0.55      0.55      0.55       923
weighted avg       0.55      0.55      0.55       923


Accuracy: 0.5460455037919827


### TF-IDF + Random forest 

In [None]:
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=78)
rf_model.fit(X_train_tfidf, y_train)
y_pred = rf_model.predict(X_test_tfidf)

In [135]:
rf_model = RandomForestClassifier(n_estimators=100,class_weight='balanced', random_state=78)

In [136]:
rf_model.fit(X_train_tfidf, y_train)

In [137]:
y_pred = rf_model.predict(X_test_tfidf)

In [138]:
print("Confusion Matrix:\n",confusion_matrix(y_test, y_pred))
print('\nClassification Report:\n',classification_report(y_test, y_pred))
print('\nAccuracy:',accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[305 194]
 [271 153]]

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.61      0.57       499
           1       0.44      0.36      0.40       424

    accuracy                           0.50       923
   macro avg       0.49      0.49      0.48       923
weighted avg       0.49      0.50      0.49       923


Accuracy: 0.4962080173347779


### TF-IDF + XGBoost 

In [139]:
xgb_model = XGBClassifier(random_state=89)

In [140]:
xgb_model.fit(X_train_tfidf, y_train)

In [141]:
y_pred = xgb_model.predict(X_test_tfidf)

In [142]:
print("Confusion Matrix:\n",confusion_matrix(y_test, y_pred))
print('\nClassification Report:\n',classification_report(y_test, y_pred))
print('\nAccuracy:',accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[332 167]
 [282 142]]

Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.67      0.60       499
           1       0.46      0.33      0.39       424

    accuracy                           0.51       923
   macro avg       0.50      0.50      0.49       923
weighted avg       0.50      0.51      0.50       923


Accuracy: 0.5135427952329361


## Based on the above result our baseline model is giving us best result so far , so TF-IDF + logistic regression is selected.