# Natural Language Processing with Disaster Tweets

In [1]:
import pandas as pd

## Loading the Train and Test Datasets

In [2]:
# Read the train and test CSV files from the current working directory,
# then show the training frame's dimensions and its first rows.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
print(train_df.shape)
train_df.head(n=5)

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Count how many rows carry each value of the `target` label.
train_df.loc[:, 'target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [4]:
# Look at one raw tweet: the `text` value of the row labelled 234.
train_df.loc[234, 'text']

'@TomcatArts thus explaining why you were all annihilated. But the few or in this case you the only survivor evolved and became godlike'

In [5]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

Unnamed: 0,0
id,0
keyword,61
location,2533
text,0
target,0


### Data Preprocessing and Cleaning

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

def advanced_clean(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: strip URLs, strip @mentions and #hashtags (the whole
    token, including the word after ``#``), spell out emojis with
    ``emoji.demojize``, expand negative contractions, drop every character
    that is not an ASCII letter or whitespace, collapse runs of whitespace,
    lowercase and trim.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example
            ``None`` or a pandas ``NaN``) is treated as an empty tweet.

    Returns:
        The cleaned string; ``''`` when nothing survives cleaning.
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if not isinstance(text, str):
        return ''
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions/hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Convert emojis to text
    text = emoji.demojize(text, delimiters=(" ", " "))
    # Handle contractions. Typographic apostrophes are normalised first so
    # they expand too, and matching ignores case because lowercasing only
    # happens at the very end.
    text = text.replace('\u2019', "'")
    # Irregular forms go before the generic "n't" rule; otherwise "can't"
    # would become "ca not" and "won't" would become "wo not".
    text = re.sub(r"\bwon't\b", "will not", text, flags=re.IGNORECASE)
    text = re.sub(r"\bcan't\b", "cannot", text, flags=re.IGNORECASE)
    text = re.sub(r"\bshan't\b", "shall not", text, flags=re.IGNORECASE)
    text = re.sub(r"n't", " not", text, flags=re.IGNORECASE)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # The removals above leave gaps behind; squeeze them to single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Store a cleaned copy of every tweet in a new `cleaned_text` column;
# the raw `text` column is left untouched.
train_df['cleaned_text'] = train_df['text'].map(advanced_clean)

# BERT
#### The previous word2vec averages might lose contextual information
### Using transformer-based embeddings (BERT)

In [None]:
!pip install transformers

import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pre-trained tokenizer and encoder from the same checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

def get_bert_embeddings(text):
    """Encode ``text`` and return the hidden state at the first token position.

    Uses the module-level ``tokenizer`` and ``model``. The input is truncated
    and padded to exactly 128 tokens, the forward pass runs with gradient
    tracking disabled, and position 0 of ``last_hidden_state`` (the ``[CLS]``
    slot for a BERT tokenizer) is returned.

    Args:
        text: The text to embed, passed straight to ``tokenizer``.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. one row
        per sequence in the tokenized batch.
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_token = hidden_states[:, 0, :]
    return first_token.numpy()

# Embed each cleaned tweet individually and stack the per-tweet arrays
# into a single NumPy array.
bert_embeddings = np.array(list(map(get_bert_embeddings, train_df['cleaned_text'])))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Two alternative ways of dealing with the class imbalance.
import numpy as np

# a. SMOTE Oversampling
from imblearn.over_sampling import SMOTE

# SMOTE expects a 2-D (n_samples, n_features) matrix. The embeddings are
# stacked from one array per tweet, which leaves an extra axis, so flatten
# everything after the sample axis. This is a no-op if the array is already 2-D.
X_flat = bert_embeddings.reshape(len(bert_embeddings), -1)
smote = SMOTE(random_state=42)  # seeded so the synthetic samples are reproducible
X_res, y_res = smote.fit_resample(X_flat, train_df['target'])

# b. Class Weighting
from sklearn.utils.class_weight import compute_class_weight

# `classes` must be a NumPy array; recent scikit-learn releases reject a plain list.
class_weights = compute_class_weight(
    'balanced', classes=np.array([0, 1]), y=train_df['target']
)

## Model Architecture

In [None]:
from tensorflow.keras.layers import (
    LSTM,
    Attention,
    Bidirectional,
    Dense,
    GlobalAveragePooling1D,
    Layer,
)
from tensorflow.keras.models import Sequential


class SelfAttention(Layer):
    """Dot-product self-attention that can sit inside a ``Sequential`` stack.

    Keras' ``Attention`` layer must be called on a list ``[query, value]``,
    whereas ``Sequential`` hands every layer a single tensor. This wrapper
    passes the same tensor as both query and value.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.attention = Attention()

    def call(self, inputs):
        """Attend over ``inputs`` using it as both query and value."""
        return self.attention([inputs, inputs])


# BiLSTM -> self-attention -> pooled sequence vector -> binary classifier.
lstm_model = Sequential([
    Bidirectional(LSTM(128, return_sequences=True)),
    SelfAttention(),
    # Collapse the time axis so the network emits one probability per input
    # sequence rather than one per timestep.
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the original name bound as well: a later cell calls `model.add(...)`.
# Note that this rebinds `model`, which earlier held the BERT encoder used by
# get_bert_embeddings.
model = lstm_model

In [None]:
# `tf` is referenced below for the optimizer and the loss, so import it here.
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# BERT with a freshly initialised 2-class classification head.
bert_model = TFAutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

# The loss is configured with from_logits=True, i.e. it expects the model to
# output raw scores for the two labels rather than probabilities.
bert_model.compile(
    optimizer=tf.keras.optimizers.Adam(3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [None]:
# GradientBoostingClassifier is used below, so import it in this cell.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

# Seed both the estimator and the search so the sampled candidates and the
# fitted trees are the same on every run.
gb = GradientBoostingClassifier(random_state=42)
search = RandomizedSearchCV(gb, param_grid, cv=3, scoring='f1', random_state=42)
# NOTE(review): make sure X_train_2d (2-D feature matrix) and y_train are
# bound before this cell runs.
search.fit(X_train_2d, y_train)

In [None]:
# GradientBoostingClassifier is instantiated below, so import it alongside
# VotingClassifier.
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier

# Soft voting combines the members' predicted class probabilities.
# NOTE(review): VotingClassifier works with scikit-learn style estimators;
# confirm that `bert_model` and `lstm_model` are bound and wrapped in a
# compatible interface before calling fit on this ensemble.
voting_clf = VotingClassifier(
    estimators=[
        ('bert', bert_model),
        ('gbm', GradientBoostingClassifier()),
        ('lstm', lstm_model)
    ],
    voting='soft'
)

In [None]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# `model` already ends in its sigmoid output layer, so appending with add()
# alone would place the new layers after the output. Detach the output layer,
# add the regularised block, then put the output layer back on top.
# Running this cell again inserts another Dense + Dropout pair.
output_layer = model.layers[-1]
model.pop()
model.add(Dense(64, activation='relu',
               kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(output_layer)

# Recompile after changing the layer stack, with the same settings used when
# the model was first built.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
from sklearn.model_selection import StratifiedKFold

# Seed the shuffle so every run produces the same five stratified folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X, y):
    # Training logic