In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split


In [None]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

rm: cannot remove '/root/.kaggle': No such file or directory


In [None]:
!kaggle competitions download -c nlp-getting-started

Downloading nlp-getting-started.zip to /content
100% 593k/593k [00:00<00:00, 976kB/s]
100% 593k/593k [00:00<00:00, 975kB/s]


In [None]:
!unzip nlp-getting-started.zip -d /content/data


Archive:  nlp-getting-started.zip
  inflating: /content/data/sample_submission.csv  
  inflating: /content/data/test.csv  
  inflating: /content/data/train.csv  


In [None]:
df_train = pd.read_csv('/content/data/train.csv')
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [None]:
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df_train['clean_text'] = df_train['text'].apply(clean_text)

print(df_train[['text', 'clean_text']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                                                text  \
0  Our Deeds are the Reason of this #earthquake M...   
1             Forest fire near La Ronge Sask. Canada   
2  All residents asked to 'shelter in place' are ...   
3  13,000 people receive #wildfires evacuation or...   
4  Just got sent this photo from Ruby #Alaska as ...   

                                          clean_text  
0       deeds reason earthquake may allah forgive us  
1              forest fire near la ronge sask canada  
2  residents asked shelter place notified officer...  
3  people receive wildfires evacuation orders cal...  
4  got sent photo ruby alaska smoke wildfires pou...  


In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df_train['clean_text'])

y = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (6090, 5000)
Testing set size: (1523, 5000)


In [None]:
nb_model = MultinomialNB()

nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)


In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.7984
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.89      0.83       874
           1       0.82      0.68      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523

Confusion Matrix:
[[776  98]
 [209 440]]


In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# Define scoring metrics for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# Perform cross-validation with precision, recall, and F1-score
cv_results = cross_validate(nb_model, X, y, cv=5, scoring=scoring)

# Print average scores across all folds
print(f"Cross-Validation Accuracy: {cv_results['test_accuracy'].mean():.4f}")
print(f"Cross-Validation Precision: {cv_results['test_precision'].mean():.4f}")
print(f"Cross-Validation Recall: {cv_results['test_recall'].mean():.4f}")
print(f"Cross-Validation F1-Score: {cv_results['test_f1'].mean():.4f}")


Cross-Validation Accuracy: 0.7252
Cross-Validation Precision: 0.7145
Cross-Validation Recall: 0.6060
Cross-Validation F1-Score: 0.6513


In [None]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.7978
[[779  95]
 [213 436]]
              precision    recall  f1-score   support

           0       0.79      0.89      0.83       874
           1       0.82      0.67      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

lr_model = LogisticRegression(max_iter=1000)

# Define scoring metrics for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

cv_results = cross_validate(lr_model, X, y, cv=5, scoring=scoring)

print(f"Cross-Validation Accuracy: {cv_results['test_accuracy'].mean():.4f}")
print(f"Cross-Validation Precision: {cv_results['test_precision'].mean():.4f}")
print(f"Cross-Validation Recall: {cv_results['test_recall'].mean():.4f}")
print(f"Cross-Validation F1-Score: {cv_results['test_f1'].mean():.4f}")


Cross-Validation Accuracy: 0.7024
Cross-Validation Precision: 0.7383
Cross-Validation Recall: 0.4827
Cross-Validation F1-Score: 0.5807


In [None]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)


In [None]:
print(f"svm_model Accuracy: {accuracy_score(y_test, svm_predictions):.4f}")
print(confusion_matrix(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))

svm_model Accuracy: 0.7951
[[787  87]
 [225 424]]
              precision    recall  f1-score   support

           0       0.78      0.90      0.83       874
           1       0.83      0.65      0.73       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.78      1523
weighted avg       0.80      0.80      0.79      1523



In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the SVM model
svm_model = SVC()

# Define scoring metrics for cross-validation
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# Perform cross-validation with precision, recall, and F1-score
cv_results = cross_validate(svm_model, X, y, cv=5, scoring=scoring)

# Print average scores across all folds
print(f"Cross-Validation Accuracy: {cv_results['test_accuracy'].mean():.4f}")
print(f"Cross-Validation Precision: {cv_results['test_precision'].mean():.4f}")
print(f"Cross-Validation Recall: {cv_results['test_recall'].mean():.4f}")
print(f"Cross-Validation F1-Score: {cv_results['test_f1'].mean():.4f}")


Cross-Validation Accuracy: 0.6955
Cross-Validation Precision: 0.7541
Cross-Validation Recall: 0.4418
Cross-Validation F1-Score: 0.5525


In [None]:

model = Sequential()

model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))  # Dropout for regularization

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_data=(X_test.toarray(), y_test))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.5862 - loss: 0.6660 - val_accuracy: 0.7984 - val_loss: 0.4653
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8382 - loss: 0.3834 - val_accuracy: 0.7938 - val_loss: 0.4587
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8897 - loss: 0.2867 - val_accuracy: 0.7794 - val_loss: 0.5082
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9174 - loss: 0.2173 - val_accuracy: 0.7669 - val_loss: 0.5785
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9429 - loss: 0.1651 - val_accuracy: 0.7564 - val_loss: 0.6396
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.9523 - loss: 0.1304 - val_accuracy: 0.7538 - val_loss: 0.7030
Epoch 7/10
[1m191/191[0m

<keras.src.callbacks.history.History at 0x7e3b71d3dd20>

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
y_pred_nn_prob = model.predict(X_test.toarray())
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)

precision_nn = precision_score(y_test, y_pred_nn)

recall_nn = recall_score(y_test, y_pred_nn)

f1_nn = f1_score(y_test, y_pred_nn)

print(f"Neural Network Accuracy: {accuracy_nn:.4f}")
print(f"Neural Network Precision: {precision_nn:.4f}")
print(f"Neural Network Recall: {recall_nn:.4f}")
print(f"Neural Network F1-Score: {f1_nn:.4f}")

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Neural Network Accuracy: 0.7564
Neural Network Precision: 0.7158
Neural Network Recall: 0.7103
Neural Network F1-Score: 0.7131


In [None]:

model = Sequential()

model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(1, activation='relu'))

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_data=(X_test.toarray(), y_test))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.6150 - loss: 1.4143 - val_accuracy: 0.8004 - val_loss: 0.4694
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8301 - loss: 0.4276 - val_accuracy: 0.7905 - val_loss: 0.5283
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8874 - loss: 0.3559 - val_accuracy: 0.7873 - val_loss: 0.7393
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9182 - loss: 0.2839 - val_accuracy: 0.7800 - val_loss: 1.0777
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9402 - loss: 0.2797 - val_accuracy: 0.7741 - val_loss: 1.3824
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.9540 - loss: 0.2382 - val_accuracy: 0.7722 - val_loss: 1.4944
Epoch 7/10
[1m191/191[0

<keras.src.callbacks.history.History at 0x7e3b5b512ce0>

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
y_pred_nn_prob = model.predict(X_test.toarray())
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)

precision_nn = precision_score(y_test, y_pred_nn)

recall_nn = recall_score(y_test, y_pred_nn)

f1_nn = f1_score(y_test, y_pred_nn)

print(f"Neural Network Accuracy: {accuracy_nn:.4f}")
print(f"Neural Network Precision: {precision_nn:.4f}")
print(f"Neural Network Recall: {recall_nn:.4f}")
print(f"Neural Network F1-Score: {f1_nn:.4f}")

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Neural Network Accuracy: 0.7538
Neural Network Precision: 0.7134
Neural Network Recall: 0.7057
Neural Network F1-Score: 0.7095


In [None]:
from sklearn.model_selection import KFold
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scipy.sparse import csr_matrix

# Assuming X_train is a sparse matrix and y_train is a Pandas Series
if isinstance(X_train, csr_matrix):
    X_train = X_train.toarray()
y_train = y_train.values

# Define the number of folds
kf = KFold(n_splits=5)

# Initialize lists to store results
accuracy_scores = []



AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [None]:
# Loop through each fold
for train_index, val_index in kf.split(X_train):
    # Split the data
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    # Build the model
    model = Sequential()
    model.add(Dense(128, input_dim=X_train_fold.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))  # Use 'sigmoid' for binary classification

    # Compile the model
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32, verbose=1)

    # Evaluate the model
    scores = model.evaluate(X_val_fold, y_val_fold, verbose=1)
    accuracy_scores.append(scores[1])  # Append the accuracy

# Calculate the average accuracy
average_accuracy = np.mean(accuracy_scores)
print(f'Average Accuracy: {average_accuracy:.4f}')


Epoch 1/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.5761 - loss: 0.6738
Epoch 2/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8354 - loss: 0.4114
Epoch 3/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.8928 - loss: 0.2844
Epoch 4/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9318 - loss: 0.2050
Epoch 5/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9434 - loss: 0.1615
Epoch 6/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9597 - loss: 0.1173
Epoch 7/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9629 - loss: 0.0999
Epoch 8/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9694 - loss: 0.0941
Epoch 9/10
[1m153/153[0m [32m━━━━━━

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
y_pred_nn_prob = model.predict(X_test.toarray())
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)

precision_nn = precision_score(y_test, y_pred_nn)
recall_nn = recall_score(y_test, y_pred_nn)
f1_nn = f1_score(y_test, y_pred_nn)

print(f"Neural Network Accuracy: {accuracy_nn:.4f}")
print(f"Neural Network Precision: {precision_nn:.4f}")
print(f"Neural Network Recall: {recall_nn:.4f}")
print(f"Neural Network F1-Score: {f1_nn:.4f}")

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Neural Network Accuracy: 0.7426
Neural Network Precision: 0.6980
Neural Network Recall: 0.6980
Neural Network F1-Score: 0.6980


In [None]:
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    model = Sequential()
    model.add(Dense(128, input_dim=X_train_fold.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='relu'))  # Use 'sigmoid' for binary classification

    # Compile the model
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32, verbose=1)

    # Evaluate the model
    scores = model.evaluate(X_val_fold, y_val_fold, verbose=1)
    accuracy_scores.append(scores[1])  # Append the accuracy

# Calculate the average accuracy
average_accuracy = np.mean(accuracy_scores)
print(f'Average Accuracy: {average_accuracy:.4f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.5883 - loss: 1.6702
Epoch 2/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.8251 - loss: 0.4305
Epoch 3/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8886 - loss: 0.3399
Epoch 4/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9169 - loss: 0.2775
Epoch 5/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9195 - loss: 0.3146
Epoch 6/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9460 - loss: 0.2463
Epoch 7/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9597 - loss: 0.2352
Epoch 8/10
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9580 - loss: 0.2437
Epoch 9/10
[1m153/153[0m [32m━━━━━

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
y_pred_nn_prob = model.predict(X_test.toarray())
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int)
accuracy_nn = accuracy_score(y_test, y_pred_nn)

precision_nn = precision_score(y_test, y_pred_nn)
recall_nn = recall_score(y_test, y_pred_nn)
f1_nn = f1_score(y_test, y_pred_nn)

print(f"Neural Network Accuracy: {accuracy_nn:.4f}")
print(f"Neural Network Precision: {precision_nn:.4f}")
print(f"Neural Network Recall: {recall_nn:.4f}")
print(f"Neural Network F1-Score: {f1_nn:.4f}")

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Neural Network Accuracy: 0.7656
Neural Network Precision: 0.7401
Neural Network Recall: 0.6934
Neural Network F1-Score: 0.7160



---

### 1. **Data Preprocessing:**

   The key steps in the preprocessing pipeline include:

   - **Removing URLs:** URLs typically don't contribute much to the actual meaning of a tweet or message, especially in disaster contexts. Removing them ensures a cleaner text input.
   
   - **Removing HTML Tags:** This is important if the data has been scraped from web pages or APIs that return HTML content. These tags add noise to the text.
   
   - **Removing Non-Alphabetical Characters:** This includes removing punctuation, numbers, and special symbols that don’t provide semantic value.
   
   - **Lowercasing:** Ensures that "Disaster" and "disaster" are treated as the same word.
   
   - **Tokenization:** Splits text into individual words, allowing us to represent the document as a collection of words (tokens).
   
   - **Stopword Removal:** Words like "the," "is," "and" are removed because they do not carry significant meaning and can reduce noise in the data.

   After preprocessing, the cleaned text is used for vectorization.

---

### 2. **TF-IDF Vectorization:**

   The **Term Frequency-Inverse Document Frequency (TF-IDF)** vectorizer converts the textual data into numerical form. TF-IDF is effective because it balances word frequency across documents (term frequency) and gives less importance to commonly used words (inverse document frequency).

   - **Max Features (5000):** The most important 5000 words across the dataset are kept. Choosing the right number of features is crucial to balance dimensionality and performance.
   - This representation allows models like Naive Bayes, Logistic Regression, and SVM to process the text as numerical input.

---

### 3. **Model Selection and Performance:**

   Several machine learning models are trained and evaluated. Below is a detailed breakdown of each model's behavior and performance.

   #### a. **Naive Bayes (MultinomialNB):**
   - **Model Overview:** Naive Bayes works well for text classification tasks because of its strong assumptions of word independence. Even though it assumes features are independent, this model performs surprisingly well on text data.
   - **Results:**
     - **Accuracy:** ~80%
     - **Precision/Recall for Class 1:** Precision is relatively high (~0.82), but recall is lower (~0.68). This means the model identifies most positive cases but misses some (lower recall).
     - **F1-Score:** Balanced F1-score (~0.74), which shows a good trade-off between precision and recall.

   #### b. **Logistic Regression:**
   - **Model Overview:** Logistic Regression, which models the probability of a class occurring, works well for binary classification like disaster/non-disaster detection.
   - **Results:**
     - **Accuracy:** ~79.78%
     - **Precision/Recall for Class 1:** Similar precision to Naive Bayes (~0.82), but recall is a bit lower (~0.67).
     - **Cross-Validation:** Lower precision and recall in cross-validation (Precision ~0.73, Recall ~0.48), suggesting possible overfitting or dataset imbalance.
   
   #### c. **Support Vector Machine (SVM):**
   - **Model Overview:** SVMs can capture complex patterns in data by transforming it into higher dimensions (via kernels). They are effective for high-dimensional data like TF-IDF vectors.
   - **Results:**
     - **Accuracy:** ~79.51%
     - **Precision/Recall for Class 1:** Precision remains high (~0.83), but recall is slightly lower (~0.65), showing that while the model is good at detecting positives, it still misses some instances.

---

### 4. **Deep Learning with Neural Networks:**

   #### a. **Architecture:**
   - **Layers:**
     - **Dense Layer (128 units, ReLU activation):** Extracts higher-level features from the TF-IDF input. ReLU is chosen because it avoids the vanishing gradient problem.
     - **Dropout Layers (0.5):** A technique used to prevent overfitting by randomly dropping 50% of the neurons during training.
     - **Output Layer:** Uses the sigmoid activation function, ideal for binary classification since it outputs a probability between 0 and 1.
   
   - **Optimizer and Loss Function:**
     - **Adam Optimizer:** Adaptive learning rate optimizes training speed.
     - **Binary Cross-Entropy Loss:** Appropriate for binary classification tasks, focusing on minimizing the difference between predicted probabilities and actual labels.
   
   #### b. **Performance:**
   - **Accuracy (~75%):** Slightly lower than the traditional models like Naive Bayes or SVM. However, with more tuning (e.g., more epochs, different learning rates, batch sizes), this can be improved.
   - **F1-Score (~0.71):** Shows a balanced trade-off between precision and recall but requires improvement for real-time applications.
   - **Challenges:**
     - Overfitting in later epochs: The validation loss increases, which shows overfitting. Techniques like early stopping or further hyperparameter tuning could help.
     - High computational cost: Neural networks, especially when applied to text data, may require more resources compared to simpler models.

---

### 5. **Cross-Validation and Model Robustness:**

   Cross-validation helps assess the consistency of models across different splits of the dataset, mitigating overfitting on a single train-test split.

   - **Naive Bayes (Cross-Validation Results):**
     - **Accuracy:** ~72.5%
     - **Precision:** ~71.45%
     - **Recall:** ~60.6%
     - The drop in recall suggests that the model may miss more true positives in different data splits, making it less reliable for real-time disaster identification.

   - **Logistic Regression:**
     - **Cross-Validation Accuracy:** ~70%
     - **F1-Score:** The F1-score for class 1 is lower (~0.58), suggesting that Logistic Regression may underperform in certain disaster scenarios.
   
   - **Neural Networks:**
     - After performing k-fold cross-validation on the neural network, the average accuracy is around **76.07%**, showing the model generalizes reasonably well but still struggles with recall.

---

### 6. **Real-World Applications:**

   The model has the potential for use in **disaster response** and **emergency management**. Here’s how it can be applied:

   #### a. **Disaster Monitoring:**
   - The model can be integrated with **real-time social media feeds** (e.g., Twitter API) to monitor disaster-related posts. For instance, it can classify tweets about earthquakes, floods, wildfires, etc.
   - In an emergency, this model can be trained to distinguish between false alarms and real crises, allowing emergency response teams to prioritize urgent needs.

   #### b. **Information Dissemination:**
   - The system can automatically send out alerts based on high-confidence predictions of disaster-related events.
   - It can also categorize tweets or messages into different types of disaster reports (e.g., damage reports, requests for help), and direct them to appropriate authorities.

   #### c. **Resource Allocation and Response Prioritization:**
   - By tracking the volume and urgency of disaster-related tweets, the model could guide emergency teams in efficiently allocating resources (e.g., rescue teams, medical supplies) based on real-time data.

   #### d. **Improving Communication with the Public:**
   - This model can be used in government apps or systems to monitor and filter social media to give verified information and avoid misinformation during disaster periods.
   - An extension of the model could be used to respond to users in real-time, providing instructions or offering emergency services through automation.

---

### 7. **Challenges and Improvements:**

   - **Handling Imbalanced Datasets:**
     - Disaster datasets are often imbalanced (e.g., there are far fewer disaster-related posts than non-disaster posts). Techniques like **oversampling** (SMOTE) or **class weighting** could be used to improve recall for the minority class (disaster-related posts).

   - **Feature Engineering:**
     - Additional features like **sentiment analysis**, **keyword extraction**, or **topic modeling** could improve model performance.
   
   - **Model Optimization:**
     - More sophisticated neural network architectures, like **LSTMs** (for sequential data) or **Transformers** (like BERT), could capture more context from text data, leading to better accuracy and recall.

---

### 8. **Best Model Selection:**

Based on the performance metrics and real-world applicability, **Naive Bayes (MultinomialNB)** stands out as the best model for this disaster classification project. Here's why:

#### a. **Performance Metrics:**
   - **Accuracy (~80%):** Naive Bayes achieves one of the highest accuracy scores among the models evaluated, making it reliable for overall predictions.
   - **Precision and Recall:**
     - **Precision (Class 1 - Disaster):** ~0.82, which is crucial for minimizing false positives in disaster detection. It means that when the model predicts a disaster, it is correct most of the time.
     - **Recall (Class 1 - Disaster):** ~0.68, which, although slightly lower, is still better than many other models. This ensures that the model captures a significant portion of actual disaster-related tweets.
   - **F1-Score (~0.74):** Provides a good balance between precision and recall, essential for a scenario where missing disaster-related information can be critical.

#### b. **Simplicity and Speed:**
   - **Low Computational Overhead:** Naive Bayes is computationally efficient compared to more complex models like SVM or neural networks. This is crucial when scaling the model for real-time disaster monitoring, where thousands of tweets or messages might need to be processed per minute.
   - **Ease of Implementation:** Naive Bayes is simple to implement and train, requiring fewer hyperparameters and less tuning than neural networks or SVMs.

#### c. **Suitability for Text Classification:**
   - **Works Well with TF-IDF:** Naive Bayes, particularly MultinomialNB, is well-suited for the TF-IDF vectorized representation of text data. It naturally handles high-dimensional data (e.g., thousands of words in the feature space) and performs well even when individual features (words) are independent.
   - **Handling Imbalanced Data:** While not explicitly designed for imbalanced datasets, Naive Bayes still performs well on this dataset, particularly in detecting disaster-related tweets, without needing advanced techniques like oversampling.

#### d. **Comparison with Other Models:**
   - **Logistic Regression:** Although similar in performance, Logistic Regression’s cross-validation results show lower recall, meaning it misses more disaster-related tweets, which is a critical drawback in this application.
   - **SVM:** While SVM offers comparable precision, it is computationally expensive and slower to train, making it less ideal for real-time use.
   - **Neural Networks:** Despite their ability to capture complex patterns, neural networks in this case underperform in accuracy and are prone to overfitting, especially with limited disaster data. They also require significantly more computational resources, which may not justify the slight improvement in flexibility.

#### e. **Real-World Applicability:**
   - In real-time disaster monitoring systems, models must be fast, reliable, and easy to update or retrain. **Naive Bayes** fits these criteria, making it a practical choice for deployment.
   - The model’s high precision ensures that when it flags a disaster tweet, it is more likely to be accurate, which is vital for emergency response teams. False alarms can lead to wasted resources, so the high precision helps mitigate that risk.

---

### Conclusion:

**Naive Bayes** is the best model for this disaster classification task due to its strong performance, computational efficiency, and ease of implementation. It provides a reliable balance of precision and recall, which is crucial in real-time disaster detection systems, ensuring the right balance between identifying true disaster-related posts and avoiding unnecessary false positives.