In [18]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from nltk.stem import PorterStemmer


# Data Acquisition

In [19]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [20]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Data Preprocessing

In [21]:
def clean_text(text):
    # Remove special characters, URLs, and HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    
    # Convert to lowercase
    cleaned_text = text.lower()
    
    # Remove stopwords and apply lemmatization
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    tokens = nltk.word_tokenize(cleaned_text)
    cleaned_text = " ".join([lemmatizer.lemmatize(token) for token in tokens if token not in stop_words])
    
    return cleaned_text

 # Apply preprocessing to train and test data
train_df['clean_text'] = train_df['text'].apply(preprocess_text)
test_df['clean_text'] = test_df['text'].apply(preprocess_text)

# Feature Extraction

In [22]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['clean_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_df['clean_text']).toarray()

y_train = train_df['target'].values


# Model Training and Selection

In [24]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [25]:
# Splitting data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

In [26]:
# Initialize models
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC()
}

# Model Evaluation&Interpretation and Application

In [33]:
# Train and evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    print(f"Model: {model_name}")
    
    # Evaluation Metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    # Print metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    
    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))
    
    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    
    print("\n")

    # Interpretation and Application
    print(f"Interpretation and Application of {model_name}:")
    
    # Example of interpretation based on performance
    if recall > 0.80:
        print(f"The {model_name} model has high recall ({recall:.2f}), meaning it effectively identifies most disaster tweets.")
    else:
        print(f"The {model_name} model needs improvement in recall ({recall:.2f}), as it misses identifying some disaster tweets.")
    
    # Discuss practical application
    print(f"This model's predictions could assist in real-time disaster monitoring by quickly identifying relevant tweets.")
    print(f"Its precision score ({precision:.2f}) indicates it accurately identifies disaster tweets without many false alarms.")
    
    print("\n")



Model: Naive Bayes
Accuracy: 0.8063
Precision: 0.8242
Recall: 0.6934
F1-score: 0.7531

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       874
           1       0.82      0.69      0.75       649

    accuracy                           0.81      1523
   macro avg       0.81      0.79      0.80      1523
weighted avg       0.81      0.81      0.80      1523

Confusion Matrix:
[[778  96]
 [199 450]]


Interpretation and Application of Naive Bayes:
The Naive Bayes model needs improvement in recall (0.69), as it misses identifying some disaster tweets.
This model's predictions could assist in real-time disaster monitoring by quickly identifying relevant tweets.
Its precision score (0.82) indicates it accurately identifies disaster tweets without many false alarms.


Model: Logistic Regression
Accuracy: 0.7925
Precision: 0.8044
Recall: 0.6780
F1-score: 0.7358

Classification Report:
              precision    recal