In [1]:
# Importing the neccessary Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Loading the Dataset

train_df = pd.read_csv('Dataset/train.csv')
test_df = pd.read_csv('Dataset/train.csv')

In [3]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
test_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [7]:
import nltk
from nltk.corpus import stopwords
# Step 2: Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chizz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chizz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chizz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chizz\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chizz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Initialize Stemmer and Lemmatizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [9]:
# Step 3: Define text cleaning functions
import re
def clean_text(text):
    text = re.sub(r'http\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#', '', text)  # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetical characters
    text = text.lower()
    return text

In [10]:
# Apply Stemming, Lemmatization, Stopword Removal
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    text = clean_text(text)
    words = word_tokenize(text)
    # Apply Lemmatization and Stemming
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word not in stop_words]
    return ' '.join(words)

In [11]:
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

In [12]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak may allah forgiv u,1
1,4,,,forest fire near la rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,peopl receiv wildfir evacu order california,1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1
...,...,...,...,...,...
7608,10869,,,two giant crane hold bridg collaps nearbi home,1
7609,10870,,,control wild fire california even northern par...,1
7610,10871,,,utckm volcano hawaii,1
7611,10872,,,polic investig ebik collid car littl portug eb...,1


In [13]:
test_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquak may allah forgiv u,1
1,4,,,forest fire near la rong sask canada,1
2,5,,,resid ask shelter place notifi offic evacu she...,1
3,6,,,peopl receiv wildfir evacu order california,1
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1
...,...,...,...,...,...
7608,10869,,,two giant crane hold bridg collaps nearbi home,1
7609,10870,,,control wild fire california even northern par...,1
7610,10871,,,utckm volcano hawaii,1
7611,10872,,,polic investig ebik collid car littl portug eb...,1


In [14]:
# Step 4: N-grams (Unigrams, Bigrams, Trigrams)
from nltk.util import ngrams

def extract_ngrams(text, n=2):
    tokens = word_tokenize(text)
    n_grams = ngrams(tokens, n)
    return [' '.join(gram) for gram in n_grams]

In [15]:
# Example: Adding bigrams and trigrams as features
train_df['bigrams'] = train_df['text'].apply(lambda x: extract_ngrams(x, n=2))
train_df['trigrams'] = train_df['text'].apply(lambda x: extract_ngrams(x, n=3))

test_df['bigrams'] = test_df['text'].apply(lambda x: extract_ngrams(x, n=2))
test_df['trigrams'] = test_df['text'].apply(lambda x: extract_ngrams(x, n=3))

In [16]:
train_df

Unnamed: 0,id,keyword,location,text,target,bigrams,trigrams
0,1,,,deed reason earthquak may allah forgiv u,1,"[deed reason, reason earthquak, earthquak may,...","[deed reason earthquak, reason earthquak may, ..."
1,4,,,forest fire near la rong sask canada,1,"[forest fire, fire near, near la, la rong, ron...","[forest fire near, fire near la, near la rong,..."
2,5,,,resid ask shelter place notifi offic evacu she...,1,"[resid ask, ask shelter, shelter place, place ...","[resid ask shelter, ask shelter place, shelter..."
3,6,,,peopl receiv wildfir evacu order california,1,"[peopl receiv, receiv wildfir, wildfir evacu, ...","[peopl receiv wildfir, receiv wildfir evacu, w..."
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1,"[got sent, sent photo, photo rubi, rubi alaska...","[got sent photo, sent photo rubi, photo rubi a..."
...,...,...,...,...,...,...,...
7608,10869,,,two giant crane hold bridg collaps nearbi home,1,"[two giant, giant crane, crane hold, hold brid...","[two giant crane, giant crane hold, crane hold..."
7609,10870,,,control wild fire california even northern par...,1,"[control wild, wild fire, fire california, cal...","[control wild fire, wild fire california, fire..."
7610,10871,,,utckm volcano hawaii,1,"[utckm volcano, volcano hawaii]",[utckm volcano hawaii]
7611,10872,,,polic investig ebik collid car littl portug eb...,1,"[polic investig, investig ebik, ebik collid, c...","[polic investig ebik, investig ebik collid, eb..."


In [7]:
# Step 5: Data splitting

from sklearn.model_selection import train_test_split

X = train_df['text']
y = train_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
X_train,X_test

(4996    Courageous and honest analysis of need to use ...
 3263    @ZachZaidman @670TheScore wld b a shame if tha...
 4907    Tell @BarackObama to rescind medals of 'honor'...
 2855    Worried about how the CA drought might affect ...
 4716    @YoungHeroesID Lava Blast &amp; Power Red #Pan...
                               ...                        
 5226    @Eganator2000 There aren't many Obliteration s...
 5390    just had a panic attack bc I don't have enough...
 860     Omron HEM-712C Automatic Blood Pressure Monito...
 7603    Officials say a quarantine is in place at an A...
 7270    I moved to England five years ago today. What ...
 Name: text, Length: 6090, dtype: object,
 2644    So you have a new weapon that can cause un-ima...
 2227    The f$&amp;@ing things I do for #GISHWHES Just...
 5448    DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...
 132     Aftershock back to school kick off was great. ...
 6845    in response to trauma Children of Addicts deve...
              

In [9]:
# Step 6: TF-IDF with N-grams
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 3))  # Using bigrams and trigrams
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [13]:
# Step 7: Handle class imbalance using resampling
from sklearn.utils import resample

def balance_classes(df):
    df_majority = df[df['target'] == 1]
    df_minority = df[df['target'] == 0]
    df_minority_upsampled = resample(
        df_minority,
        replace=True,
        n_samples=len(df_majority),
        random_state=42
    )
    return pd.concat([df_majority, df_minority_upsampled])


df_upsampled = balance_classes(train_df)
X_upsampled = df_upsampled['text']
y_upsampled = df_upsampled['target']

In [14]:
X_upsampled

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
344     **OFFICIAL VID** #TheReal &gt;&gt;&gt; https:/...
4571    @jamienye u can't blame it all on coaching man...
7152    This LA Startup Is So Hot that Their Flowers C...
278     @HoneyBunzGem @primalkitchen I feel like me do...
5847    I understand you wanting to hang out with your...
Name: text, Length: 6542, dtype: object

In [15]:
X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(
    X_upsampled, y_upsampled, test_size=0.2, random_state=42)
X_train_tfidf_upsampled = tfidf.fit_transform(X_train_upsampled)
X_test_tfidf_upsampled = tfidf.transform(X_test_upsampled)

In [17]:
# Step 8: Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


model_lr = LogisticRegression(max_iter=200)
model_lr.fit(X_train_tfidf_upsampled, y_train_upsampled)
y_pred_lr = model_lr.predict(X_test_tfidf_upsampled)
# Evaluate Logistic Regression
print("Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test_upsampled, y_pred_lr):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test_upsampled, y_pred_lr)}")
print(f"Classification Report:\n{classification_report(y_test_upsampled, y_pred_lr)}")

Logistic Regression Model Evaluation:
Accuracy: 0.8105
Confusion Matrix:
[[532 104]
 [144 529]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       636
           1       0.84      0.79      0.81       673

    accuracy                           0.81      1309
   macro avg       0.81      0.81      0.81      1309
weighted avg       0.81      0.81      0.81      1309



In [18]:
from sklearn.svm import SVC

# Step 9: Support Vector Classifier (SVC)
model_svc = SVC(kernel='linear', random_state=42)
model_svc.fit(X_train_tfidf_upsampled, y_train_upsampled)
y_pred_svc = model_svc.predict(X_test_tfidf_upsampled)

# Evaluate SVC
print("Support Vector Classifier Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test_upsampled, y_pred_svc):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test_upsampled, y_pred_svc)}")
print(f"Classification Report:\n{classification_report(y_test_upsampled, y_pred_svc)}")

Support Vector Classifier Model Evaluation:
Accuracy: 0.7930
Confusion Matrix:
[[533 103]
 [168 505]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       636
           1       0.83      0.75      0.79       673

    accuracy                           0.79      1309
   macro avg       0.80      0.79      0.79      1309
weighted avg       0.80      0.79      0.79      1309



In [19]:
# Step 10: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_tfidf_upsampled, y_train_upsampled)
y_pred_rf = model_rf.predict(X_test_tfidf_upsampled)

# Evaluate Random Forest
print("Random Forest Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test_upsampled, y_pred_rf):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test_upsampled, y_pred_rf)}")
print(f"Classification Report:\n{classification_report(y_test_upsampled, y_pred_rf)}")

Random Forest Model Evaluation:
Accuracy: 0.8205
Confusion Matrix:
[[572  64]
 [171 502]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.90      0.83       636
           1       0.89      0.75      0.81       673

    accuracy                           0.82      1309
   macro avg       0.83      0.82      0.82      1309
weighted avg       0.83      0.82      0.82      1309



In [None]:
# Ritesh - For Model Evaluation

In [None]:



from nltk import pos_tag



from sklearn.pipeline import Pipeline
import gensim.downloader as api


In [None]:





























# Plot for each model
plot_confusion_matrix(y_test_upsampled, y_pred_lr, "Logistic Regression")
plot_confusion_matrix(y_test_upsampled, y_pred_svc, "Support Vector Classifier")
plot_confusion_matrix(y_test_upsampled, y_pred_rf, "Random Forest")

# Step 12: Predict outcomes on test data using Random Forest Model
new_tweets = test_df['text']
X_test_tfidf_testing = tfidf.transform(new_tweets)
predictions = model_rf.predict(X_test_tfidf_testing)

# Display predictions
for tweet, pred in zip(new_tweets[:10], predictions[:10]):
    print(f"Tweet: {tweet}")
    print(f"Prediction: {'Real Disaster' if pred == 1 else 'Not Real Disaster'}")
    print("-" * 50)