## Step 1: Importing Data

In [None]:
import numpy as np
import pandas as pd

In [2]:
#Supress warning
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,keyword,place,tweet,disaster
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,id,keyword,place,tweet
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Step 2: Inspecting the dataframe

In [7]:
# Let's check the dimensions of the dataframe
df.shape

(7613, 5)

In [8]:
df_test.shape

(3263, 4)

In [9]:
# let's look at the statistical aspects of the dataframe
df.describe()

Unnamed: 0,id,disaster
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   place     5080 non-null   object
 3   tweet     7613 non-null   object
 4   disaster  7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       3263 non-null   int64 
 1   keyword  3237 non-null   object
 2   place    2158 non-null   object
 3   tweet    3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


## Step 3: Removing noise from tweets

In [13]:
#Let's convert into lowercase first
df.tweet = df.tweet.str.lower()
df_test.tweet = df_test.tweet.str.lower()

In [14]:
df.head()

Unnamed: 0,id,keyword,place,tweet,disaster
0,1,,,our deeds are the reason of this #earthquake m...,1
1,4,,,forest fire near la ronge sask. canada,1
2,5,,,all residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,just got sent this photo from ruby #alaska as ...,1


In [15]:
df_test.head()

Unnamed: 0,id,keyword,place,tweet
0,0,,,just happened a terrible car crash
1,2,,,"heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,apocalypse lighting. #spokane #wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


In [16]:
import re

In [17]:
#removing URLs from tweets
def remove_noise(text):
    text = re.sub(r'@\w+', '', text) 
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return re.sub(r'http\S+|www\S+', '', text)

df['tweet'] = df['tweet'].apply(remove_noise)
df_test['tweet'] = df_test['tweet'].apply(remove_noise)

### Handling NAN values

In [18]:
df.isna().sum()

id             0
keyword       61
place       2533
tweet          0
disaster       0
dtype: int64

In [19]:
df_test.isna().sum()

id            0
keyword      26
place      1105
tweet         0
dtype: int64

In [20]:
most_frequent_place = df['place'].mode()[0]
df['place'].fillna(most_frequent_place, inplace=True)

In [21]:
most_frequent_place = df_test['place'].mode()[0]
df_test['place'].fillna(most_frequent_place, inplace=True)

In [22]:
#dropping rows with keyword as NAN
df.dropna(subset=['keyword'], inplace=True)
df_test.dropna(subset=['keyword'], inplace=True)

In [24]:
print(df.isna().sum())
print("<------------------->")
print(df_test.isna().sum())

id          0
keyword     0
place       0
tweet       0
disaster    0
dtype: int64
<------------------->
id         0
keyword    0
place      0
tweet      0
dtype: int64


## Step 4: Feature Extraction

### Performing tokenization

In [26]:
from nltk.tokenize import word_tokenize
import nltk

In [27]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\risha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
def tokenize_tweet(text):
    return word_tokenize(text)

In [29]:
df['tweet'] = df['tweet'].apply(tokenize_tweet)
df_test['tweet'] = df_test['tweet'].apply(tokenize_tweet)

### Removing stop words

In [30]:
from nltk.corpus import stopwords

In [31]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\risha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

In [33]:
df['tweet'] = df['tweet'].apply(remove_stopwords)
df_test['tweet'] = df_test['tweet'].apply(remove_stopwords)

### applying lemmatization on tweets token

In [34]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [35]:
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [36]:
df['tweet'] = df['tweet'].apply(lemmatize)
df_test['tweet'] = df_test['tweet'].apply(lemmatize)

In [37]:
df.head()

Unnamed: 0,id,keyword,place,tweet,disaster
31,48,ablaze,Birmingham,"[wholesale, market, ablaze]",1
32,49,ablaze,Est. September 2012 - Bristol,"[always, try, bring, heavy, metal, rt]",0
33,50,ablaze,AFRICA,"[africanbaze, breaking, newsnigeria, flag, set...",1
34,52,ablaze,"Philadelphia, PA","[cry, set, ablaze]",0
35,53,ablaze,"London, UK","[plus, side, look, sky, last, night, ablaze]",0


In [38]:
df_test.head()

Unnamed: 0,id,keyword,place,tweet
15,46,ablaze,London,"[birmingham, wholesale, market, ablaze, bbc, n..."
16,47,ablaze,Niall's place | SAF 12 SQUAD |,"[wear, short, race, ablaze]"
17,51,ablaze,NIGERIA,"[previouslyondoyintv, toke, makinwas, marriage..."
18,58,ablaze,Live On Webcam,"[check, nsfw]"
19,60,ablaze,"Los Angeles, Califnordia","[psa, im, splitting, personality, techie, foll..."


### Vectorization

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['tweet'].apply(lambda tokens: ' '.join(tokens)))

In [44]:
X_test_csv = vectorizer.transform(df_test['tweet'].apply(lambda tokens: ' '.join(tokens)))

In [42]:
X.shape

(7552, 13229)

In [45]:
X_test_csv.shape

(3237, 13229)

## Step 5: Splitting data for model building

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
y = df['disaster']

In [48]:
# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

## Step 6: Model Building

### Using Naive Bytes

In [49]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [50]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [51]:
y_pred = model.predict(X_train)

# Evaluating the model
print("Accuracy:", accuracy_score(y_train, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred))
print("Classification Report:\n", classification_report(y_train, y_pred))

Accuracy: 0.8920708491971527
Confusion Matrix:
 [[3341  122]
 [ 530 2048]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.91      3463
           1       0.94      0.79      0.86      2578

    accuracy                           0.89      6041
   macro avg       0.90      0.88      0.89      6041
weighted avg       0.90      0.89      0.89      6041



### hyperparameter tuning 

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
param_grid = {
    'alpha': [0.1, 0.5, 1.0]
}

In [56]:
nb_model = MultinomialNB()
grid_search = GridSearchCV(estimator=nb_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Assuming X_train and y_train are defined
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_train_pred_best = best_model.predict(X_train)

print("Best Model Parameters:", best_params)
print("Best Model Accuracy on Training Data:", accuracy_score(y_train, y_train_pred_best))
print("Best Model Classification Report on Training Data:\n", classification_report(y_train, y_train_pred_best))

Best Model Parameters: {'alpha': 0.5}
Best Model Accuracy on Training Data: 0.9092865419632511
Best Model Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.89      0.97      0.92      3463
           1       0.95      0.83      0.89      2578

    accuracy                           0.91      6041
   macro avg       0.92      0.90      0.91      6041
weighted avg       0.91      0.91      0.91      6041



#### applying the model on test dataset

In [57]:
y_test_pred = best_model.predict(X_test_csv)
print("Predictions on Test Data:\n", y_test_pred)

Predictions on Test Data:
 [1 0 1 ... 0 0 0]


In [60]:
df_test['Disaster_Predicted'] = y_test_pred

In [61]:
df_test.head()

Unnamed: 0,id,keyword,place,tweet,Predicted,Disaster_Predicted
15,46,ablaze,London,"[birmingham, wholesale, market, ablaze, bbc, n...",1,1
16,47,ablaze,Niall's place | SAF 12 SQUAD |,"[wear, short, race, ablaze]",0,0
17,51,ablaze,NIGERIA,"[previouslyondoyintv, toke, makinwas, marriage...",1,1
18,58,ablaze,Live On Webcam,"[check, nsfw]",0,0
19,60,ablaze,"Los Angeles, Califnordia","[psa, im, splitting, personality, techie, foll...",0,0
