In [45]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily flatten the directory walk into one stream of full file paths,
# then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# The Problem Statement
We have to build a machine learning model that predicts which Tweets are about real disasters and which ones aren't.

# Importing Libraries 

In [46]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Reading Dataset

In [47]:
# Load the competition CSVs: `train` carries the `target` label column,
# `test` has the same feature columns without it.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# About Dataset
* id - a unique identifier for each tweet
* text - the text of the tweet
* location - the location the tweet was sent from (may be blank)
* keyword - a particular keyword from the tweet (may be blank)
* target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)

You can read more about the dataset from [here](https://www.kaggle.com/competitions/nlp-getting-started/data) 

# EDA

In [48]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [49]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [50]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [51]:
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [52]:
train['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [53]:
train['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [54]:
train['location'].value_counts()

location
USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: count, Length: 3341, dtype: int64

In [55]:
train['location'].unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [56]:
# Keep only the tweet text (plus the label in `train`).
# Re-assigning instead of mutating in place, together with errors='ignore',
# lets this cell be re-run without raising KeyError on already-dropped columns.
unused_columns = ['id', 'keyword', 'location']
train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')

# Handling Imbalanced Dataset

In [57]:
train['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

As you can see, we have 4342 tweets that are not about real disasters and 3271 tweets that are.

**I'm using downsampling method for this**

In [58]:
len_target1 = len(train[train['target']==1])

In [59]:
trainl = train[train['target']==0]

In [60]:
# Seed the down-sampling so the balanced dataset, and every score computed from
# it, is identical on each "Restart & Run All" (same seed as the train/test split).
trainl = trainl.sample(len_target1, random_state=42)

In [61]:
trainm  = train[train['target']==1]

In [62]:
df = pd.concat([trainm, trainl])
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [63]:
df['target'].value_counts()

target
1    3271
0    3271
Name: count, dtype: int64

# Preprocessing

Text data preprocessing aims to make text data suitable for analysis by reducing noise, standardizing text, and extracting meaningful features. It improves the quality of input data for machine learning models and helps them perform better in various NLP tasks. The specific preprocessing steps you apply may vary depending on the task and the nature of the text data you're working with

In [64]:
# creating a function for preprocessing

# Filled on first use by _nlp_resources(); kept at notebook level so the
# stop-word list is read once instead of once per token.
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return (stop-word set, stemmer), building both on the first call."""
    if not _NLP_RESOURCES:
        # A frozenset gives constant-time membership tests; the original
        # scanned the full stop-word list for every token.
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['stemmer'] = PorterStemmer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['stemmer']


def preprocessing(text):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip punctuation, tokenize with NLTK,
    drop English stop words, apply Porter stemming.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces.
    """
    stop_words, stemmer = _nlp_resources()

    # convert all to lowercase
    text = text.lower()

    # remove punctuation: every character that is neither a word character
    # nor whitespace is deleted (digits and underscores are kept)
    text = re.sub(r'[^\w\s]', '', text)

    # converting sentence into words
    tokens = nltk.word_tokenize(text)

    # remove stopwords, then stem what is left
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return ' '.join(stems)

In [65]:
# applying above function on text of train data
df['text'] = df['text'].apply(preprocessing)

In [66]:
df.head()

Unnamed: 0,text,target
0,deed reason earthquak may allah forgiv us,1
1,forest fire near la rong sask canada,1
2,resid ask shelter place notifi offic evacu she...,1
3,13000 peopl receiv wildfir evacu order california,1
4,got sent photo rubi alaska smoke wildfir pour ...,1


In [67]:
# applying preprocessing function on test data
test['text'] = test['text'].apply(preprocessing)

In [68]:
test.head()

Unnamed: 0,text
0,happen terribl car crash
1,heard earthquak differ citi stay safe everyon
2,forest fire spot pond gees flee across street ...
3,apocalyps light spokan wildfir
4,typhoon soudelor kill 28 china taiwan


# Model Building

I'm storing text data in x variable and target in y variable.

In [69]:
x = df['text']
y = df['target']

Splitting x and y into training and testing datasets.

In [70]:
# No test_size is passed, so scikit-learn's default split is used
# (the reports below show 1636 held-out rows); random_state=42 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

Creating a function that calculates metrics for our models, so we don't have to write the same code again and again.

In [71]:
def metrix(pred, y_test):
    """Print accuracy, confusion matrix and classification report.

    `pred` holds the predicted labels and `y_test` the true labels; each
    scikit-learn metric is called as metric(y_test, pred). Nothing is returned.
    """
    accuracy = accuracy_score(y_test, pred)
    print(f'acc: {accuracy}')

    print('confusion:')
    matrix = confusion_matrix(y_test, pred)
    print(matrix)

    print('classification:')
    report = classification_report(y_test, pred)
    print(report)

Creating a helper function that builds the pipeline, fits it, and evaluates it, so we can reuse it for every model we try.

In [72]:
def pipe(model):
    """Fit a CountVectorizer -> TfidfTransformer -> `model` pipeline and report on it.

    Uses the notebook-level x_train / y_train for fitting and x_test / y_test
    for evaluation. Returns whatever `metrix` returns after it prints the scores.
    """
    # Text -> token counts -> TF-IDF weights -> the classifier passed in.
    steps = [
        ('cv', CountVectorizer()),
        ('tf', TfidfTransformer()),
        ('model', model),
    ]
    text_clf = Pipeline(steps=steps)

    # Train on the training split, then predict the held-out split.
    text_clf.fit(x_train, y_train)
    predictions = text_clf.predict(x_test)

    # Delegate the reporting to the shared metrics helper.
    return metrix(predictions, y_test)

**We will try different models and will select one which is performing best**

**Logistic Regression**

In [73]:
%%time
from sklearn.linear_model import LogisticRegression
pipe(LogisticRegression())

acc: 0.7885085574572127
confusion:
[[645 151]
 [195 645]]
classification:
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       796
           1       0.81      0.77      0.79       840

    accuracy                           0.79      1636
   macro avg       0.79      0.79      0.79      1636
weighted avg       0.79      0.79      0.79      1636

CPU times: user 1 s, sys: 712 ms, total: 1.72 s
Wall time: 653 ms


**SVM with linear classifier**

In [74]:
from sklearn.svm import LinearSVC
pipe(LinearSVC())

acc: 0.7720048899755502
confusion:
[[621 175]
 [198 642]]
classification:
              precision    recall  f1-score   support

           0       0.76      0.78      0.77       796
           1       0.79      0.76      0.77       840

    accuracy                           0.77      1636
   macro avg       0.77      0.77      0.77      1636
weighted avg       0.77      0.77      0.77      1636



**Bernoulli Naive Bayes**

In [75]:
from sklearn.naive_bayes import BernoulliNB
pipe(BernoulliNB())

acc: 0.793398533007335
confusion:
[[689 107]
 [231 609]]
classification:
              precision    recall  f1-score   support

           0       0.75      0.87      0.80       796
           1       0.85      0.72      0.78       840

    accuracy                           0.79      1636
   macro avg       0.80      0.80      0.79      1636
weighted avg       0.80      0.79      0.79      1636



**Multinomial Naive Bayes**

In [76]:
from sklearn.naive_bayes import MultinomialNB
pipe(MultinomialNB())

acc: 0.7811735941320294
confusion:
[[635 161]
 [197 643]]
classification:
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       796
           1       0.80      0.77      0.78       840

    accuracy                           0.78      1636
   macro avg       0.78      0.78      0.78      1636
weighted avg       0.78      0.78      0.78      1636



**RandomForest**

In [77]:
from sklearn.ensemble import RandomForestClassifier
pipe(RandomForestClassifier())

acc: 0.7677261613691931
confusion:
[[656 140]
 [240 600]]
classification:
              precision    recall  f1-score   support

           0       0.73      0.82      0.78       796
           1       0.81      0.71      0.76       840

    accuracy                           0.77      1636
   macro avg       0.77      0.77      0.77      1636
weighted avg       0.77      0.77      0.77      1636



**Gradient Boosting**

In [78]:
from sklearn.ensemble import GradientBoostingClassifier
pipe(GradientBoostingClassifier())

acc: 0.7310513447432763
confusion:
[[715  81]
 [359 481]]
classification:
              precision    recall  f1-score   support

           0       0.67      0.90      0.76       796
           1       0.86      0.57      0.69       840

    accuracy                           0.73      1636
   macro avg       0.76      0.74      0.73      1636
weighted avg       0.76      0.73      0.72      1636



**Decision Tree**

In [79]:
from sklearn.tree import DecisionTreeClassifier
pipe(DecisionTreeClassifier())

acc: 0.7004889975550123
confusion:
[[539 257]
 [233 607]]
classification:
              precision    recall  f1-score   support

           0       0.70      0.68      0.69       796
           1       0.70      0.72      0.71       840

    accuracy                           0.70      1636
   macro avg       0.70      0.70      0.70      1636
weighted avg       0.70      0.70      0.70      1636



**KNN**

In [80]:
from sklearn.neighbors import KNeighborsClassifier
pipe(KNeighborsClassifier())

acc: 0.7451100244498777
confusion:
[[639 157]
 [260 580]]
classification:
              precision    recall  f1-score   support

           0       0.71      0.80      0.75       796
           1       0.79      0.69      0.74       840

    accuracy                           0.75      1636
   macro avg       0.75      0.75      0.74      1636
weighted avg       0.75      0.75      0.74      1636



# Making Submission

Reading submission file

In [81]:
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


**Of all the models above, Bernoulli Naive Bayes performs best, so we will select it for making predictions on our test dataset**

In [82]:
# creating the final pipeline with Bernoulli Naive Bayes.
# The classifier step is named 'model' (as in pipe()); it was previously
# labelled 'Lsvc', a leftover name that did not match the estimator it holds.
pipeline = Pipeline(steps=[('cv', CountVectorizer()),
                           ('tf', TfidfTransformer()),
                           ('model', BernoulliNB())])
pipeline.fit(x_train, y_train)
pred = pipeline.predict(x_test)
# Same held-out evaluation as in the model comparison above.
metrix(pred, y_test)

acc: 0.793398533007335
confusion:
[[689 107]
 [231 609]]
classification:
              precision    recall  f1-score   support

           0       0.75      0.87      0.80       796
           1       0.85      0.72      0.78       840

    accuracy                           0.79      1636
   macro avg       0.80      0.80      0.79      1636
weighted avg       0.80      0.79      0.79      1636



In [83]:
test.head()

Unnamed: 0,text
0,happen terribl car crash
1,heard earthquak differ citi stay safe everyon
2,forest fire spot pond gees flee across street ...
3,apocalyps light spokan wildfir
4,typhoon soudelor kill 28 china taiwan


In [84]:
# making prediction on test dataset
pred = pipeline.predict(test['text'])

In [85]:
# converting predictions into series
pred = pd.Series(pred)

In [86]:
# replacing target column of submission with our prediction
# replacing target column of submission with our prediction
# Assigning a Series aligns on the index, so this relies on `pred` and
# `sample_submission` both having the default 0..n-1 index and on test.csv
# listing its rows in the same id order as sample_submission.csv.
sample_submission['target'] = pred
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [87]:
# checking summary of our submission file
sample_submission.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.372663
std,3146.427221,0.483588
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0


In [88]:
# saving submission file for submission 
sample_submission.to_csv("submission.csv", index=False)

# Conclusion


In this project, we set out to build a tweet classification model to determine whether tweets are related to real disasters or not. Our Bernoulli Naive Bayes classifier achieved an accuracy of about 0.79 on the held-out validation split (see the output above), demonstrating its ability to distinguish between disaster-related and non-disaster-related tweets.

**Key Takeaways:**

1. **Data Preprocessing Matters:** We started by preprocessing the text data, which included tasks like tokenization, stop word removal, and feature extraction. Proper data preprocessing played a crucial role in improving the model's performance.

2. **Feature Selection:** The choice of features and representation of the text data significantly influenced our model's accuracy. At the start we dropped the `id`, `keyword`, and `location` columns (the latter two contain null values) and kept only the tweet text. The Bernoulli classifier, which is suitable for binary features, proved to be a good choice for this task.

3. **Model Evaluation:** It's important to note that accuracy is just one metric. Depending on the application, other metrics like precision, recall, and F1-score may be more relevant, especially if dealing with imbalanced datasets.

4. **Further Improvements:** While our model performed well, there is always room for improvement. Experimenting with different algorithms, hyperparameter tuning, and incorporating domain-specific knowledge could potentially lead to even better results.

**Future Directions:**

Our success with this tweet classification task opens up several avenues for future work:

- Exploring more advanced NLP techniques, such as deep learning models like LSTM and BERT, to see if they can further boost performance.
- Collecting and incorporating additional data to improve the model's generalization.
- Investigating techniques to handle class imbalance, if it exists, for better model performance.

In conclusion, our Bernoulli classifier has demonstrated strong capabilities in classifying tweets, which can have real-world applications in disaster monitoring and response. This project serves as a solid foundation for future endeavors in NLP and text classification tasks.



## **If you like the NoteBook, please give an upvote.**