In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# read the data
df = pd.read_csv('https://raw.githubusercontent.com/nikjohn7/Disaster-Tweets-Kaggle/main/data/train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# how many rows and columns are in the data set?
df.shape

(7613, 5)

In [4]:
import nltk
nltk.download('stopwords')

stopwords = set(nltk.corpus.stopwords.words('english'))


#I was thinking about including all of the location attribute values as stop words, 
#but I decided against it, as often distasters like wildfires are centered
#around a specific location, and I don't want to lose that information.

#However, it should be noted that the model trained as is will attribute distaster or
#non-disaster weight to certain locations due to the presence of those locations and the content
#of the tweets in the training data. This is a limitation of the model.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\godpi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# build a text processing and classifier pipeline
# to predict whether a tweet is about a real disaster or not

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


df2 = df.copy()

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df2['text'], df2['target'], test_size=0.2)

# Create a pipeline that first transforms the text data into TF-IDF vectors, then applies SVM
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(stopwords))),
    ('clf', svm.SVC(random_state=np.random.seed(1))),
])

# Train the classifier
text_clf.fit(X_train, y_train)

# Predict the test set results
y_pred = text_clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=['Disaster', 'Non-Disaster']))


              precision    recall  f1-score   support

    Disaster       0.80      0.89      0.84       886
Non-Disaster       0.82      0.68      0.75       637

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523



In [6]:
# This script creates a new column 'sentiment' in the dataframe, 
# which contains the sentiment score of the text. 
# The sentiment score is a float within the range [-1.0, 1.0], 
# where -1.0 denotes a very negative sentiment, 
# 1.0 denotes a very positive sentiment, 
# and values around 0 denote a neutral sentiment.

from textblob import TextBlob

# Define a function to apply sentiment analysis to a text
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity  # returns a value between -1 and 1

# Create a new column 'sentiment' in the DataFrame
df2['sentiment'] = df2['text'].apply(get_sentiment)

# Display the DataFrame
df2

Unnamed: 0,id,keyword,location,text,target,sentiment
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.000000
1,4,,,Forest fire near La Ronge Sask. Canada,1,0.100000
2,5,,,All residents asked to 'shelter in place' are ...,1,-0.018750
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0.000000
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0.000000
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,0.000000
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,0.150000
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0.000000
7611,10872,,,Police investigating after an e-bike collided ...,1,-0.260417


In [7]:
# find average sentiment for each tweet in each class in df2
df2.groupby('target')['sentiment'].mean()

target
0    0.070622
1    0.018631
Name: sentiment, dtype: float64

In [8]:
# find average sentiment for each keyword in df2
# order the results from most positive to most negative

df2.groupby('keyword')['sentiment'].mean().sort_values(ascending=False)

keyword
hazardous               0.457891
razed                   0.418946
outbreak                0.312661
mayhem                  0.277262
wreckage                0.273440
                          ...   
trapped                -0.160049
structural%20failure   -0.195099
airplane%20accident    -0.202232
violent%20storm        -0.510888
bloody                 -0.522698
Name: sentiment, Length: 221, dtype: float64

In [9]:
# Out of curiousity, I want to build a classifier that uses the sentiment of a tweet along with
# the text to predict whether a tweet is about a real disaster or not.


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Define a function to select the 'sentiment' attribute
def select_sentiment(X):
    return X[['sentiment']].values

# Create a pipeline that first transforms the 'text' attribute into TF-IDF vectors, and leaves
# the 'sentiment' attribute as is, then applies SVM
tweet_pipeline = Pipeline([
    ('features', ColumnTransformer([
        ('text', TfidfVectorizer(stop_words=list(stopwords)), 'text'),
        ('sentiment', FunctionTransformer(select_sentiment, validate=False), ['sentiment']),
    ])),
    ('clf', svm.SVC()),
])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df2[['text', 'sentiment']], df2['target'], test_size=0.2)

# Train the classifier
tweet_pipeline.fit(X_train, y_train)

# Predict the test set results
y_pred = tweet_pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred, target_names=['Disaster', 'Non-Disaster']))



              precision    recall  f1-score   support

    Disaster       0.77      0.89      0.83       846
Non-Disaster       0.83      0.66      0.74       677

    accuracy                           0.79      1523
   macro avg       0.80      0.78      0.78      1523
weighted avg       0.80      0.79      0.79      1523



As you can see, the impact is negligable, which likely implies that the SVM likely figures out the relationships between the words and the sentiment on its own, without the help of TextBlob