In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
import re
import string
import contractions
import collections

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# English stop-word list from the NLTK corpus; later passed to preprocess_text as the removal list
stop_words = stopwords.words('english')

In [6]:
# Restore the trained text vectorizer from its pickle file.
# NOTE: pickle.load can execute arbitrary code -- only unpickle artifacts from a trusted source.
with open('./WINNING_MODEL/training_vectorizer.pkl', mode='rb') as vectorizer_handle:
    vectorizer = pickle.load(vectorizer_handle)

In [7]:
# Restore the trained classifier from its pickle file.
# NOTE: pickle.load can execute arbitrary code -- only unpickle artifacts from a trusted source.
with open('./WINNING_MODEL/yes_no_svc_plus.pkl', mode='rb') as classifier_handle:
    svc = pickle.load(classifier_handle)

In [8]:
# csv files from the API

# df = pd.read_csv('./ADD_HUR_DATA/sandy_2012_3day_increment_v2.csv')
# df.shape

In [9]:
# df = pd.read_csv('./ADD_HUR_DATA/arthur_2014_3day_increment_v2.csv')
# df.shape

In [10]:
# df = pd.read_csv('./ADD_HUR_DATA/barry_2019_3day_increment.csv')
# df.shape

In [11]:
# df = pd.read_csv('./ADD_HUR_DATA/florence_2018_3day_increment.csv')
# df.shape

In [12]:
# df = pd.read_csv('./ADD_HUR_DATA/isaias_2020_3day_increment.csv')
# df.shape

In [13]:
# df = pd.read_csv('./ADD_HUR_DATA/nicholas_2021_3day_increment.csv')
# df.shape

In [14]:
# Tweets to score in this run: the Hurricane Ian export (one hurricane file is active at a time)
df = pd.read_csv(filepath_or_buffer='./ADD_HUR_DATA/hurricane_ian_3day_increment2.csv')
df.shape

(8627, 16)

In [15]:
# select or combine lemmatization, stemming, stopword removal to compare performance

# Stemmer / lemmatizer instances are created on first use and then shared by every
# call, so applying preprocess_text row by row does not rebuild them for each row.
_NORMALIZER_CACHE = {}


def _get_normalizer(kind):
    """Return the shared Porter stemmer ('stem') or WordNet lemmatizer (any other kind)."""
    if kind not in _NORMALIZER_CACHE:
        if kind == 'stem':
            _NORMALIZER_CACHE[kind] = nltk.stem.porter.PorterStemmer()
        else:
            _NORMALIZER_CACHE[kind] = nltk.stem.wordnet.WordNetLemmatizer()
    return _NORMALIZER_CACHE[kind]


def preprocess_text(text, flg_stemm = True, flg_lemm = True, lst_stopwords=None):
    """Clean one piece of text and return it as a single normalized string.

    Steps, in order:
      1. Convert ``text`` with ``str()`` and strip surrounding whitespace.
      2. Remove every character that is not a word character, whitespace,
         ``.`` or ``,``.
      3. Split on whitespace and, if ``lst_stopwords`` is given, drop tokens
         found in it (exact, case-sensitive match).
      4. Optionally Porter-stem, then optionally WordNet-lemmatize each token.
      5. Re-join with single spaces, re-attach ``,`` and ``.`` to the token
         before them, and run ``contractions.fix`` on the result.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``
            rather than skipped (so ``None`` becomes ``"None"``).
        flg_stemm: Truthy to apply Porter stemming.
        flg_lemm: Truthy to apply WordNet lemmatization (after stemming).
        lst_stopwords: Optional collection of tokens to remove; ``None``
            disables stop-word removal. The caller's collection is not
            modified.

    Returns:
        str: The cleaned text.
    """
    # Keep only word characters, whitespace, '.' and ','.
    text_clean = re.sub(r'[^\w\s.,]', '', str(text).strip())
    lst_text = text_clean.split()

    if lst_stopwords is not None:
        # A list/tuple would be scanned linearly for every token; a frozenset gives
        # hash lookups with the same membership result. Other container types are
        # used as passed so their own `in` semantics are preserved.
        if isinstance(lst_stopwords, (list, tuple)):
            try:
                lst_stopwords = frozenset(lst_stopwords)
            except TypeError:
                # Unhashable entries: keep the original sequence and its linear scan.
                pass
        lst_text = [word for word in lst_text if word not in lst_stopwords]
    if flg_stemm:
        stem = _get_normalizer('stem').stem
        lst_text = [stem(word) for word in lst_text]
    if flg_lemm:
        lemmatize = _get_normalizer('lemma').lemmatize
        lst_text = [lemmatize(word) for word in lst_text]

    # Drop any empty tokens, then glue stand-alone ',' / '.' back onto the previous token.
    text_clean = ' '.join(filter(None, lst_text))
    text_clean = text_clean.replace(" ,",",").replace(' .', '.')
    # NOTE(review): the regex above has already removed apostrophes, so
    # contractions.fix only sees apostrophe-free (and possibly stemmed) tokens here.
    # The order is left unchanged because the pickled vectorizer was presumably
    # fitted on text prepared exactly this way -- confirm before reordering.
    text_clean = contractions.fix(text_clean)
    return text_clean

In [16]:
# Clean every tweet with stop-word removal, stemming and lemmatization all enabled
X = df['text'].apply(
    preprocess_text,
    flg_stemm=True,
    flg_lemm=True,
    lst_stopwords=stop_words,
)
X.shape

(8627,)

In [17]:
# Turn the cleaned tweets into feature vectors with the vectorizer loaded from
# training_vectorizer.pkl (transform only -- nothing is re-fitted here)
X_vec = vectorizer.transform(X)

In [18]:
# Predict a label for every vectorized tweet with the classifier loaded from yes_no_svc_plus.pkl
preds = svc.predict(X_vec)

In [19]:
# Sanity check: there should be one prediction per preprocessed tweet (compare with X.shape)
preds.shape

(8627,)

In [20]:
# Attach the predicted label to each tweet as a new 'predictions' column
df = df.assign(predictions=preds)
df.shape

(8627, 17)

In [21]:
# Keep only the tweets the classifier labelled 1
final_df = df.loc[df['predictions'].eq(1)]
final_df.shape

(2605, 17)

In [22]:
# Preview the last rows of the filtered frame
final_df.tail()

Unnamed: 0,id,text,author_id,created_at,public_metrics,reply_settings,entities,possibly_sensitive,edit_history_tweet_ids,source,lang,referenced_tweets,conversation_id,in_reply_to_user_id,attachments,withheld,predictions
8597,1588001572419764224,RT via @NHC_Atlantic: Tropical Storm #Lisa Adv...,818842601327562752,2022-11-03T02:54:13.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",everyone,"{'urls': [{'start': 153, 'end': 176, 'url': 'h...",False,['1588001572419764224'],IFTTT,en,,1588001572419764224,,,,1
8606,1588001558960054272,"Make it rain, hurricane in it",700558609411780608,2022-11-03T02:54:09.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",everyone,,False,['1588001558960054272'],Twitter for iPhone,en,,1588001558960054272,,,,1
8608,1588001556762337285,@Sam_Valance Anyone else notice the Republican...,363549180,2022-11-03T02:54:09.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",everyone,"{'urls': [{'start': 263, 'end': 286, 'url': 'h...",False,['1588001556762337285'],Twitter Web App,en,"[{'type': 'replied_to', 'id': '158794864055495...",1587948640554950656,16232484.0,{'media_keys': ['16_1588001551028830208']},,1
8615,1588001541553782784,Roofing and Solar in Hurricane Ian! What an ex...,353364316,2022-11-03T02:54:05.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",everyone,"{'annotations': [{'start': 21, 'end': 33, 'pro...",False,['1588001541553782784'],Twitter for iPhone,en,,1588001541553782784,,,,1
8625,1588001507194142721,@CharlieCrist Another Democrat lie. It was NO...,937844576672538626,2022-11-03T02:53:57.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",everyone,"{'mentions': [{'start': 0, 'end': 13, 'usernam...",False,['1588001507194142721'],Twitter for iPhone,en,"[{'type': 'replied_to', 'id': '158794943720846...",1587949437208469504,38970940.0,,,1


In [23]:
# final_df.to_csv('./CLEAN_ADD_HUR_DATA/sandy_2012_yn.csv', index=False)

In [24]:
# final_df.to_csv('./CLEAN_ADD_HUR_DATA/arthur_2014_yn.csv', index=False)

In [25]:
# final_df.to_csv('./CLEAN_ADD_HUR_DATA/barry_2019_yn.csv', index=False)

In [26]:
# final_df.to_csv('./CLEAN_ADD_HUR_DATA/florence_2018_yn.csv', index=False)

In [27]:
# final_df.to_csv('./CLEAN_ADD_HUR_DATA/isaias_2020_yn.csv', index=False)

In [28]:
# Write the tweets labelled 1 for Hurricane Ian to CSV, leaving out the row index
final_df.to_csv(path_or_buf='./CLEAN_ADD_HUR_DATA/ian_2022_yn.csv', index=False)