In [1]:
import os
import pandas as pd
import pickle
import sys
import spacy
from spacy.language import Language
import re
from sklearn.linear_model import LogisticRegression

In [2]:
# project imports
from tools_zupan import make_str

In [3]:
# Placeholder dependency list: when the tweets of interest are available,
# swap these entries for the data generation step that actually feeds
# this notebook.
upstream = [
    "recommended_actions_upstream",
    "category_classification_models",
    "vectorizer",
]

In [4]:
# Parameters
# NOTE(review): this looks like a parameters cell injected by the pipeline
# runner (papermill/Ploomber style) rather than hand-written code — confirm
# before editing it by hand, since it is presumably regenerated on every run.
#
# It rebinds `upstream` from the list of names declared earlier to a dict that
# maps each of those names to a dict of file paths ("nb" plus the other
# entries listed under that name).
# `product["nb"]` is presumably where the executed copy of this notebook is
# written — TODO confirm.
#
# NOTE(review): every path below is absolute and machine-specific
# (/Users/mboussarov/...), so this rendered copy is not portable as-is.
upstream = {
    "recommended_actions_upstream": {
        "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/recommended_actions_upstream.ipynb",
        "file": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/twitter_actions.csv",
    },
    "vectorizer": {
        "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer.ipynb",
        "vectorizer": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vectorizer.pkl",
        "vocab": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/vocab.pkl",
    },
    "category_classification_models": {
        "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/category_classification_models.ipynb",
        "model_lr": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_lr.pkl",
        "model_rf": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_rf.pkl",
        "model_nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_nb.csv",
        "model_votingc": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/model_votingc.csv",
    },
}
product = {
    "nb": "/Users/mboussarov/_umsi/Capstone/umads_697_data_medics/pipeline/output/recommended_actions.ipynb"
}


In [5]:
# Load spaCy's small English pipeline and keep a handle on its default
# stop-word set.
nlp = spacy.load(name="en_core_web_sm")
stopwords = nlp.Defaults.stop_words

In [6]:
# Read the tweets produced by the "recommended_actions_upstream" task.
# Prefer the path handed over in the `upstream` parameters dict so this cell
# follows the pipeline configuration instead of depending on the current
# working directory. When the parameters have not been injected, `upstream`
# is still the plain list of names, so fall back to the original relative path.
if isinstance(upstream, dict):
    df = pd.read_csv(upstream["recommended_actions_upstream"]["file"])
else:
    df = pd.read_csv("output/twitter_actions.csv")

In [7]:
# load the vectorizer
# The context manager closes the file handle as soon as unpickling finishes
# (a bare open() inside pickle.load() leaves it to the garbage collector).
# NOTE: pickle.load can execute arbitrary code — only unpickle artifacts that
# this pipeline produced itself.
with open(os.path.join(".", "output", "vectorizer.pkl"), "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
# load the model
# The context manager closes the file handle as soon as unpickling finishes
# (a bare open() inside pickle.load() leaves it to the garbage collector).
# NOTE: pickle.load can execute arbitrary code — only unpickle artifacts that
# this pipeline produced itself.
with open(os.path.join(".", "output", "model_lr.pkl"), "rb") as model_file:
    clf_model = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
# Turn the raw tweet text into the feature matrix the classifier consumes,
# using the already-fitted vectorizer (transform only, no refit).
raw_tweets_vectorized = vectorizer.transform(df.tweet_text)

In [10]:
# Run the loaded classifier over the vectorized tweets; the predictions are
# attached to `df` as a column in the next step.
tweet_class_preds = clf_model.predict(raw_tweets_vectorized)

In [11]:
# Store the predictions on the tweet frame as a `predicted_class` column
# (this mutates `df` in place).
df["predicted_class"] = tweet_class_preds

In [12]:
# Keep only the tweets we care about - those calling for an action - and
# order them so the highest tweet_count comes first.
action_tweets = (
    df.loc[df["predicted_class"] == "rescue_volunteering_or_donation_effort"]
    .copy()
    .sort_values(by="tweet_count", ascending=False)
)

In [13]:
# Parse every action tweet with spaCy and keep the resulting Doc next to the
# raw text so later cells can inspect individual tokens.
action_tweets["spacy_text"] = action_tweets.loc[:, "tweet_text"].apply(nlp)

In [14]:
# Action verbs to look for in a tweet, and a single case-insensitive pattern
# that matches any one of them (each verb is escaped before being joined).
verb_list = [
    "donate",
    "volunteer",
    "evacuate",
]
regex = re.compile("|".join(map(re.escape, verb_list)), flags=re.IGNORECASE)

In [15]:
# Print a human-readable recommendation for every action tweet that both
# mentions one of the verbs in `verb_list` and contains at least one URL.
# Tweets missing either are skipped silently.
for idx, data in action_tweets.iterrows():
    tweet_text = data["tweet_text"]

    # find the recommended action(s); at least one verb has to be present
    verb_matches = regex.findall(tweet_text)
    if not verb_matches:
        continue

    # find and record all the urls in the tweet (often more than 1 donation site)
    # NOTE(review): retweets that were cut off with an ellipsis can leave a bare
    # "https://t.co" token here, which is not a usable link — consider filtering.
    donation_url_list = [token for token in data["spacy_text"] if token.like_url]
    if not donation_url_list:
        continue

    # check for a retweet and, if it is one, credit the retweeted account.
    # The class is spelled A-Za-z on purpose: "A-z" would also admit the
    # punctuation that sits between "Z" and "a" ([ \ ] ^ `), and "+" rather
    # than "*" keeps an empty handle from being reported as the author.
    original_tweeter = re.findall(r"RT @([A-Za-z0-9_]+)", tweet_text)
    tweet_author = original_tweeter[0] if original_tweeter else data["name"]

    # presumably tweet_count includes the author's own tweet, hence the -1
    # for the number of "others" — TODO confirm against the upstream task
    total_tweet_count = data.tweet_count - 1

    print(f"Original tweet:\n{tweet_text}\n")
    # a dedicated counter name so the outer `idx` row label is not overwritten
    for url_position, url in enumerate(donation_url_list):
        if url_position == 0:
            print(f"{tweet_author} and {total_tweet_count} others recommend you {make_str(verb_matches)}.  More information at {url}")
        else:
            print(f"Please also consider donating to {url}")
    print("\n\n")

Original tweet:
RT @soompi: #IU, #Suzy, #GOT7’s #Jinyoung, And More Donate To Aid Flood Relief Efforts
https://t.co/pdrAQuO0ko https://t.co/aT4jt3fjyC

soompi and 904 others recommend you donate.  More information at https://t.co/pdrAQuO0ko
Please also consider donating to https://t.co/aT4jt3fjyC



Original tweet:
RT @rbrbszsz: TRENDING #1 NATE NEWS! 
"Suzy donate 100 million won to flood damage, and donate 300 million won to Hope Bridge"
https://t.co…

rbrbszsz and 214 others recommend you donate.  More information at https://t.co



Original tweet:
RT @mckendree74: https://t.co/6Mrn2IBPdT

Thank you @BoomerSquadNFT1! Boomers have listed this ultra RARE awesome NFT to donate to the floo…

mckendree74 and 64 others recommend you donate.  More information at https://t.co/6Mrn2IBPdT



Original tweet:
RT @WYMT: Texas Roadhouse to donate 100% of sales at some locations Tuesday to EKY flood relief https://t.co/KeZQDeCm6w

WYMT and 29 others recommend you donate.  More information at