In [45]:
import re
import warnings
import pandas as pd

import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import spacy
import en_core_web_sm

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

# Word tokenization
from spacy.matcher import Matcher
from spacy.lang.en import English

from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from tqdm.notebook import tqdm_notebook as tqdm

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# Show up to 500 characters per cell so long tweet text is not truncated.
# The fully qualified option name is used instead of relying on pattern matching.
pd.set_option('display.max_colwidth', 500)

sns.set(rc={'figure.figsize':(9,6),'lines.linewidth': 5, 'lines.markersize': 10})
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases no longer ship the old names, so pick whichever spelling is installed
# instead of hard-coding one and raising OSError on the other versions.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if whitegrid_style is not None:
    plt.style.use(whitegrid_style)
sns.set_context("notebook", font_scale=1.2)
sns.set_style("whitegrid",{"font.family": ["Corbel"]})

In [79]:
# Read the sentiment-scored tweets, keep only rows that are not retweets,
# renumber the index from zero, and drop the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv -- confirm).
covid_tweets = (
    pd.read_csv("D:/UHC_Twitter/data/02_covid_with_sentiments.csv")
    .loc[lambda frame: frame.is_retweet == False]
    .reset_index(drop=True)
    .drop('Unnamed: 0', axis=1)
)
covid_tweets.head(3)

Unnamed: 0,id,tweeter_handle,init_reach,timestamp,orig_tweet,subjectivity,polarity,platform,location,likes,retweets,hashtags,mentions,is_retweet,KeyTopic,clean_tweet,Sentiments,Day,Hour
0,1249849625227464705,lauriefmiller,844,2020-04-13 23:59:06,Do you have a decision maker who will speak for you when you can't speak for yourself?\n\nJoin us Thursday April 16th at 3:00 for a thoughtful webinar to discuss the implications of #COVID19 on these important conversations.\n\nRegister at https://t.co/tqPWI9i3k3,0.75,0.4,Web App,"Dallas, Texas",0,0,COVID19,,False,MediaCoverage,Do you have a decision maker who will speak for you when you can't speak for yourself? Join us Thursday April 16th at 3:00 for a thoughtful webinar to discuss the implications of #COVID19 on these important conversations. Register at,0.6808,104,23
1,1249849328014782465,COVID19_AUS,1724,2020-04-13 23:57:55,"COVID-19 AUSTRALIA *UPDATE*\n14/4/2020 10:00 AM AEST\n\nNSW: 2,870 (+7)\nQLD: 998 (+11)\nVIC: 1291 (+10)\nSA: 431\nWA: 523 \nTAS: 150 \nACT: 103 \nNT: 28 \n\nICU: 82 (+3)\nTests: 364,082 (+1,843) \nNation Total: 6,394\nNew cases today: 28\nDeaths: 61\n\n7 days ago: 5917 cases",0.602273,0.068182,Web App,NSW,4,3,,,False,CaseReports,"COVID-19 AUSTRALIA *UPDATE* 14/4/2020 10:00 AM AEST NSW: 2,870 (+7) QLD: 998 (+11) VIC: 1291 (+10) SA: 431 WA: 523 TAS: 150 ACT: 103 NT: 28 ICU: 82 (+3) Tests: 364,082 (+1,843) Nation Total: 6,394 New cases today: 28 Deaths: 61 7 days ago: 5917 cases",0.0,104,23
2,1249848025452691456,JDSupra,21371,2020-04-13 23:52:44,"[Webinar] Law and Order in the Time of COVID-19: Does EPA’s Temporary Enforcement Policy Apply to Me? - April 17th, 10:00 am - 11:00 am CT - Register now https://t.co/FhZfRU9ds9",0.0,0.0,Web Client,JDSupra.com,0,0,,,False,MediaCoverage,"[Webinar] Law and Order in the Time of COVID-19: Does EPA’s Temporary Enforcement Policy Apply to Me? - April 17th, 10:00 am - 11:00 am CT - Register now",0.0,104,23


In [70]:
# Load the model in this cell: `nlp` is otherwise first assigned in a later
# cell, so on a fresh kernel (Restart & Run All) this cell raised NameError.
nlp = en_core_web_sm.load()

# Highlight the entities spaCy detects in one sample tweet (row label 2).
one_sentence = covid_tweets.clean_tweet[2]
doc = nlp(one_sentence)
spacy.displacy.render(doc, style='ent',jupyter=True)

In [88]:
# Load the en_core_web_sm spaCy pipeline once; get_entities and
# get_entity_labels below both call this shared `nlp` object.
nlp = en_core_web_sm.load()
def get_entities(tweet, sep=',', pipeline=None):
    """Return the text of every named entity found in ``tweet`` as one string.

    Parameters
    ----------
    tweet : str
        Text to analyse. Any non-string value (e.g. the NaN/None pandas uses
        for a missing tweet) yields ``None`` instead of being sent to the
        pipeline, which expects text.
    sep : str, default ','
        Separator placed between entities. Entity text can itself contain
        commas (for example "April 7, 2020"), so pass another separator when
        the result has to be split back into individual entities.
    pipeline : callable, optional
        Object called as ``pipeline(tweet)`` whose result exposes ``.ents``.
        Defaults to the notebook-level ``nlp`` model.

    Returns
    -------
    str or None
        The entity texts joined with ``sep``, or ``None`` when there is
        nothing to report.
    """
    # Guard first so missing values never reach the model.
    if not isinstance(tweet, str):
        return None
    doc = (nlp if pipeline is None else pipeline)(tweet)
    entities = [str(ent) for ent in doc.ents]
    return sep.join(entities) if entities else None
    
def get_entity_labels(tweet, sep=',', pipeline=None):
    """Return the label of every named entity found in ``tweet`` as one string.

    Labels are emitted in the order of ``doc.ents``, one per detected entity.

    Parameters
    ----------
    tweet : str
        Text to analyse. Any non-string value (e.g. the NaN/None pandas uses
        for a missing tweet) yields ``None`` instead of being sent to the
        pipeline, which expects text.
    sep : str, default ','
        Separator placed between labels.
    pipeline : callable, optional
        Object called as ``pipeline(tweet)`` whose result exposes ``.ents``
        with a ``label_`` attribute on each entity. Defaults to the
        notebook-level ``nlp`` model.

    Returns
    -------
    str or None
        The entity labels joined with ``sep``, or ``None`` when there is
        nothing to report.
    """
    # Guard first so missing values never reach the model.
    if not isinstance(tweet, str):
        return None
    doc = (nlp if pipeline is None else pipeline)(tweet)
    labels = [str(ent.label_) for ent in doc.ents]
    return sep.join(labels) if labels else None

In [89]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Fill one new column per extractor, in the same order as before.
# Each extractor parses the tweet itself, so every tweet is processed twice.
tweet_text = covid_tweets['clean_tweet']
for column_name, extractor in (
    ('Entities', get_entities),
    ('EntityTypes', get_entity_labels),
):
    covid_tweets[column_name] = tweet_text.progress_apply(extractor)

HBox(children=(FloatProgress(value=0.0, max=14228.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14228.0), HTML(value='')))




In [90]:
# Inspect the last 20 rows of the two entity columns.
covid_tweets.loc[:, ['Entities', 'EntityTypes']].tail(20)

Unnamed: 0,Entities,EntityTypes
14208,"Iran,Coronavirus Update,Over 20,400,April 7, 2020,6:00 PM CET,The People’s Mojahedin Organization of Iran,this afternoon,April 7,Coronavirus,more than 20,400,245,Iran","GPE,ORG,CARDINAL,DATE,TIME,ORG,TIME,DATE,PERSON,CARDINAL,CARDINAL,MONEY"
14209,"B.C.,first,Canada,N95","GPE,ORDINAL,GPE,ORG"
14210,"Paris,daytime,Paris,the day,Wednesday","GPE,TIME,GPE,DATE,DATE"
14211,"80,00,2020,COVID","CARDINAL,CARDINAL,ORG"
14212,"Kevin Chapman,founder &amp,The Kentucky Center for Anxiety &amp,Related Disorders,StayCentred","PERSON,ORG,ORG,WORK_OF_ART,PERSON"
14213,"Michigan,5,000,6:00 PM","GPE,MONEY,TIME"
14214,"two,10:00 AM and 2:00 PM,👇","CARDINAL,TIME,CARDINAL"
14215,"@samajwadiparty,India,three thousands,3000.00,today,India,111,India,Corporates,half,HUNGER","ORG,GPE,CARDINAL,CARDINAL,DATE,GPE,CARDINAL,GPE,ORG,CARDINAL,ORG"
14216,"Canada,1.5","GPE,MONEY"
14217,"CURES,Secondly,29.00,the Tweeting Cure","PRODUCT,ORDINAL,MONEY,FAC"


In [None]:
# Write the frame, including its index, to CSV.
# NOTE(review): the input was 02_covid_with_sentiments.csv, yet this writes to
# 01_covid_with_topics.csv, which looks like an earlier stage's file -- confirm
# that overwriting it is intended.
covid_tweets.to_csv(path_or_buf="D:/UHC_Twitter/data/01_covid_with_topics.csv")