# Initial Sentiment Analyses

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import pymongo
from pymongo import MongoClient
import re

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.stem.snowball import SnowballStemmer

import itertools
from langdetect import detect
import seaborn as sns

#!pip install textblob
from textblob import TextBlob

#!pip install plotly==4.9.0
#!pip install cufflinks
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

from textblob.sentiments import NaiveBayesAnalyzer
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ear51\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

## Import Data

In [3]:
client = pymongo.MongoClient("mongodb+srv://group3:group3psu!@squid.36jsw.mongodb.net/CORD19?retryWrites=true&w=majority")
db = client.CORD19
db.list_collection_names()

['scratch', 'clean', 'preprocess', 'working2']

In [4]:
collection_clean = db.preprocess
mongo_df_clean = pd.DataFrame(list(collection_clean.find()))
df_1 = mongo_df_clean 
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57921 entries, 0 to 57920
Data columns (total 16 columns):
_id               57921 non-null object
level_0           57921 non-null int64
index             57921 non-null int64
abstract          57921 non-null object
authors           57921 non-null object
journal           57921 non-null object
license           57921 non-null object
publish_time      57921 non-null datetime64[ns]
title             57921 non-null object
language          57921 non-null object
word_count        57921 non-null int64
char_count        57921 non-null int64
sent_count        57921 non-null int64
avg_word_len      57921 non-null float64
stopwords         57921 non-null int64
cleanAbtstract    57921 non-null object
dtypes: datetime64[ns](1), float64(1), int64(6), object(8)
memory usage: 7.1+ MB


In [5]:
def preprocess(ReviewText):
    ReviewText = ReviewText.str.replace("(<br/>)", "")
    ReviewText = ReviewText.str.replace('(<a).*(>).*(</a>)', '')
    ReviewText = ReviewText.str.replace('(&amp)', '')
    ReviewText = ReviewText.str.replace('(&gt)', '')
    ReviewText = ReviewText.str.replace('(&lt)', '')
    ReviewText = ReviewText.str.replace('(\xa0)', ' ')  
    return ReviewText

## Continuous sentiment analyses using rule-based model trained on a pattern library

### Continuous Sentiment Analyses - Cleaned/Preprocessed Abstract

In [6]:
df_1b = df_1[['cleanAbtstract']]
df_1b.rename(columns={ df_1b.columns[0]: "AbstractCleaned" }, inplace = True)
df_1b = df_1b.dropna()
df_1b = df_1b.reset_index(drop=True)
df_1b.head()

Unnamed: 0,AbstractCleaned
0,"[background, anxieti, depress, common, symptom..."
1,"[counterregulatori, arm, renin, angiotensin, s..."
2,"[sever, studi, suggest, baricitinib, potenti, ..."
3,"[background, aim, healthcar, deliveri, requir,..."
4,"[coronavirus, disea, covid, present, two, urge..."


In [7]:
def convert_list_to_string(list, seperator=' '):
    return seperator.join(list)

df_1b['String'] = df_1b['AbstractCleaned'].apply(lambda row: convert_list_to_string(row))
df_1b.head()

Unnamed: 0,AbstractCleaned,String
0,"[background, anxieti, depress, common, symptom...",background anxieti depress common symptom pati...
1,"[counterregulatori, arm, renin, angiotensin, s...",counterregulatori arm renin angiotensin system...
2,"[sever, studi, suggest, baricitinib, potenti, ...",sever studi suggest baricitinib potenti drug m...
3,"[background, aim, healthcar, deliveri, requir,...",background aim healthcar deliveri requir suppo...
4,"[coronavirus, disea, covid, present, two, urge...",coronavirus disea covid present two urgent hea...


In [8]:
df_1b['Abstract2'] = preprocess(df_1b['String'])

df_1b['polarity'] = df_1b['Abstract2'].map(lambda text: TextBlob(text).sentiment.polarity) #calculate sentiment polarity which lies in the range of [-1,1] where 1 means positive sentiment and -1 means a negative sentiment.
df_1b['len'] = df_1b['Abstract2'].astype(str).apply(len) #Create new feature
df_1b['word_count'] = df_1b['Abstract2'].apply(lambda x: len(str(x).split())) #Create new feature
df_1b.head()

Unnamed: 0,AbstractCleaned,String,Abstract2,polarity,len,word_count
0,"[background, anxieti, depress, common, symptom...",background anxieti depress common symptom pati...,background anxieti depress common symptom pati...,-0.089286,1002,168
1,"[counterregulatori, arm, renin, angiotensin, s...",counterregulatori arm renin angiotensin system...,counterregulatori arm renin angiotensin system...,-0.021429,1261,193
2,"[sever, studi, suggest, baricitinib, potenti, ...",sever studi suggest baricitinib potenti drug m...,sever studi suggest baricitinib potenti drug m...,0.45,449,63
3,"[background, aim, healthcar, deliveri, requir,...",background aim healthcar deliveri requir suppo...,background aim healthcar deliveri requir suppo...,0.070606,973,145
4,"[coronavirus, disea, covid, present, two, urge...",coronavirus disea covid present two urgent hea...,coronavirus disea covid present two urgent hea...,0.05,239,31


In [9]:
print('3 random articles with the relatively high positive sentiment polarity: \n')
cl = df_1b.loc[df_1b.polarity >= 0.6, ['String']].sample(1).values
for c in cl:
    print(c[0])

3 random articles with the relatively high positive sentiment polarity: 

use advanc learn technolog learn manag system lm great assist learn process especi use univ environ promot develop selfregul learn increa academ perform student satisfact toward person learn one innov resourc lm may intellig person assist ipa work sampl thirdgrad student follow health scienc degr aim verifi whether signif differ student access lm depend use versus nonus ipa verifi whether signif differ student learn outcom depend use versus nonus ipa verifi whether signif differ student satisfact teach covid pandem depend use versus nonus ipa analyz student percept use ipa lm found greater function access lm satisfact teach especi health crisi group student use ipa howev expan avail inform usabl featur emb ipa still challeng issu


In [10]:
print('3 random articles with the most neutral sentiment(zero) polarity: \n')
cl = df_1b.loc[df_1b.polarity == 0.0, ['String']].sample(3).values
for c in cl:
    print(c[0])

3 random articles with the most neutral sentiment(zero) polarity: 

covid present herculean challeng research scientif communiti produc diagnost treatment solut return normalci requir rapid develop countermeasur anim model serv critic tool test vaccin therapeut anim disea status potenti covid exposur prior studi execut may sever bias efficaci test develop toolbox immunolog molecular test monitor countermeasur impact disea outcom evalu prechalleng covid status assay applic show critic necess anim prescreen specif realtim pcr result document preexposur african green monkey prior sarscov challeng sequenc confirm communityacquir exposur longitudin monitor nasopharyng swab serum show preexposur impact viral disea cour result immunolog respon studi demonstr util comprehen prescreen strategi anim model captur first document case communityacquir nonhuman primat infect one sentenc summari preexposur sarscov affect biomark respon anim model highlight need robust prescreen protocol prior medic co

In [11]:
print('3 articles with the most negative polarity: \n')
cl = df_1b.loc[df_1b.polarity <= 0.8, ['String']].sample(3).values
for c in cl:
    print(c[0])

3 articles with the most negative polarity: 

report first time therapyresist hypernatremia plasma sodium concentr mmol per liter develop critic ill coronavirus disea covid patient age year requir mechan ventil correl plasma sodium concentr sodium input plasma concentr chlorid elev potassium decrea find consist abnorm increa renal sodium reabsorpt possibl caus increa angiotensin ii activ secondari sever acut respiratori syndrom coronavirus sarscovinduc downregul angiotensinconvert enzym ace receptor hypernatremia associ increa length inten care unit stay special attent paid electrolyt status covid patient
recent coronavirus disea covid event present challeng health care system worldwid air medic movement individu potenti infecti disea pose uniqu challeng threat crew receiv personnel u depart health human servic air medic evacu team nation disast medic system direct support flight move individu infect control precaut focus sourc engin control person protect equip safe work practic limit

In [12]:
df_1b['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

### Continuous Sentiment Analyses - Semi-Cleaned Abstract

In [13]:
df_1c = df_1[['abstract']]
df_1c = df_1c.dropna()
df_1c = df_1c.reset_index(drop=True)
df_1c.head()

Unnamed: 0,abstract
0,background anxiety depression common symptoms ...
1,counterregulatory arm renin angiotensin system...
2,several studies suggested baricitinib potentia...
3,background aims healthcare delivery requires s...
4,coronavirus disease covid19 presents two urgen...


In [14]:
df_1c['Abstract2'] = preprocess(df_1c['abstract'])

df_1c['polarity'] = df_1c['Abstract2'].map(lambda text: TextBlob(text).sentiment.polarity) #calculate sentiment polarity which lies in the range of [-1,1] where 1 means positive sentiment and -1 means a negative sentiment.
df_1c['len'] = df_1c['Abstract2'].astype(str).apply(len) #Create new feature
df_1c['word_count'] = df_1c['Abstract2'].apply(lambda x: len(str(x).split())) #Create new feature
df_1c.head()

Unnamed: 0,abstract,Abstract2,polarity,len,word_count
0,background anxiety depression common symptoms ...,background anxiety depression common symptoms ...,-0.090909,1243,178
1,counterregulatory arm renin angiotensin system...,counterregulatory arm renin angiotensin system...,0.179167,1672,215
2,several studies suggested baricitinib potentia...,several studies suggested baricitinib potentia...,0.22,546,67
3,background aims healthcare delivery requires s...,background aims healthcare delivery requires s...,0.052803,1170,145
4,coronavirus disease covid19 presents two urgen...,coronavirus disease covid19 presents two urgen...,0.233333,273,31


In [15]:
print('3 random articles with the relatively high positive sentiment polarity: \n')
cl = df_1c.loc[df_1c.polarity >= 0.6, ['abstract']].sample(1).values
for c in cl:
    print(c[0])

3 random articles with the relatively high positive sentiment polarity: 

article contains series analyses done sarscov2 outbreak rio grande sul rs south brazil analyses focused highincidence cities state capital porto alegre state level provide methodological details estimates effective reproduction number r_t joint analysis mobility data together estimated r_t well icu simulations icu los length stay estimation hospitalizations porto alegrers


In [16]:
print('3 random articles with the most neutral sentiment(zero) polarity: \n')
cl = df_1c.loc[df_1c.polarity == 0.0, ['abstract']].sample(3).values
for c in cl:
    print(c[0])

3 random articles with the most neutral sentiment(zero) polarity: 

outbreak sarscov2 virus causing loss lives property world 2 1 million cases covid19 death 1 2 lakh patients worldwide numbers still rising virus spreads rapidly droplets coming nose mouth infected person sandoiu sarscov2 spread easily medical news today 2020 httpswww medicalnewstoday comarticleswhydoessarscov2spreadsoeasily situation proper quarantining monitoring already infected patients essential cases patients need transferred different locations ambulances monitoring ambulances traffic police help ensure distancing faster movement vehicle inside city paper presents development realtime global positioning systembased tracking app ambulances carrying covid19 patients would help traffic police ensure distancing patients public
paper develops downandout call option model introducing structural break volatility capture coronavirus covid19 outbreak life insurers equity boards utility evaluated optimal guaranteed rate eq

In [17]:
print('3 articles with the most negative polarity: \n')
cl = df_1c.loc[df_1c.polarity <= 0.8, ['abstract']].sample(3).values
for c in cl:
    print(c[0])

3 articles with the most negative polarity: 

exported cases 2019 novel coronavirus covid19 infection confirmed outside china provide opportunity estimate cumulative incidence confirmed case fatality risk ccfr mainland china knowledge ccfr critical characterize severity understand pandemic potential covid19 early stage epidemic using exponential growth rate incidence present study statistically estimated ccfr basic reproduction numberthe average number secondary cases generated single primary case naïve population modeled epidemic growth either single index case illness onset 8 december 2019 scenario 1 using growth rate fitted along parameters scenario 2 based data 20 exported cases reported 24 january 2020 cumulative incidence china 24 january estimated 6924 cases 95 confidence interval ci 4885 9211 19289 cases 95 ci 10901 30158 respectively latest estimated values ccfr 53 95 ci 35 75 scenario 1 84 95 ci 53 123 scenario 2 basic reproduction number estimated 21 95 ci 20 22 32 95 ci 27 

In [18]:
df_1c['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

### Continuous Sentiment Analyses - Titles

In [19]:
df_1c = df_1[['title']]
df_1c = df_1c.dropna()
df_1c = df_1c.reset_index(drop=True)
df_1c.head()

Unnamed: 0,title
0,the 24-form tai chi improves anxiety and depre...
1,relationship between circulating levels of ang...
2,baricitinib - a januase kinase inhibitor - not...
3,artificial intelligence (ai) applications for ...
4,coronavirus disease: challenges for psychiatry


In [20]:
df_1c['title2'] = preprocess(df_1c['title'])

df_1c['polarity'] = df_1c['title2'].map(lambda text: TextBlob(text).sentiment.polarity) #calculate sentiment polarity which lies in the range of [-1,1] where 1 means positive sentiment and -1 means a negative sentiment.
df_1c['len'] = df_1c['title2'].astype(str).apply(len) #Create new feature
df_1c['word_count'] = df_1c['title2'].apply(lambda x: len(str(x).split())) #Create new feature
df_1c.head()

Unnamed: 0,title,title2,polarity,len,word_count
0,the 24-form tai chi improves anxiety and depre...,the 24-form tai chi improves anxiety and depre...,0.0,153,20
1,relationship between circulating levels of ang...,relationship between circulating levels of ang...,0.0,128,13
2,baricitinib - a januase kinase inhibitor - not...,baricitinib - a januase kinase inhibitor - not...,0.9,89,16
3,artificial intelligence (ai) applications for ...,artificial intelligence (ai) applications for ...,-0.6,63,7
4,coronavirus disease: challenges for psychiatry,coronavirus disease: challenges for psychiatry,0.0,46,5


In [21]:
print('3 random titles with the relatively high positive sentiment polarity: \n')
cl = df_1c.loc[df_1c.polarity >= 0.6, ['title']].sample(3).values
for c in cl:
    print(c[0])

3 random titles with the relatively high positive sentiment polarity: 

analysis of adjunctive serological detection to nucleic acid test for severe acute respiratory syndrome coronavirus 2 (sars-cov-2) infection diagnosis
role of comorbidities like diabetes on severe acute respiratory syndrome coronavirus-2: a review
impact of the covid-19 outbreak on acute stroke care


In [22]:
print('3 random titles with the most neutral sentiment(zero) polarity: \n')
cl = df_1c.loc[df_1c.polarity == 0.0, ['title']].sample(3).values
for c in cl:
    print(c[0])

3 random titles with the most neutral sentiment(zero) polarity: 

multicenter point-prevalence evaluation of the utilization and safety of drug therapies for covid-19
haemodynamic monitoring and management in covid-19 intensive care patients: an international survey
overview of digital health surveillance system during covid-19 pandemic: public health issues and misapprehensions


In [23]:
print('3 titles with the most negative polarity: \n')
cl = df_1c.loc[df_1c.polarity <= 0.8, ['title']].sample(3).values
for c in cl:
    print(c[0])

3 titles with the most negative polarity: 

a critical appraisal of the acs "medically-necessary, time-sensitive procedures" (ments) scoring system, urology consensus recommendations, and individual surgeon case prioritization for resumption of elective urologic surgery during the covid-19 pandemic
a case report of covid-19 with false negative rt-pcr test: necessity of chest ct
provision of pediatric immunization services during the covid-19 pandemic: an assessment of capacity among pediatric immunization providers participating in the vaccines for children program - united states, may 2020.


In [24]:
df_1c['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')