# Initial Sentiment Analyses

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import pymongo
from pymongo import MongoClient
import re

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.stem.snowball import SnowballStemmer

import itertools
from langdetect import detect
#import seaborn as sns

#!pip install textblob
from textblob import TextBlob

#!pip install plotly==4.9.0
#!pip install cufflinks
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

from textblob.sentiments import NaiveBayesAnalyzer
#nltk.download('movie_reviews')

from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Import Data

In [3]:
client = pymongo.MongoClient("mongodb+srv://group3:group3psu!@squid.36jsw.mongodb.net/CORD19?retryWrites=true&w=majority")
db = client.CORD19
db.list_collection_names()

['scratch', 'clusterFiftyTen', 'preprocess', 'umap']

In [6]:
collection_clean = db.preprocess
mongo_df_clean = pd.DataFrame(list(collection_clean.find()))
df_1 = mongo_df_clean 
df_1.info()

KeyboardInterrupt: 

In [5]:
def preprocess(ReviewText):
    ReviewText = ReviewText.str.replace("(<br/>)", "")
    ReviewText = ReviewText.str.replace('(<a).*(>).*(</a>)', '')
    ReviewText = ReviewText.str.replace('(&amp)', '')
    ReviewText = ReviewText.str.replace('(&gt)', '')
    ReviewText = ReviewText.str.replace('(&lt)', '')
    ReviewText = ReviewText.str.replace('(\xa0)', ' ')  
    return ReviewText

## Continuous sentiment analyses using rule-based model trained on a pattern library

### Continuous Sentiment Analyses - Cleaned/Preprocessed Abstract

In [None]:
df_1b = df_1[['_id', 'cleanAbtstract', 'publish_time']]
df_1b = df_1b.dropna()
df_1b = df_1b.reset_index(drop=True)
df_1b.head()

In [None]:
def convert_list_to_string(list, seperator=' '):
    return seperator.join(list)

df_1b['String'] = df_1b['cleanAbtstract'].apply(lambda row: convert_list_to_string(row))
df_1b.head()

In [None]:
df_1b['Abstract2'] = preprocess(df_1b['String'])

df_1b['polarity'] = df_1b['Abstract2'].map(lambda text: TextBlob(text).sentiment.polarity) #calculate sentiment polarity which lies in the range of [-1,1] where 1 means positive sentiment and -1 means a negative sentiment.
df_1b['len'] = df_1b['Abstract2'].astype(str).apply(len) #Create new feature
df_1b['word_count'] = df_1b['Abstract2'].apply(lambda x: len(str(x).split())) #Create new feature
df_1b.head()

In [None]:
print('3 random articles with the relatively high positive sentiment polarity: \n')
cl = df_1b.loc[df_1b.polarity >= 0.6, ['String']].sample(1).values
for c in cl:
    print(c[0])

In [None]:
print('3 random articles with the most neutral sentiment(zero) polarity: \n')
cl = df_1b.loc[df_1b.polarity == 0.0, ['String']].sample(3).values
for c in cl:
    print(c[0])

In [None]:
print('3 articles with the most negative polarity: \n')
cl = df_1b.loc[df_1b.polarity <= 0.8, ['String']].sample(3).values
for c in cl:
    print(c[0])

In [None]:
df_1b['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

## Sentiment over time (Monthly)

In [None]:
#monthly increment by sentiment
df_2 = df_1b
df_2['yearMonth'] = df_2['publish_time'].astype(str).str[0:7]
order = df_2.yearMonth.unique()
order.sort()
df_index = pd.DataFrame(order, columns = ["yearMonth"])
df_index['order'] = range(1, len(df_index) + 1)
df_2 = df_2.merge(df_index, how='left', on = "yearMonth")

In [None]:
#plot 

df_2.plot(x='order', y='sentimentPolarity', style='o')  
plt.title('Sentiment Over Time')  
plt.xlabel('Chronological Order by Month and Year (2019 - 2021)')  
plt.ylabel('Sentiment Rating')  
plt.show()

In [None]:
#Correlation - R-squared

x = df_2['order']
y = df_2['sentimentPolarity']

correlation_matrix = np.corrcoef(x, y)
correlation_xy = correlation_matrix[0,1]
r_squared = correlation_xy**2
print(r_squared)

## Push results to new field in Preprocess Collection

In [13]:
df_1b.head()

Unnamed: 0,_id,cleanAbtstract,String,Abstract2,polarity,len,word_count
0,5f71105c4d4ac17de8212ba2,"[background, anxieti, depress, common, symptom...",background anxieti depress common symptom pati...,background anxieti depress common symptom pati...,-0.089286,1002,168
1,5f7110604d4ac17de8213220,"[counterregulatori, arm, renin, angiotensin, s...",counterregulatori arm renin angiotensin system...,counterregulatori arm renin angiotensin system...,-0.021429,1261,193
2,5f7110634d4ac17de8213975,"[sever, studi, suggest, baricitinib, potenti, ...",sever studi suggest baricitinib potenti drug m...,sever studi suggest baricitinib potenti drug m...,0.45,449,63
3,5f7110654d4ac17de821437e,"[background, aim, healthcar, deliveri, requir,...",background aim healthcar deliveri requir suppo...,background aim healthcar deliveri requir suppo...,0.070606,973,145
4,5f7110654d4ac17de82143fb,"[coronavirus, disea, covid, present, two, urge...",coronavirus disea covid present two urgent hea...,coronavirus disea covid present two urgent hea...,0.05,239,31


In [16]:
cursor = collection_clean.find()
 
for document in cursor:
    id = document["_id"]
    record = df_1b.loc[(df_1b['_id'] == id)]
    polarity = record["polarity"]
    sentimentPolarity = document.get("sentimentPolarity")
 
    if sentimentPolarity is None:
        collection_clean.update_one({"_id": id}, {"$set": {"sentimentPolarity": polarity.tolist()}})

In [17]:
collection_clean = db.preprocess
mongo_df_clean = pd.DataFrame(list(collection_clean.find()))
df_1 = mongo_df_clean 
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57921 entries, 0 to 57920
Data columns (total 20 columns):
_id                  57921 non-null object
level_0              57921 non-null int64
index                57921 non-null int64
abstract             57921 non-null object
authors              57921 non-null object
journal              57921 non-null object
license              57921 non-null object
publish_time         57921 non-null datetime64[ns]
title                57921 non-null object
language             57921 non-null object
word_count           57921 non-null int64
char_count           57921 non-null int64
sent_count           57921 non-null int64
avg_word_len         57921 non-null float64
stopwords            57921 non-null int64
cleanAbtstract       57921 non-null object
bert_abstract        57921 non-null object
abstract_tfidf       57921 non-null object
cluster              31803 non-null float64
sentimentPolarity    57921 non-null object
dtypes: datetime64[ns](1), 

In [18]:
df_1.head()

Unnamed: 0,_id,level_0,index,abstract,authors,journal,license,publish_time,title,language,word_count,char_count,sent_count,avg_word_len,stopwords,cleanAbtstract,bert_abstract,abstract_tfidf,cluster,sentimentPolarity
0,5f71105c4d4ac17de8212ba2,0,0,background anxiety depression common symptoms ...,"liu, jia; yu, ping; lv, wei; wang, xinxin",front physiol,cc-by,2020-03-11,the 24-form tai chi improves anxiety and depre...,en,307,1512,12,4.925081,122,"[background, anxieti, depress, common, symptom...","[-0.2850953936576843, -0.31552305817604065, 0....",background anxieti depress common symptom pati...,-1.0,[-0.08928571428571429]
1,5f7110604d4ac17de8213220,1,1,counterregulatory arm renin angiotensin system...,"zhou, xiaomin; zhang, ping; liang, tao; chen, ...",heart vessels,no-cc,2019-07-29,relationship between circulating levels of ang...,en,341,1973,11,5.785924,116,"[counterregulatori, arm, renin, angiotensin, s...","[-0.7387855052947998, -0.5667943358421326, 0.9...",counterregulatori arm renin angiotensin system...,-1.0,[-0.021428571428571432]
2,5f7110634d4ac17de8213975,2,2,several studies suggested baricitinib potentia...,"praveen, d.; chowdary, puvvada ranadheer; aana...",int j antimicrob agents,no-cc,2020-04-04,baricitinib - a januase kinase inhibitor - not...,en,107,594,6,5.551402,35,"[sever, studi, suggest, baricitinib, potenti, ...","[-0.49490848183631897, -0.631615936756134, 1.1...",sever studi suggest baricitinib potenti drug m...,-1.0,[0.45]
3,5f7110654d4ac17de821437e,3,3,background aims healthcare delivery requires s...,"vaishya, raju; javaid, mohd; khan, ibrahim hal...",diabetes metab syndr,no-cc,2020-04-14,artificial intelligence (ai) applications for ...,en,240,1320,11,5.5,95,"[background, aim, healthcar, deliveri, requir,...","[-0.8326570391654968, -0.1865071803331375, 1.1...",background aim healthcar deliveri requir suppo...,111.0,[0.0706060606060606]
4,5f7110654d4ac17de82143fb,4,4,coronavirus disease covid19 presents two urgen...,"kelly, brendan d.",,cc-by,2020-04-15,coronavirus disease: challenges for psychiatry,en,51,321,3,6.294118,20,"[coronavirus, disea, covid, present, two, urge...","[-0.32281801104545593, -0.3492285907268524, 1....",disea present two urgent health problem ill ...,98.0,[0.04999999999999999]


In [19]:
#Check 
id = "5f71105c4d4ac17de8212ba2"
record = df_1b.loc[(df_1b['_id'] == id)]
polarity = record["polarity"]
polarity

0   -0.089286
Name: polarity, dtype: float64

In [None]:
#df_2['sentimentPolarity'] = df_2.sentimentPolarity.map(lambda x: x[0]) #Convert series back to float

### Continuous Sentiment Analyses - Semi-Cleaned Abstract

In [20]:
df_1c = df_1[['abstract']]
df_1c = df_1c.dropna()
df_1c = df_1c.reset_index(drop=True)
df_1c.head()

Unnamed: 0,abstract
0,background anxiety depression common symptoms ...
1,counterregulatory arm renin angiotensin system...
2,several studies suggested baricitinib potentia...
3,background aims healthcare delivery requires s...
4,coronavirus disease covid19 presents two urgen...


In [21]:
df_1c['Abstract2'] = preprocess(df_1c['abstract'])

df_1c['polarity'] = df_1c['Abstract2'].map(lambda text: TextBlob(text).sentiment.polarity) #calculate sentiment polarity which lies in the range of [-1,1] where 1 means positive sentiment and -1 means a negative sentiment.
df_1c['len'] = df_1c['Abstract2'].astype(str).apply(len) #Create new feature
df_1c['word_count'] = df_1c['Abstract2'].apply(lambda x: len(str(x).split())) #Create new feature
df_1c.head()

Unnamed: 0,abstract,Abstract2,polarity,len,word_count
0,background anxiety depression common symptoms ...,background anxiety depression common symptoms ...,-0.090909,1243,178
1,counterregulatory arm renin angiotensin system...,counterregulatory arm renin angiotensin system...,0.179167,1672,215
2,several studies suggested baricitinib potentia...,several studies suggested baricitinib potentia...,0.22,546,67
3,background aims healthcare delivery requires s...,background aims healthcare delivery requires s...,0.052803,1170,145
4,coronavirus disease covid19 presents two urgen...,coronavirus disease covid19 presents two urgen...,0.233333,273,31


In [22]:
print('3 random articles with the relatively high positive sentiment polarity: \n')
cl = df_1c.loc[df_1c.polarity >= 0.6, ['abstract']].sample(1).values
for c in cl:
    print(c[0])

3 random articles with the relatively high positive sentiment polarity: 

read great interest comment suarezperez et al article1 share concerns regarding need cautious interpretation antiphospholipid antibodies apla positivity patients coronary virus disease 2019 covid19 herein would like add insights discussion


In [23]:
print('3 random articles with the most neutral sentiment(zero) polarity: \n')
cl = df_1c.loc[df_1c.polarity == 0.0, ['abstract']].sample(3).values
for c in cl:
    print(c[0])

3 random articles with the most neutral sentiment(zero) polarity: 

since december 2019 coronavirus disease 2019 covid19 caused sarscov2 spread lot countries worldwide12 jan 30 2020 world health organization declared outbreak covid19 public health emergency international concern march 11 2020 spread covid19 declared pandemic article protected copyright rights reserved
appreciate gao et al interest recent manuscript association elevated liver biochemistries severe covid19 infection would like address comments elucidate interpretation findings
covid19 pandemic hit world starting december 2019 recent studies international statistics shown increased prevalence morbidity well mortality disease male patients compared female patients aim brief communication describe pathophysiology sexdiscrepancy based infectivity mechanism coronavirus including angiotensinconverting enzyme 2 ace2 type ii transmembrane serine protease tmprss2 androgen receptor could help understand susceptibility urological p

In [24]:
print('3 articles with the most negative polarity: \n')
cl = df_1c.loc[df_1c.polarity <= 0.8, ['abstract']].sample(3).values
for c in cl:
    print(c[0])

3 articles with the most negative polarity: 

mask wearing integral reducing spread sarscov2 information prevalence face mask usage required model disease spread improve compliance mask usage targeted messaging sought 1 estimate prevalence mask usage populous county vermont chittenden county25 state population 2 assess effect age sex mask use monitored entrances eight different business types visually assessed individuals age gender mask use distance collected 1004 observations 16 may 30 may 2020 businesses began reopen following extended statewide lock analyzed data using bayesian random effects logistic regression model found overall 755 individuals used mask significant effects age gender mask usage females likely wear masks males 838 n488 vs 676 n516 mask usage respectively odds male wearing mask 53 female odds across age groups elderly likely wear mask 914 n209 followed young adults 748 n246 middleaged adults 707 n519 children 533 n30 odds elderly person wearing mask 167 times chi

In [25]:
df_1c['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

### Continuous Sentiment Analyses - Titles

In [26]:
df_1c = df_1[['title']]
df_1c = df_1c.dropna()
df_1c = df_1c.reset_index(drop=True)
df_1c.head()

Unnamed: 0,title
0,the 24-form tai chi improves anxiety and depre...
1,relationship between circulating levels of ang...
2,baricitinib - a januase kinase inhibitor - not...
3,artificial intelligence (ai) applications for ...
4,coronavirus disease: challenges for psychiatry


In [27]:
df_1c['title2'] = preprocess(df_1c['title'])

df_1c['polarity'] = df_1c['title2'].map(lambda text: TextBlob(text).sentiment.polarity) #calculate sentiment polarity which lies in the range of [-1,1] where 1 means positive sentiment and -1 means a negative sentiment.
df_1c['len'] = df_1c['title2'].astype(str).apply(len) #Create new feature
df_1c['word_count'] = df_1c['title2'].apply(lambda x: len(str(x).split())) #Create new feature
df_1c.head()

Unnamed: 0,title,title2,polarity,len,word_count
0,the 24-form tai chi improves anxiety and depre...,the 24-form tai chi improves anxiety and depre...,0.0,153,20
1,relationship between circulating levels of ang...,relationship between circulating levels of ang...,0.0,128,13
2,baricitinib - a januase kinase inhibitor - not...,baricitinib - a januase kinase inhibitor - not...,0.9,89,16
3,artificial intelligence (ai) applications for ...,artificial intelligence (ai) applications for ...,-0.6,63,7
4,coronavirus disease: challenges for psychiatry,coronavirus disease: challenges for psychiatry,0.0,46,5


In [28]:
print('3 random titles with the relatively high positive sentiment polarity: \n')
cl = df_1c.loc[df_1c.polarity >= 0.6, ['title']].sample(3).values
for c in cl:
    print(c[0])

3 random titles with the relatively high positive sentiment polarity: 

management of acute kidney injury in covid-19
risk factors associated with acute respiratory distress syndrome and death in patients with coronavirus disease 2019 pneumonia in wuhan, china
covid-19: unravelling the clinical progression of nature's virtually perfect biological weapon.


In [29]:
print('3 random titles with the most neutral sentiment(zero) polarity: \n')
cl = df_1c.loc[df_1c.polarity == 0.0, ['title']].sample(3).values
for c in cl:
    print(c[0])

3 random titles with the most neutral sentiment(zero) polarity: 

european society of trauma and emergency surgery (estes) recommendations for trauma and emergency surgery preparation during times of covid-19 infection
renal infarct in a covid-19-positive kidney-pancreas transplant recipient
virus-ckb: an integrated bioinformatics platform and analysis resource for covid-19 research


In [30]:
print('3 titles with the most negative polarity: \n')
cl = df_1c.loc[df_1c.polarity <= 0.8, ['title']].sample(3).values
for c in cl:
    print(c[0])

3 titles with the most negative polarity: 

inhibition of sars-cov-2 (previously 2019-ncov) infection by a highly potent pan-coronavirus fusion inhibitor targeting its spike protein that harbors a high capacity to mediate membrane fusion
aerosol-generating otolaryngology procedures and the need for enhanced ppe during the covid-19 pandemic: a literature review
[expert consensus on chloroquine phosphate for the treatment of novel coronavirus pneumonia].


In [31]:
df_1c['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')