# Getting the required libraries

In [1]:
import pandas as pd
!pip install --upgrade tensorflow
!pip install torch===1.5.0 torchvision===0.6.0 -f https://download.pytorch.org/whl/torch_stable.html

Requirement already up-to-date: tensorflow in /usr/local/lib/python3.6/dist-packages (2.2.0)
Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [2]:
!pip install transformers



# Importing extracted tweets

In [3]:
tweets = pd.read_csv('/content/drive/My Drive/CoronaWhy/Inferred datasets/tweets_articles_vaccine_covid2019.csv')
tweets

Unnamed: 0.1,Unnamed: 0,drug,tweet_id,user_id,username,text
0,0,Research and development on therapeutic agents...,1257454089308422146,1119344077613481984,Changet59700909,RT @science_cited: @themessengerdrg Moderan SA...
1,1,Research and development on therapeutic agents...,1257321781779509250,52349626,StabiHH,Update #covid19: Report der American Chemical ...
2,2,Research and development on therapeutic agents...,1257006164644114432,953849474065096704,science_cited,@themessengerdrg SARS-CoV-2 Spike protein rece...
3,3,Research and development on therapeutic agents...,1257006098608992258,953849474065096704,science_cited,@themessengerdrg Moderan SARS Vaccine Patent\n...
4,4,Preliminary identification of potential vaccin...,1258330189228134402,20509756,Pediatria,Preliminary Identification of Potential Vaccin...
...,...,...,...,...,...,...
2665,2665,Evidence that vitamin D supplementation could ...,1256805837542010883,1466841072,trombson,@CAgovernor Evidence that vitamin D supplement...
2666,2666,Evidence that vitamin D supplementation could ...,1256714640127668226,120893104,Sarah_Roberts_1,@PaulSchmehl I forgot to give you this link to...
2667,2667,Epidemiologic features and clinical course of ...,1258251463559532544,718066634334846976,sangmotiani,RT @thelonevirologi: Epidemiologic Features an...
2668,2668,Epidemiologic features and clinical course of ...,1256937040631025664,1214232500664295424,PandemicCovid20,RT @thelonevirologi: Epidemiologic Features an...


# Data Cleaning

In [4]:
# converting the column into a text file
tweets_list = tweets.text

tweets_text = (" ").join(tweets_list)
tweets_text



In [0]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_tweets_text_round1(tweets_text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    tweets_text = tweets_text.lower()
    tweets_text = re.sub('\[.*?\]', '', tweets_text)
    tweets_text = re.sub('[%s]' % re.escape(string.punctuation), '', tweets_text)
    tweets_text = re.sub('\w*\d\w*', '', tweets_text)
    return tweets_text

round1 = lambda x: clean_tweets_text_round1(x)

In [6]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(tweets.text.apply(round1))
data_clean

Unnamed: 0,text
0,rt sciencecited themessengerdrg moderan sars v...
1,update report der american chemical society s...
2,themessengerdrg spike protein receptor bindin...
3,themessengerdrg moderan sars vaccine patent\nh...
4,preliminary identification of potential vaccin...
...,...
2665,cagovernor evidence that vitamin d supplementa...
2666,paulschmehl i forgot to give you this link to ...
2667,rt thelonevirologi epidemiologic features and ...
2668,rt thelonevirologi epidemiologic features and ...


In [0]:
# Apply a second round of cleaning
def clean_tweets_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_tweets_text_round2(x)

In [8]:
# Let's take a look at the updated text
tweets_cleaned = pd.DataFrame(data_clean.text.apply(round2))
tweets_cleaned

Unnamed: 0,text
0,rt sciencecited themessengerdrg moderan sars v...
1,update report der american chemical society s...
2,themessengerdrg spike protein receptor bindin...
3,themessengerdrg moderan sars vaccine patenthtt...
4,preliminary identification of potential vaccin...
...,...
2665,cagovernor evidence that vitamin d supplementa...
2666,paulschmehl i forgot to give you this link to ...
2667,rt thelonevirologi epidemiologic features and ...
2668,rt thelonevirologi epidemiologic features and ...


# Setting up 'Transformers' & sentiment analysis of the tweets

In [9]:
from transformers import pipeline

# Allocate a pipeline for sentiment-analysis
nlp = pipeline('sentiment-analysis')
nlp('We are very happy to include pipeline into the transformers repository.')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




[{'label': 'POSITIVE', 'score': 0.9978193640708923}]

In [10]:
nlp('Preliminary Identification of Potential Vaccine Targets for the COVID-19 Coronavirus (SARS-CoV-2) Based on SARS-CoV Immunological Studies. - PubMed - NCBI https://t.co/4ieKve3SH9')

[{'label': 'NEGATIVE', 'score': 0.9814944267272949}]

In [12]:
tweets_cleaned['sentiment'] = tweets_cleaned['text'].apply(nlp)
tweets_cleaned

Unnamed: 0,text,sentiment
0,rt sciencecited themessengerdrg moderan sars v...,"[{'label': 'NEGATIVE', 'score': 0.993150353431..."
1,update report der american chemical society s...,"[{'label': 'NEGATIVE', 'score': 0.966924309730..."
2,themessengerdrg spike protein receptor bindin...,"[{'label': 'NEGATIVE', 'score': 0.916745424270..."
3,themessengerdrg moderan sars vaccine patenthtt...,"[{'label': 'NEGATIVE', 'score': 0.993583202362..."
4,preliminary identification of potential vaccin...,"[{'label': 'NEGATIVE', 'score': 0.962183773517..."
...,...,...
2665,cagovernor evidence that vitamin d supplementa...,"[{'label': 'NEGATIVE', 'score': 0.976552367210..."
2666,paulschmehl i forgot to give you this link to ...,"[{'label': 'NEGATIVE', 'score': 0.982065141201..."
2667,rt thelonevirologi epidemiologic features and ...,"[{'label': 'POSITIVE', 'score': 0.801105737686..."
2668,rt thelonevirologi epidemiologic features and ...,"[{'label': 'POSITIVE', 'score': 0.801105737686..."


In [0]:
sentiment = pd.DataFrame(tweets_cleaned['sentiment'])
sentiment.head()

sentiment.to_csv('/content/sentiment.csv')

In [23]:
df = pd.read_csv("/content/sentiment.csv")
df = df.iloc[:,1:]
df["label"] = df["sentiment"].str.extract(r'([A-Z]\w+ *)')
df["score"] = df["sentiment"].str.extract(r'(\d.\d+ *)')
df = df[["label","score"]]
df

Unnamed: 0,label,score
0,NEGATIVE,0.9931503534317017
1,NEGATIVE,0.9669243097305298
2,NEGATIVE,0.9167454242706299
3,NEGATIVE,0.9935832023620605
4,NEGATIVE,0.9621837735176086
...,...,...
2665,NEGATIVE,0.9765523672103882
2666,NEGATIVE,0.9820651412010193
2667,POSITIVE,0.8011057376861572
2668,POSITIVE,0.8011057376861572


In [25]:
tweets_cleaned = pd.DataFrame(tweets_cleaned['text'])

sentiment_analysis = tweets_cleaned.join(df, how='left')
sentiment_analysis

Unnamed: 0,text,label,score
0,rt sciencecited themessengerdrg moderan sars v...,NEGATIVE,0.9931503534317017
1,update report der american chemical society s...,NEGATIVE,0.9669243097305298
2,themessengerdrg spike protein receptor bindin...,NEGATIVE,0.9167454242706299
3,themessengerdrg moderan sars vaccine patenthtt...,NEGATIVE,0.9935832023620605
4,preliminary identification of potential vaccin...,NEGATIVE,0.9621837735176086
...,...,...,...
2665,cagovernor evidence that vitamin d supplementa...,NEGATIVE,0.9765523672103882
2666,paulschmehl i forgot to give you this link to ...,NEGATIVE,0.9820651412010193
2667,rt thelonevirologi epidemiologic features and ...,POSITIVE,0.8011057376861572
2668,rt thelonevirologi epidemiologic features and ...,POSITIVE,0.8011057376861572


In [26]:
sentiment_analysis.insert(0, 'New_ID', range(1, 1 + len(sentiment_analysis)))
sentiment_analysis

Unnamed: 0,New_ID,text,label,score
0,1,rt sciencecited themessengerdrg moderan sars v...,NEGATIVE,0.9931503534317017
1,2,update report der american chemical society s...,NEGATIVE,0.9669243097305298
2,3,themessengerdrg spike protein receptor bindin...,NEGATIVE,0.9167454242706299
3,4,themessengerdrg moderan sars vaccine patenthtt...,NEGATIVE,0.9935832023620605
4,5,preliminary identification of potential vaccin...,NEGATIVE,0.9621837735176086
...,...,...,...,...
2665,2666,cagovernor evidence that vitamin d supplementa...,NEGATIVE,0.9765523672103882
2666,2667,paulschmehl i forgot to give you this link to ...,NEGATIVE,0.9820651412010193
2667,2668,rt thelonevirologi epidemiologic features and ...,POSITIVE,0.8011057376861572
2668,2669,rt thelonevirologi epidemiologic features and ...,POSITIVE,0.8011057376861572
