## Interpreting Negative Sentiments towards HydroxyChloroquine

### Importing Important Libraries

In [1]:
from comcrawl import IndexClient
import pandas as pd
from urllib.request import Request, urlopen
import nltk
# nltk.download('stopwords'); nltk.download('vader_lexicon')  # download only if not present
import bs4 as bs
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

### Crawler class uses the comcrawl API
This finds all the links recorded within a specific domain.

In [2]:
class Crawler:
    """Search Common Crawl for a site, cache the results as CSV and filter them.

    ``key_points`` is the configuration dict shared by the whole notebook.
    This class reads the keys ``key_index``, ``website``, ``dataframe_path``,
    ``search_word`` and ``threads`` from it, and writes the keys
    ``url_with_key_word`` and ``html_with_key_word`` back into it.
    """

    def __init__(self, key_points):
        """Keep a reference to the shared configuration dict."""
        self.key_points = key_points
        print("Crawler Imported")

    def client_search(self, threads=4):
        """Search the configured indexes for the configured site and download the hits.

        Args:
            threads: Number of threads passed to ``client.search``.

        Returns:
            The ``IndexClient`` after ``search`` and ``download`` have run.
        """
        print("Initiating Client search")
        client = IndexClient(self.key_points['key_index'])
        # Honour the caller's thread count; it used to be hard-coded to 6,
        # which silently ignored the ``threads`` argument.
        client.search(self.key_points["website"], threads=threads)
        client.download()
        return client

    def client_to_df(self, client):
        """Write ``client.results`` to ``key_points["dataframe_path"]`` as CSV.

        When the results carry ``timestamp`` and ``urlkey`` columns, only the
        latest capture of each ``urlkey`` is kept. The previous code computed
        that de-duplicated set but then wrote the raw results instead.
        """
        print("Exporting Dataset/Downloaded file")
        results = pd.DataFrame(client.results)
        # An empty result set has no columns to sort or de-duplicate on.
        if {"timestamp", "urlkey"}.issubset(results.columns):
            results = results.sort_values(by="timestamp").drop_duplicates("urlkey", keep="last")
        results.to_csv(self.key_points["dataframe_path"])

    def read_df(self):
        """Load the CSV at ``key_points["dataframe_path"]`` into a DataFrame."""
        print("Reading File as a DataFrame")
        return pd.read_csv(self.key_points["dataframe_path"])

    def fetch_urls(self, database):
        """Store the rows whose URL / HTML contain ``key_points["search_word"]``.

        Side effects:
            * ``database["html"]`` has its missing values replaced by ``" "``.
            * ``key_points["url_with_key_word"]`` and
              ``key_points["html_with_key_word"]`` are set to the matching rows.

        Args:
            database: DataFrame with at least ``url`` and ``html`` columns.
        """
        search_word = self.key_points["search_word"]
        # na=False: a missing URL counts as "no match" instead of producing an
        # NA entry in the mask, which boolean indexing rejects.
        url_mask = database["url"].str.contains(search_word, na=False)
        self.key_points["url_with_key_word"] = database[url_mask]
        database["html"] = database["html"].fillna(" ")
        html_mask = database["html"].str.contains(search_word, na=False)
        self.key_points["html_with_key_word"] = database[html_mask]

    def main(self):
        """Run search -> CSV export -> reload -> keyword filtering.

        Returns:
            The DataFrame read back from the exported CSV.
        """
        print("Crawler Begins")
        client = self.client_search(self.key_points["threads"])
        self.client_to_df(client)
        database = self.read_df()
        # Populate the *_with_key_word entries that the downstream parsing
        # step reads; this call was missing from the pipeline before.
        self.fetch_urls(database)

        print("Crawler Work Completed!")

        return database



### WebPageReader Class using BeautifulSoup
Reads the title and paragraphs of the page behind each link.

It is also used for cleaning the text.

In [27]:
class WebPageReader:
    """Fetch web pages with urllib/BeautifulSoup and normalise raw text."""

    def __init__(self, key_points):
        """Keep a reference to the shared configuration dict."""
        self.key_points = key_points
        print("Web Page Reader Imported")

    def webpage_url(self, url):
        """Download ``url`` and parse it.

        Returns:
            ``(title, soup)`` where ``title`` is ``soup.title.string`` and
            ``soup`` is the parsed document, or the sentinel pair
            ``("Page Not valid", 0)`` when anything goes wrong (bad URL,
            network/HTTP error, parse failure, page without a ``<title>``).
        """
        try:
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            # Context manager closes the HTTP response even if read() fails.
            with urlopen(req) as response:
                webpage = response.read()
            soup = bs.BeautifulSoup(webpage, 'lxml')
            return soup.title.string, soup
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still stop a long crawl.
            return "Page Not valid", 0

    def clean_text(self, text):
        """Return ``text`` reduced to space-joined, stemmed, non-stop-word tokens.

        Steps, in order: drop ``&`` followed by any run of the characters
        ``@#$%&()0-9``, drop ``http``/``https`` URLs, drop ``<...>`` tags,
        replace punctuation with spaces, split on whitespace, remove English
        stop words, stem with ``PorterStemmer``, then remove stop words again
        from the stemmed tokens.
        """
        print("Cleaning Text")
        x = re.sub(r'&[@#$%&()0-9]*', r'', text)
        x = re.sub(r'https?\S+', r'', x)
        x = re.sub('<[^>]*>', '', x)

        x = x.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        tokens = x.split()
        ps = PorterStemmer()
        stop_words = set(stopwords.words('english'))
        tokens = [ps.stem(token) for token in tokens if token not in stop_words]
        # Second pass: a stemmed token may itself be a stop word.
        return ' '.join([token for token in tokens if token not in stop_words])
    
     

### FindHateSpeech Class
Uses NLTK's `SentimentIntensityAnalyzer` to find the polarity score of the text.

In [51]:
class FindHateSpeech:
    """Score text with NLTK's ``SentimentIntensityAnalyzer``.

    Despite the class name, ``polarity_score_finder`` only reports text whose
    scores are *positive* beyond ``key_points["positive_threshold"]``.
    """

    def __init__(self, key_points):
        """Keep the shared configuration dict and create the analyzer."""
        self.key_points = key_points
        print("Find Hate Speech Imported")
        self.sid = SentimentIntensityAnalyzer()

    def polarity_score_finder(self, text):
        """Return the polarity scores of ``text`` if it is positive enough.

        Args:
            text: Either an object exposing a ``.text`` attribute (for example
                a parsed paragraph) or a plain string. ``None`` is treated as
                "no text".

        Returns:
            The dict produced by ``polarity_scores`` when ``compound > 0`` and
            ``pos`` exceeds ``key_points["positive_threshold"]``; otherwise the
            sentinel string ``" "``.
        """
        if text is None:
            return " "
        # Fall back to the object itself when it has no ``.text`` attribute,
        # so plain strings are accepted as well.
        content = getattr(text, "text", text)
        # Score once and reuse the result (it used to be computed twice).
        score = self.sid.polarity_scores(str(content))
        if score['compound'] > 0 and score['pos'] > self.key_points["positive_threshold"]:
            return score
        return " "
    
    

### Main Class
Combining all classes to perform sentiment analysis on text and storing the links which provide a positive impact

In [56]:
class TLinternship:
    """Pipeline driver: crawl, read pages, score text, collect matching links.

    Attributes:
        key_points: Shared configuration dict.
        check_text: Maps a URL to the text (title or paragraph) to be scored.
        output: URLs whose text passed the sentiment filter.
    """

    def __init__(self, key_points):
        """Keep the shared configuration dict and start with empty results."""
        self.key_points = key_points
        print("Initiating Code!")
        self.check_text = {}
        self.output = []

    def parse_urls(self, web, sentiment):
        """Record the page title for every URL that contains the search word.

        Args:
            web: Object exposing ``webpage_url(url) -> (title, soup)``.
            sentiment: Unused; kept so existing callers keep working.
        """
        print(f"Total URL Containing {self.key_points['search_word']}: {len(self.key_points['url_with_key_word'])}")

        for link in self.key_points['url_with_key_word']["url"]:
            title, _ = web.webpage_url(link)
            # Skip unreadable pages, and pages whose title has no usable
            # string (there would be nothing to score).
            if title is None or title == "Page Not valid":
                continue
            self.check_text[link] = title

    def parse_html(self, web, sentiment):
        """Record a paragraph mentioning the search word for each matching page.

        Every ``<p>`` whose text contains ``key_points["search_word"]`` is
        assigned to ``check_text[url]``; a later match replaces an earlier one,
        so the last matching paragraph per URL is the one kept (it also
        replaces a title stored for the same URL by ``parse_urls``).

        Args:
            web: Object exposing ``webpage_url(url) -> (title, soup)``.
            sentiment: Unused; kept so existing callers keep working.
        """
        print(f"Total HTML Containing {self.key_points['search_word']}: {len(self.key_points['html_with_key_word'])}")

        search_word = self.key_points["search_word"]
        for url in self.key_points['html_with_key_word']["url"]:
            title, page = web.webpage_url(url)
            if title == "Page Not valid":
                continue
            for paragraph in page.find_all('p'):
                if search_word in str(paragraph.text):
                    self.check_text[url] = paragraph

    def main(self):
        """Run the whole pipeline and return the URLs that passed the filter.

        Returns:
            ``self.output``: the list of URLs whose stored text was given a
            score (rather than the ``' '`` sentinel) by the sentiment step.
        """
        print("Inititiating Crawler")
        crawl = Crawler(self.key_points)
        database = crawl.main()

#         For a faster run when 'output.csv' already exists, replace the line
#         above with:
#         database = pd.read_csv('output.csv')

        # parse_urls / parse_html need both filtered frames; build them here
        # if the crawl step did not already do so.
        if ("url_with_key_word" not in self.key_points
                or "html_with_key_word" not in self.key_points):
            crawl.fetch_urls(database)

        web = WebPageReader(self.key_points)
        sentiment = FindHateSpeech(self.key_points)

        # Start from an empty list so re-running this method (e.g. re-executing
        # the notebook cell) does not append the same links twice.
        self.output = []

        self.parse_urls(web, sentiment)
        self.parse_html(web, sentiment)
        for key, values in self.check_text.items():
            score = sentiment.polarity_score_finder(values)
            if score != ' ':
                self.output.append(key)

        return self.output

In [57]:
# Shared configuration dict handed to every class in this notebook.
key_points = dict(
    # Index identifiers passed to IndexClient.
    key_index=["2020-29", "2020-24", "2020-16", "2020-10", "2020-05"],
    # URL pattern passed to client.search.
    website="fda.gov/drugs/*",
    # CSV file the crawl results are written to and read back from.
    dataframe_path="output.csv",
    # Term looked for in URLs, HTML and paragraph text.
    search_word="hydroxychloroquine",
    # Lower bound (exclusive) on the 'pos' score for a text to be kept.
    positive_threshold=0.02,
    # Thread count handed to Crawler.client_search.
    threads=6,
)


In [58]:
# Build the pipeline driver around the shared configuration dict.
tli = TLinternship(key_points)

Initiating Code!


In [59]:
# Run the full pipeline; main() returns the list of URLs that passed the sentiment filter.
imp_links = tli.main()

Inititiating Crawler
Crawler Imported
Web Page Reader Imported
Find Hate Speech Imported
Total URL Containing hydroxychloroquine: 5
Total HTML Containing hydroxychloroquine: 34


In [60]:
# Display the collected links.
imp_links

['https://www.fda.gov/drugs/drug-safety-and-availability/fda-cautions-against-use-hydroxychloroquine-or-chloroquine-covid-19-outside-hospital-setting-or?fbclid=IwAR1-PZHsA-1A0tRm_ywuoe6c69_2--rqJoGVOX7wcdNRqSQBvi-rrajxF5o',
 'https://www.fda.gov/drugs/drug-safety-and-availability/fda-cautions-against-use-hydroxychloroquine-or-chloroquine-covid-19-outside-hospital-setting-or?utm_campaign',
 'https://www.fda.gov/drugs/drug-safety-and-availability/fda-cautions-against-use-hydroxychloroquine-or-chloroquine-covid-19-outside-hospital-setting-or?utm_source=042720-RC20%20COVID%20FDA%20Issues%20New%20Guidance&utm_medium=email&utm_campaign=RC20']