<a href="https://colab.research.google.com/github/FDDI-CentOS/data/blob/master/Google_News_URL_Scraping_and_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Google News Search and Sentiment Analysis
# Run installs once for package prep then comment out to reduce overhead
# !pip install textblob bs4 requests 
# !pip install pydrive
# djarguello@ 8-17-19

from textblob import TextBlob
from bs4 import BeautifulSoup
import requests
import re

# Initialize lists: Update keywords to tune results
# Bet names, one report per entry. Each name labels the report and is also
# used to build that report's "<bet>.txt" filename.
other_bets = [
    'waymo',
    'verily',
    'access',
    'deepmind',
    'calico',
    'capitalg',
    'googleventures',
    'sidewalk',
    'wing',
    'loon',
    'jigsaw',
    'makani',
    'x',
]

# Search term for each bet, paired with other_bets by position: entry N here
# is the query for entry N above, so keep both lists the same length and order.
# Use '+' between words / Boolean operators inside a term.
keywords = [
    'Waymo+AND+lyft',
    'verily',
    'access',
    'deepmind',
    'calico',
    'capitalg',
    'googleventures',
    'sidewalk',
    'wing',
    'loon',
    'jigsaw',
    'makani',
    'x',
]

# Analysis Class Object
class Analysis:
  """Fetch Google News result snippets for a search term and score them.

  Attributes:
    term: Search term exactly as it is inserted into the query string
      (use '+' between words).
    subjectivity: Mean TextBlob subjectivity of the snippets found by the
      most recent run(); 0 before any run or when nothing was found.
    sentiment: Mean TextBlob polarity of the snippets found by the most
      recent run(); 0 before any run or when nothing was found.
    url: Google search URL built from term (news tab, 'qdr:m' filter --
      the monthly feed).
  """

  # Seconds to wait for the search request before giving up, so a stalled
  # connection cannot hang the notebook indefinitely.
  REQUEST_TIMEOUT = 10

  def __init__(self, term):
    self.term = term
    self.subjectivity = 0
    self.sentiment = 0
    self.url = 'https://www.google.com/search?q={0}&source=lmns&tbm=nws&tbs=qdr:m'.format(self.term) # Google News Monthly Feed

  @staticmethod
  def _clean(markup):
    """Turn one result element's HTML into a plain, single-spaced text line."""
    # Raw strings: same patterns as before, without invalid escape sequences.
    text = re.sub(r'\ |\?|\.|\!|\/|\;|\:', ' ', markup)  # punctuation -> space
    text = re.sub(r'\<.*?>', ' ', text)                  # drop HTML tags
    text = text.replace('\xa0', '')                      # drop non-breaking spaces
    text = re.sub(r'\s{2,}', ' ', text)                  # collapse whitespace runs
    # The tags are already gone at this point; only surrounding whitespace is
    # left to trim. (str.strip with a tag string would treat it as a set of
    # characters and eat real leading/trailing letters, e.g. "Lyft" -> "Lyf".)
    return text.strip()

  def run(self):
    """Fetch the search page, clean every snippet and update the mean scores.

    Returns:
      A list with one cleaned text string per `div` of class 'st' found in
      the response, in page order. Empty when no such element is present.

    Side effects:
      Overwrites self.sentiment and self.subjectivity with the means for this
      run (both 0 when nothing was found).

    Raises:
      Whatever requests.get raises on network failure or timeout.
    """
    # Start from zero so a repeated run() does not add to the previous scores.
    self.subjectivity = 0
    self.sentiment = 0

    response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
    # print(response.text)  # uncomment to inspect the raw response when debugging
    soup = BeautifulSoup(response.text, 'html.parser')
    headline_results = soup.find_all('div', class_='st')
    if not headline_results:
      return []

    snippets = []
    polarity_total = 0
    subjectivity_total = 0
    for h in headline_results:
      snippets.append(self._clean(str(h)))
      scores = TextBlob(h.get_text()).sentiment
      polarity_total += scores.polarity
      subjectivity_total += scores.subjectivity

    self.sentiment = polarity_total / len(headline_results)
    self.subjectivity = subjectivity_total / len(headline_results)
    return snippets

In [0]:
# Analysis Function Run for Each Bet

def run_analysis(bet,keywords):
  """Run one Analysis and lay its results out as '|'-terminated report rows.

  Args:
    bet: Name of the bet; a string, used as the report label.
    keywords: Search term handed to Analysis (Boolean logic allowed, use '+'
      between terms).

  Returns:
    A list of strings: four header rows (bet, search term, query link,
    scores rounded to 5 places), then one row per result returned by
    Analysis.run(), then a separator row. Every row ends with '|' except the
    separator, which ends with '|' followed by a newline.
  """
  analysis = Analysis(keywords)
  results = analysis.run()

  # Scores are read after run(), which is what fills them in.
  scores = ('Subjectivity: ' + str(round(analysis.subjectivity,5))
            + ' Sentiment: ' + str(round(analysis.sentiment,5)))
  header = [
      'Bet: ' + bet,
      'Keywords Search: ' + str(analysis.term),
      'Query Link:' + analysis.url,
      scores,
  ]
  separator = '\n----------------------------------------------------------------|\n'

  report = [entry + '|' for entry in header]
  report.extend(row + "|" for row in results)
  report.append(separator)
  return report

In [0]:
# File Writer Function: Writes One Bet's Analysis Rows to a txt File

def file_writer(filename, input):
  """Overwrite filename with the text form of input plus a trailing newline.

  Args:
    filename: Path of the file to create or truncate.
    input: Object to write; it is converted with str(), so a list is written
      as its Python list representation on a single line.
  """
  with open(filename,"w") as handle:
    handle.write(str(input))
    handle.write("\n")

In [0]:
# Review Text Files: Print the Contents of One Bet's txt File

def review_text_files(filename):
  """Print every line of filename to standard output.

  Each line is printed as read, so its own line ending is kept and print adds
  another newline after it.

  Args:
    filename: Path of the text file to display.
  """
  with open(filename, 'r') as handle:
    line = handle.readline()
    while line:  # readline() returns '' only at end of file
      print(line)
      line = handle.readline()

In [0]:
# Run Analysis for Each Bet and Output to txt File
for count, bet in enumerate(other_bets):
  # Each bet gets its own report file named "<bet>.txt"
  txt_filename = (str(bet) + ".txt")
  # keywords is indexed by position, so it must line up with other_bets entry for entry
  analysis_file = run_analysis(bet, keywords[count])

  # Write the analysis contents to the text file, then print the file back for review
  file_writer(txt_filename,analysis_file)
  review_text_files(txt_filename)

['Bet: waymo|', 'Keywords Search: Waymo+AND+lyft|', 'Query Link:https://www.google.com/search?q=Waymo+AND+lyft&source=lmns&tbm=nws&tbs=qdr:m|', 'Subjectivity: 0.32462 Sentiment: 0.13322|', "Alphabet's self-driving car company Waymo has built the world's smartest The need for more riders was the reason Waymo partnered with Lyft for a fleet of 10|", "Alphabet's autonomous driving and robotaxi company Waymo teamed up with fellow Alphabet company and AI specialist DeepMind to develop new training|", "The launch of Lyft's data set comes after Waymo revealed a high-quality multimodal sensor data set for autonomous driving at the IEEE Conference on|", 'In the past few months the two largest rideshare platforms, Lyft and Uber, have gone public despite This history of testing makes Waymo a good match for Lyf|', "Alphabet's (GOOGL) Waymo has sued a Clearwater, Florida, company going by the same name This news comes after Lyft ( LYFT ) came second with 28 3%|", "Waymo CEO John Krafcik, Los Angele

In [0]:
# Save Output to Google Drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# Runs the Colab authentication step first, then hands the application-default
# credentials to PyDrive; `drive` is the client used for the uploads below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  

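# Clear drive folder (disabled). NOTE(review): the ID used below is the same one used as the upload folder, so Delete() would appear to remove the folder itself rather than the per-bet txt files -- confirm before enabling.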
# for bet in other_bets:
#   file = drive.CreateFile({'id': '1ILOb2ktUG7d9V-BPI2mwH6XSobcpV_V8'})
#   results_file = str(bet + '.txt')
#   file.Delete()

# ID of the Google Drive folder the reports are saved to; named once here so
# the destination can be changed in one place.
DRIVE_FOLDER_ID = '1ILOb2ktUG7d9V-BPI2mwH6XSobcpV_V8'

# Iterate over every bet's txt file and upload it into that folder
for bet in other_bets:
  file = drive.CreateFile({'parents':[{u'id': DRIVE_FOLDER_ID}]})
  # Same "<bet>.txt" naming as used when the report files were written
  results_file = str(bet) + '.txt'
  file.SetContentFile(results_file)
  file.Upload()