<a href="https://colab.research.google.com/github/FDDI-CentOS/data/blob/master/Google_News_URL_Scraping_and_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Google News Search and Sentiment Analysis
# Run installs once for package prep then comment out to reduce overhead
# !pip install textblob bs4 requests 
# !pip install pydrive
# djarguello@ 8-17-19

from textblob import TextBlob
from bs4 import BeautifulSoup
import requests
import re
import pickle
import csv


# Configuration lists: edit `keywords` to tune the search results.
# The two lists are paired by position: keywords[i] is the search expression
# used for other_bets[i], so they must be kept in the same order.

# Short names of the bets; also used as the base name of each output file.
other_bets = [
    "waymo",
    "verily",
    "access",
    "deepmind",
    "calico",
    "capitalg",
    "googleventures",
    "sidewalk",
    "wing",
    "loon",
    "jigsaw",
    "makani",
    "x",
]

# Search expression for each bet; join Boolean terms with '+'.
keywords = [
    "Waymo+AND+lyft",
    "verily",
    "access",
    "deepmind",
    "calico",
    "capitalg",
    "googleventures",
    "sidewalk",
    "wing",
    "loon",
    "jigsaw",
    "makani",
    "x",
]

# Analysis Class Object
class Analysis:
  """Fetch Google News results for a search term and score their sentiment.

  Attributes:
    term: Search expression inserted into the query string exactly as given
      (Boolean terms are joined with '+').
    url: Google News search URL built from `term` (monthly feed).
    sentiment: Mean TextBlob polarity of the snippets found by the most
      recent `run()`; 0 before any run or when no snippet was found.
    subjectivity: Mean TextBlob subjectivity, with the same conventions.
  """

  # Seconds to wait for the server before giving up; without a timeout
  # `requests.get` may block the notebook cell indefinitely.
  REQUEST_TIMEOUT = 10

  def __init__(self, term):
    """Store the search term and build the query URL; no request is made."""
    self.term = term
    self.subjectivity = 0
    self.sentiment = 0
    self.url = 'https://www.google.com/search?q={0}&source=lmns&tbm=nws&tbs=qdr:m'.format(self.term) # Google News Monthly Feed

  @staticmethod
  def _clean_snippet(markup):
    """Reduce the HTML of one result snippet to plain, single-spaced text.

    Args:
      markup: HTML source of a result element as a string.

    Returns:
      The text with sentence punctuation and tags replaced by spaces,
      non-breaking spaces removed, whitespace runs collapsed and the ends
      trimmed.
    """
    # Raw strings keep the backslashes for the regex engine and avoid the
    # invalid-escape warnings that non-raw patterns trigger.
    text = re.sub(r'[ ?.!/;:]', ' ', markup)
    text = re.sub(r'<.*?>', ' ', text)
    text = text.replace('\xa0', '')
    text = re.sub(r'\s{2,}', ' ', text)
    # Trim whitespace only. Passing '<div class="st">' to strip() would be
    # read as a set of characters and would eat letters such as 's', 't' or
    # 'd' from both ends of the headline.
    return text.strip()

  def run(self):
    """Download the results page and score every snippet found on it.

    Updates `self.sentiment` and `self.subjectivity` with the mean scores of
    this run (both are 0 when the page contains no snippet).

    Returns:
      A list with the cleaned text of each snippet, in page order.

    Raises:
      requests.exceptions.RequestException: If the request fails or does not
        complete within `REQUEST_TIMEOUT` seconds.
    """
    response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
    # print(response.text) # debugging / review response results
    soup = BeautifulSoup(response.text, 'html.parser')
    headline_results = soup.find_all('div', class_='st')

    # Start from zero so that calling run() again on the same object reports
    # the scores of this run instead of adding to the previous ones.
    self.sentiment = 0
    self.subjectivity = 0

    snippets = []
    total = len(headline_results)
    for h in headline_results:
      snippets.append(self._clean_snippet(str(h)))
      scores = TextBlob(h.get_text()).sentiment
      # Dividing each score by the total accumulates the mean directly.
      self.sentiment += scores.polarity / total
      self.subjectivity += scores.subjectivity / total
    return snippets

In [0]:
# Analysis Function Run for Each Bet

def run_analysis(bet, keywords):
  """Run the news analysis for one bet and format the outcome as report rows.

  Args:
    bet: Name of the bet, used in the report header.
    keywords: Search expression handed to `Analysis`; insert keyword terms in
      Boolean logic and use '+' between terms.

  Returns:
    A list of strings, each ending in '|': four header rows (bet, search
    terms, query link, scores), one row per result returned by the analysis,
    and a closing separator row.
  """
  analysis = Analysis(keywords)
  results = analysis.run()

  # Scores are read after run() and rounded to five decimals for display
  scores = ('Subjectivity: ' + str(round(analysis.subjectivity, 5))
            + ' Sentiment: ' + str(round(analysis.sentiment, 5)) + "|")
  header = [
      'Bet: ' + bet + '|',
      'Keywords Search: ' + str(analysis.term) + '|',
      'Query Link:' + analysis.url + '|',
      scores,
  ]
  body = [result + "|" for result in results]
  footer = ['\n----------------------------------------------------------------\n|']
  return header + body + footer

In [0]:
# Run analysis for each bet and output to pickle and text files

# Each bet is paired by position with its search terms. Check the pairing
# before any request is made, so a short `keywords` list fails immediately
# instead of part-way through the loop.
if len(keywords) < len(other_bets):
  raise IndexError('keywords must contain one search term for each entry in other_bets')

for bet, search_terms in zip(other_bets, keywords):
  # Initialize file naming through iterative loop
  pkl_filename = (str(bet) + ".pkl")
  csv_filename = (str(bet) + ".csv")
  txt_filename = (str(bet) + ".txt")

  # Output analysis content to pickle files; the context manager closes the
  # file even when pickling raises.
  analysis_file = run_analysis(bet, search_terms)
  with open(pkl_filename, 'wb') as output:
    pickle.dump(analysis_file, output)

  # Output the analysis file to screen
#   for row in analysis_file:
#     print(row)

  # CSV output of analysis contents '|' delimited
#   with open(csv_filename) as csvfile:
#     read = csv.reader(csvfile , delimiter = '|')
#     for row in read:
#         print(row)

  # Text file output of analysis contents. The file is opened in append
  # mode, so running this cell again adds another copy of the report.
  with open(txt_filename, "a") as f:
    for row in analysis_file:
      print(row, file=f)
   

In [0]:
# Print the saved results of every bet to the notebook output
for count, bet in enumerate(other_bets):
  # Disabled alternative: examine contents of pickle files
#   pkl_filename = (str(bet)+".pkl")
#   pickle_in = open(pkl_filename,"rb")
#   pkl_results = (pickle.load(pickle_in))
#   for row in pkl_results:
#       print(row)

  # Disabled alternative: examine contents of CSV files
#   csv_filename = (str(bet)+'.csv')
#   with open(csv_filename,"r") as f:
#     reader = csv.reader(f)
#     for row in reader:
#         print(row[0].split('|'))

  # Active view: echo the text report line by line. Each line keeps its own
  # newline and print() adds another, so the output is double spaced.
  txt_filename = '{0}.txt'.format(bet)
  with open(txt_filename, mode="r") as report:
    for report_line in report:
      print(report_line)

In [0]:
# Save Output to Google Drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Id of the Google Drive folder that receives the result files
DRIVE_FOLDER_ID = '1P6JXCfObWODQP5twR9Gz6JZVwY4tyigF'

# Upload the text report of every bet into that folder
for bet in other_bets:
  results_file = bet + '.txt'
  file = drive.CreateFile({'parents': [{'id': DRIVE_FOLDER_ID}]})
  file.SetContentFile(results_file)
  file.Upload()