In [140]:
import datetime
import string
import time
import unicodedata
from pathlib import Path
from urllib.parse import urlparse, urljoin, quote

import feedparser
import pandas as pd
import requests
from pandas import json_normalize

# for web scraping
from bs4 import BeautifulSoup
from IPython.display import HTML

## cleaning helpers: translation table that deletes every ASCII punctuation character
transtable = str.maketrans("", "", string.punctuation)


def to_string(s):
    """
    Convert ``s`` to text with ``str()``.

    If ``str(s)`` raises an ordinary exception, fall back to
    ``s.encode('utf-8')``. Note that the fallback returns whatever that
    ``encode`` call produces (``bytes`` for string-like objects), not ``str``.
    """
    try:
        return str(s)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # propagate instead of being silently swallowed.
        return s.encode('utf-8')


def unicode_to_ascii(s):
    """
    Strip diacritics from a string.

    input: string
    output: the string after NFD decomposition with every combining mark
            (Unicode category 'Mn') removed, e.g. 'café' -> 'cafe'.
            Characters that have no decomposition (curly quotes, for example)
            are kept, so the result is not guaranteed to be pure ASCII.

    Requires the ``unicodedata`` standard-library module, imported in the
    notebook's import cell.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')


def strip_punctuation(input_string):
    """
    Return ``input_string`` with punctuation removed by applying the
    module-level ``transtable`` translation table.
    """
    without_punctuation = input_string.translate(transtable)
    return without_punctuation

# The search window opens on 1/1/2020 (the start date provided for the search)
start_date = datetime.datetime(year=2020, month=1, day=1)

# Snapshot of the current local date and time
today_date = datetime.datetime.today()

# Whole days elapsed between the start of the window and today
days_difference = (today_date - start_date) // datetime.timedelta(days=1)

print(f"Number of days from 1/1/2020 to today: {days_difference} days")


Number of days from 1/1/2020 to today: 1685 days


In [141]:
# Two locations of interest; each maps to the keywords it is combined with in the searches.
search_keywords_combinations = {
    "Fort Fairfield": [
        "LPOE",
        "border",
        "port",
        "crossing",
        "customs",
        "GSA",
        "general services administration",
    ],
    "San Luis": [
        "LPOE",
        "border",
        "port",
        "crossing",
        "garita",
        "GSA",
        "general services administration",
    ],
}



In [15]:
def parse_feeds_df(rss_url):
    """Parse an RSS feed and return its entries as a flat DataFrame.

    rss_url is handed unchanged to ``feedparser.parse``; the resulting
    ``entries`` are flattened with ``json_normalize`` so nested fields become
    dotted column names.
    """
    parsed_feed = feedparser.parse(rss_url)
    return json_normalize(parsed_feed.entries)



In [25]:
# Example of the search URL that corresponds to the first query:
# "https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22LPOE%22%20when%3A1685d&hl=en-US&gl=US&ceid=US%3Aen"

# One query per (location, keyword) pair, restricted to the look-back window with `when:<N>d`.
query_list = [
    f'"{location}" AND "{keyword}" when:{days_difference}d'
    for location, keywords in search_keywords_combinations.items()
    for keyword in keywords
]

query_list

['"Fort Fairfield" AND "LPOE" when:1685d',
 '"Fort Fairfield" AND "border" when:1685d',
 '"Fort Fairfield" AND "port" when:1685d',
 '"Fort Fairfield" AND "crossing" when:1685d',
 '"Fort Fairfield" AND "customs" when:1685d',
 '"Fort Fairfield" AND "GSA" when:1685d',
 '"Fort Fairfield" AND "general services administration" when:1685d',
 '"San Luis" AND "LPOE" when:1685d',
 '"San Luis" AND "border" when:1685d',
 '"San Luis" AND "port" when:1685d',
 '"San Luis" AND "crossing" when:1685d',
 '"San Luis" AND "garita" when:1685d',
 '"San Luis" AND "GSA" when:1685d',
 '"San Luis" AND "general services administration" when:1685d']

In [28]:
# Accumulates one DataFrame of feed entries per query.
list_of_dfs = []

for q in query_list:
    # Percent-encode the query once and reuse it for both URLs.
    encoded_q = quote(q)
    rss_url = 'https://news.google.com/rss/search?q=' + encoded_q
    search_url = 'https://news.google.com/search?q=' + encoded_q
    print(search_url)

    df_temp = parse_feeds_df(rss_url)
    df_temp["query"] = q  # remember which query produced these rows

    q_simple = q.split(" when")[0]  # query text without the when:<N>d suffix
    print(f"{q_simple}: {df_temp.shape[0]} hits")

    list_of_dfs.append(df_temp)
    time.sleep(20)  # pause between requests
    

        
    
    


https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22LPOE%22%20when%3A1685d
"Fort Fairfield" AND "LPOE": 6 hits
https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22border%22%20when%3A1685d
"Fort Fairfield" AND "border": 16 hits
https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22port%22%20when%3A1685d
"Fort Fairfield" AND "port": 9 hits
https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22crossing%22%20when%3A1685d
"Fort Fairfield" AND "crossing": 8 hits
https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22customs%22%20when%3A1685d
"Fort Fairfield" AND "customs": 9 hits
https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22GSA%22%20when%3A1685d
"Fort Fairfield" AND "GSA": 10 hits
https://news.google.com/search?q=%22Fort%20Fairfield%22%20AND%20%22general%20services%20administration%22%20when%3A1685d
"Fort Fairfield" AND "general services administration": 9 hits
https://news.google.com/search?q=%22San%20

In [46]:
# Stack the per-query frames into one table with a fresh 0..n-1 index.
df_full_open_scrape = pd.concat(list_of_dfs, ignore_index=True)

print(len(df_full_open_scrape))

364


In [47]:
# Number of distinct article ids. The same article can be returned by several
# queries, so this can be lower than the total number of rows.
print(df_full_open_scrape["id"].nunique())

259


In [48]:
# Preview the first rows of the combined feed results.
df_full_open_scrape.head()

Unnamed: 0,title,links,link,id,guidislink,published,published_parsed,summary,title_detail.type,title_detail.language,title_detail.base,title_detail.value,summary_detail.type,summary_detail.language,summary_detail.base,summary_detail.value,source.href,source.title,query
0,Meetings will be held this week to discuss new...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMiuAFBV...,CBMiuAFBVV95cUxNZEZkRmc2VlpvNHlxVWF4cUZjNzd5S2...,False,"Tue, 30 Jul 2024 07:00:00 GMT","(2024, 7, 30, 7, 0, 0, 1, 212, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Meetings will be held this week to discuss new...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.wagmtv.com,WAGM,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
1,Meeting on Fort Fairfield port of entry is Tue...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMilgFBV...,CBMilgFBVV95cUxOc2tpUGxKd3doSERTdVA1UDhWV1VpOG...,False,"Mon, 29 Jul 2024 14:14:37 GMT","(2024, 7, 29, 14, 14, 37, 0, 211, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Meeting on Fort Fairfield port of entry is Tue...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://thecounty.me,The County,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
2,Fort Fairfield residents voice concerns about ...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMixAFBV...,CBMixAFBVV95cUxNNnBJeHBreVhhU082dnRGRml6b3NtS2...,False,"Wed, 31 Jul 2024 20:58:28 GMT","(2024, 7, 31, 20, 58, 28, 2, 213, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Fort Fairfield residents voice concerns about ...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.bangordailynews.com,Bangor Daily News,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
3,Discussing the New Port of Entries in the Coun...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMieEFVX...,CBMieEFVX3lxTE5NYkdFZGdncjZVYnA0R1pPdUo4NUQ1VG...,False,"Wed, 31 Jul 2024 07:00:00 GMT","(2024, 7, 31, 7, 0, 0, 2, 213, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Discussing the New Port of Entries in the Coun...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.wagmtv.com,WAGM,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
4,Residents’ Reply to Possible Closures of Port ...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMikgFBV...,CBMikgFBVV95cUxQZEVFQ21rc1k4SF92OGRlbTJSRUNTbW...,False,"Thu, 01 Aug 2024 07:00:00 GMT","(2024, 8, 1, 7, 0, 0, 3, 214, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Residents’ Reply to Possible Closures of Port ...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.wagmtv.com,WAGM,"""Fort Fairfield"" AND ""LPOE"" when:1685d"


In [49]:
# Inspect every field of the first entry.
df_full_open_scrape.iloc[0]

title                      Meetings will be held this week to discuss new...
links                      [{'rel': 'alternate', 'type': 'text/html', 'hr...
link                       https://news.google.com/rss/articles/CBMiuAFBV...
id                         CBMiuAFBVV95cUxNZEZkRmc2VlpvNHlxVWF4cUZjNzd5S2...
guidislink                                                             False
published                                      Tue, 30 Jul 2024 07:00:00 GMT
published_parsed                           (2024, 7, 30, 7, 0, 0, 1, 212, 0)
summary                    <a href="https://news.google.com/rss/articles/...
title_detail.type                                                 text/plain
title_detail.language                                                   None
title_detail.base          https://news.google.com/rss/search?q=%22Fort+F...
title_detail.value         Meetings will be held this week to discuss new...
summary_detail.type                                                text/html

In [60]:
# For each article id, gather the list of queries that returned it ...
id2query = df_full_open_scrape.groupby('id', as_index=False)["query"].agg(list)
# ... and expose that as a plain {id: [queries]} lookup.
id2query_dict = dict(zip(id2query["id"], id2query["query"]))

    

In [61]:
#! pip install google-api-python-client


# Show the second article link exactly as delivered by the feed.
df_full_open_scrape["link"].tolist()[1]

'https://news.google.com/rss/articles/CBMilgFBVV95cUxOc2tpUGxKd3doSERTdVA1UDhWV1VpOGZwVko5R3FFMVZBMThZNXd6Zmh5TE5oaE1zaGpVR1FJTGNVQ0RaVjRyOVdPYmNra1E5UjduQkNDUklSTTFxblY0dzhNaFdIX1R1T2d5a0k3SmxPRUxBa0p4RWdybEVieWNmWVh4N2NodFN6QXJkdUZrUG5iazVBSFE?oc=5'

In [62]:
# Sanity-check the first generated query string.
query_list[0]

'"Fort Fairfield" AND "LPOE" when:1685d'

In [63]:
# Keep one row per article id (the first occurrence). The explicit .copy() makes
# the result an independent frame, so the column assignments performed on it in
# later cells modify it directly instead of raising pandas' SettingWithCopyWarning.
new_df_full_open_scrape = df_full_open_scrape.drop_duplicates("id").copy()
new_df_full_open_scrape

Unnamed: 0,title,links,link,id,guidislink,published,published_parsed,summary,title_detail.type,title_detail.language,title_detail.base,title_detail.value,summary_detail.type,summary_detail.language,summary_detail.base,summary_detail.value,source.href,source.title,query
0,Meetings will be held this week to discuss new...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMiuAFBV...,CBMiuAFBVV95cUxNZEZkRmc2VlpvNHlxVWF4cUZjNzd5S2...,False,"Tue, 30 Jul 2024 07:00:00 GMT","(2024, 7, 30, 7, 0, 0, 1, 212, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Meetings will be held this week to discuss new...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.wagmtv.com,WAGM,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
1,Meeting on Fort Fairfield port of entry is Tue...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMilgFBV...,CBMilgFBVV95cUxOc2tpUGxKd3doSERTdVA1UDhWV1VpOG...,False,"Mon, 29 Jul 2024 14:14:37 GMT","(2024, 7, 29, 14, 14, 37, 0, 211, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Meeting on Fort Fairfield port of entry is Tue...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://thecounty.me,The County,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
2,Fort Fairfield residents voice concerns about ...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMixAFBV...,CBMixAFBVV95cUxNNnBJeHBreVhhU082dnRGRml6b3NtS2...,False,"Wed, 31 Jul 2024 20:58:28 GMT","(2024, 7, 31, 20, 58, 28, 2, 213, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Fort Fairfield residents voice concerns about ...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.bangordailynews.com,Bangor Daily News,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
3,Discussing the New Port of Entries in the Coun...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMieEFVX...,CBMieEFVX3lxTE5NYkdFZGdncjZVYnA0R1pPdUo4NUQ1VG...,False,"Wed, 31 Jul 2024 07:00:00 GMT","(2024, 7, 31, 7, 0, 0, 2, 213, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Discussing the New Port of Entries in the Coun...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.wagmtv.com,WAGM,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
4,Residents’ Reply to Possible Closures of Port ...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMikgFBV...,CBMikgFBVV95cUxQZEVFQ21rc1k4SF92OGRlbTJSRUNTbW...,False,"Thu, 01 Aug 2024 07:00:00 GMT","(2024, 8, 1, 7, 0, 0, 3, 214, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22Fort+F...,Residents’ Reply to Possible Closures of Port ...,text/html,,https://news.google.com/rss/search?q=%22Fort+F...,"<a href=""https://news.google.com/rss/articles/...",https://www.wagmtv.com,WAGM,"""Fort Fairfield"" AND ""LPOE"" when:1685d"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,Paso Robles GSA approves county as contracting...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMi6gFBV...,CBMi6gFBVV95cUxNUWJYWks3WUNHNDQzR0xGQTRrdlVuX0...,False,"Wed, 17 Jan 2024 08:00:00 GMT","(2024, 1, 17, 8, 0, 0, 2, 17, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22San+Lu...,Paso Robles GSA approves county as contracting...,text/html,,https://news.google.com/rss/search?q=%22San+Lu...,"<a href=""https://news.google.com/rss/articles/...",https://pasoroblespress.com,The Paso Robles Press,"""San Luis"" AND ""GSA"" when:1685d"
345,WATCH: Kelly Gets Updates on Arizona Port of E...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMisgFBV...,CBMisgFBVV95cUxQWFZ5UE9hbUVhY0dKZmxfRUNDQmI3Nz...,False,"Wed, 10 Jul 2024 07:00:00 GMT","(2024, 7, 10, 7, 0, 0, 2, 192, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22San+Lu...,WATCH: Kelly Gets Updates on Arizona Port of E...,text/html,,https://news.google.com/rss/search?q=%22San+Lu...,"<a href=""https://news.google.com/rss/articles/...",https://www.kelly.senate.gov,Senator Mark Kelly,"""San Luis"" AND ""GSA"" when:1685d"
346,Hensel Phelps Awarded $174M GSA Contract for C...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMivwFBV...,CBMivwFBVV95cUxNbVBYcVo1WjBYMVpzcFVZb1cwZXc5aT...,False,"Mon, 29 Jul 2024 07:00:00 GMT","(2024, 7, 29, 7, 0, 0, 0, 211, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22San+Lu...,Hensel Phelps Awarded $174M GSA Contract for C...,text/html,,https://news.google.com/rss/search?q=%22San+Lu...,"<a href=""https://news.google.com/rss/articles/...",https://www.govconwire.com,GovCon Wire,"""San Luis"" AND ""GSA"" when:1685d"
347,GSA to Spend $3.4 Billion to Update 26 Land Po...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",https://news.google.com/rss/articles/CBMihgFBV...,CBMihgFBVV95cUxNRHZhbVpCREJmb0E2VG9uRFpjWkkwVk...,False,"Wed, 14 Dec 2022 08:00:00 GMT","(2022, 12, 14, 8, 0, 0, 2, 348, 0)","<a href=""https://news.google.com/rss/articles/...",text/plain,,https://news.google.com/rss/search?q=%22San+Lu...,GSA to Spend $3.4 Billion to Update 26 Land Po...,text/html,,https://news.google.com/rss/search?q=%22San+Lu...,"<a href=""https://news.google.com/rss/articles/...",https://www.ttnews.com,Transport Topics,"""San Luis"" AND ""GSA"" when:1685d"


In [113]:
# Replace the single query string with the list of all queries that returned the article.
new_df_full_open_scrape["query"] = new_df_full_open_scrape["id"].map(id2query_dict.get)


def datetime_of_struct_time(st: time.struct_time) -> datetime.datetime:
    """Convert a struct_time to a datetime, keeping the UTC offset when one is set.

    The result is timezone-aware only if ``st.tm_gmtoff`` is not None;
    otherwise a naive datetime is returned. Leap-second values (60 or 61) are
    clamped to 59 because ``datetime`` rejects them.
    """
    offset_seconds = st.tm_gmtoff
    if offset_seconds is None:
        zone = None
    else:
        zone = datetime.timezone(datetime.timedelta(seconds=offset_seconds))

    year, month, day, hour, minute, second = st[:6]
    if second in (60, 61):
        second = 59
    return datetime.datetime(year, month, day, hour, minute, second, tzinfo=zone)


# Date columns derived from the feed's parsed publication time.
new_df_full_open_scrape["published_datetime"] = new_df_full_open_scrape["published_parsed"].map(datetime_of_struct_time)
new_df_full_open_scrape["published_year"] = new_df_full_open_scrape["published_datetime"].map(lambda x: x.year)
new_df_full_open_scrape["published_date"] = new_df_full_open_scrape["published_datetime"].map(lambda x: x.strftime("%Y-%m-%d"))

# extracting location for easier querying of excel sheet output
# (the quoted text before " AND" in the first query that returned the article)
new_df_full_open_scrape["location"] = new_df_full_open_scrape["query"].map(lambda x: strip_punctuation(x[0].split(" AND")[0]))

# cleaning summary text: drop the HTML markup, then strip diacritics.
# The parser is named explicitly so the result does not depend on which parser
# libraries are installed (and bs4 does not warn about guessing one).
new_df_full_open_scrape["summary_text"] = new_df_full_open_scrape["summary"].map(
    lambda x: unicode_to_ascii(BeautifulSoup(x, "html.parser").get_text())
)

In [142]:
# Columns kept for review and export, in output order.
cols_of_interest = [
    'location',
    'id',
    'title',
    'summary_text',
    'link',
    'published',
    'published_date',
    'published_year',
    'source.href',
    'source.title',
    'query',
]

# showing two randomly sampled rows
new_df_full_open_scrape[cols_of_interest].sample(n=2)

Unnamed: 0,location,id,title,summary_text,link,published,published_date,published_year,source.href,source.title,query
346,San Luis,CBMivwFBVV95cUxNbVBYcVo1WjBYMVpzcFVZb1cwZXc5aT...,Hensel Phelps Awarded $174M GSA Contract for C...,Hensel Phelps Awarded $174M GSA Contract for C...,https://news.google.com/rss/articles/CBMivwFBV...,"Mon, 29 Jul 2024 07:00:00 GMT",2024-07-29,2024,https://www.govconwire.com,GovCon Wire,"[""San Luis"" AND ""GSA"" when:1685d]"
149,San Luis,CBMijAJBVV95cUxNbWYwd0JhclNzajFFX2VwZFNRVXN6cF...,Chaos ensues on US-Mexico border as Title 42 r...,Chaos ensues on US-Mexico border as Title 42 r...,https://news.google.com/rss/articles/CBMijAJBV...,"Fri, 12 May 2023 07:00:00 GMT",2023-05-12,2023,https://www.hindustantimes.com,Hindustan Times,"[""San Luis"" AND ""border"" when:1685d]"


In [138]:
# Export the selected columns of the de-duplicated feed results to an Excel workbook.
excel_output_path = "~/Downloads/google-news-articles-scrape-OGP-OCFO-LPOE-community-pulse-project.xlsx"
new_df_full_open_scrape[cols_of_interest].to_excel(excel_output_path)

In [143]:
# Display the exported columns for a visual check.
new_df_full_open_scrape[cols_of_interest]

Unnamed: 0,location,id,title,summary_text,link,published,published_date,published_year,source.href,source.title,query
0,Fort Fairfield,CBMiuAFBVV95cUxNZEZkRmc2VlpvNHlxVWF4cUZjNzd5S2...,Meetings will be held this week to discuss new...,Meetings will be held this week to discuss new...,https://news.google.com/rss/articles/CBMiuAFBV...,"Tue, 30 Jul 2024 07:00:00 GMT",2024-07-30,2024,https://www.wagmtv.com,WAGM,"[""Fort Fairfield"" AND ""LPOE"" when:1685d, ""Fort..."
1,Fort Fairfield,CBMilgFBVV95cUxOc2tpUGxKd3doSERTdVA1UDhWV1VpOG...,Meeting on Fort Fairfield port of entry is Tue...,Meeting on Fort Fairfield port of entry is Tue...,https://news.google.com/rss/articles/CBMilgFBV...,"Mon, 29 Jul 2024 14:14:37 GMT",2024-07-29,2024,https://thecounty.me,The County,"[""Fort Fairfield"" AND ""LPOE"" when:1685d, ""Fort..."
2,Fort Fairfield,CBMixAFBVV95cUxNNnBJeHBreVhhU082dnRGRml6b3NtS2...,Fort Fairfield residents voice concerns about ...,Fort Fairfield residents voice concerns about ...,https://news.google.com/rss/articles/CBMixAFBV...,"Wed, 31 Jul 2024 20:58:28 GMT",2024-07-31,2024,https://www.bangordailynews.com,Bangor Daily News,"[""Fort Fairfield"" AND ""LPOE"" when:1685d, ""Fort..."
3,Fort Fairfield,CBMieEFVX3lxTE5NYkdFZGdncjZVYnA0R1pPdUo4NUQ1VG...,Discussing the New Port of Entries in the Coun...,Discussing the New Port of Entries in the Coun...,https://news.google.com/rss/articles/CBMieEFVX...,"Wed, 31 Jul 2024 07:00:00 GMT",2024-07-31,2024,https://www.wagmtv.com,WAGM,"[""Fort Fairfield"" AND ""LPOE"" when:1685d, ""Fort..."
4,Fort Fairfield,CBMikgFBVV95cUxQZEVFQ21rc1k4SF92OGRlbTJSRUNTbW...,Residents’ Reply to Possible Closures of Port ...,Residents’ Reply to Possible Closures of Port ...,https://news.google.com/rss/articles/CBMikgFBV...,"Thu, 01 Aug 2024 07:00:00 GMT",2024-08-01,2024,https://www.wagmtv.com,WAGM,"[""Fort Fairfield"" AND ""LPOE"" when:1685d, ""Fort..."
...,...,...,...,...,...,...,...,...,...,...,...
341,San Luis,CBMi6gFBVV95cUxNUWJYWks3WUNHNDQzR0xGQTRrdlVuX0...,Paso Robles GSA approves county as contracting...,Paso Robles GSA approves county as contracting...,https://news.google.com/rss/articles/CBMi6gFBV...,"Wed, 17 Jan 2024 08:00:00 GMT",2024-01-17,2024,https://pasoroblespress.com,The Paso Robles Press,"[""San Luis"" AND ""GSA"" when:1685d]"
345,San Luis,CBMisgFBVV95cUxQWFZ5UE9hbUVhY0dKZmxfRUNDQmI3Nz...,WATCH: Kelly Gets Updates on Arizona Port of E...,WATCH: Kelly Gets Updates on Arizona Port of E...,https://news.google.com/rss/articles/CBMisgFBV...,"Wed, 10 Jul 2024 07:00:00 GMT",2024-07-10,2024,https://www.kelly.senate.gov,Senator Mark Kelly,"[""San Luis"" AND ""GSA"" when:1685d]"
346,San Luis,CBMivwFBVV95cUxNbVBYcVo1WjBYMVpzcFVZb1cwZXc5aT...,Hensel Phelps Awarded $174M GSA Contract for C...,Hensel Phelps Awarded $174M GSA Contract for C...,https://news.google.com/rss/articles/CBMivwFBV...,"Mon, 29 Jul 2024 07:00:00 GMT",2024-07-29,2024,https://www.govconwire.com,GovCon Wire,"[""San Luis"" AND ""GSA"" when:1685d]"
347,San Luis,CBMihgFBVV95cUxNRHZhbVpCREJmb0E2VG9uRFpjWkkwVk...,GSA to Spend $3.4 Billion to Update 26 Land Po...,GSA to Spend $3.4 Billion to Update 26 Land Po...,https://news.google.com/rss/articles/CBMihgFBV...,"Wed, 14 Dec 2022 08:00:00 GMT",2022-12-14,2022,https://www.ttnews.com,Transport Topics,"[""San Luis"" AND ""GSA"" when:1685d]"


In [120]:
import requests


# NOTE(review): `example_url` is not assigned in any visible cell; presumably it is
# one of the article links from df_full_open_scrape["link"]. Confirm and define it
# before this cell.
# The timeout keeps the cell from hanging indefinitely if the server never answers.
resp = requests.get(example_url, timeout=30)

In [134]:
# Parse the fetched page and show its visible text.
# The recorded output of this cell is just 'Google News'.
soup = BeautifulSoup(resp.text, "html.parser")
soup.get_text()

'Google News'

In [131]:
# Save the parsed page for offline inspection.
# - The path is built from the current user's home directory instead of a
#   hard-coded /Users/<name>/ prefix.
# - UTF-8 is set explicitly so non-ASCII page content cannot fail on a
#   platform whose default encoding is narrower.
# - The parser is named explicitly, matching the cell that builds `soup`.
example_html_path = Path.home() / "Downloads" / "example_html.html"
with open(example_html_path, "w", encoding="utf-8") as fout:
    fout.write(str(BeautifulSoup(resp.text, "html.parser")))

In [199]:
# Lookup kept as in the original cell: it is not the last expression, so nothing
# is displayed, but it raises KeyError if this query is missing from q_d.
q_d['"Fort Fairfield" AND "LPOE"']

# One frame per query, built from the "items" of its first two result pages
# and tagged with the query text.
l_dfs = []
for k, val in q_d.items():
    page_frames = [pd.DataFrame(val[page_idx]["items"]) for page_idx in (0, 1)]
    df_ = pd.concat(page_frames, ignore_index=True)
    df_["query"] = k
    l_dfs.append(df_)


In [146]:

# Build the "customsearch" v1 client that the search helpers receive as `service`.
# NOTE(review): `build` and `my_api_key` are not defined in the visible cells;
# presumably `from googleapiclient.discovery import build` and a key loaded elsewhere.
# Confirm both, and read the key from the environment rather than hard-coding it.
service = build("customsearch", "v1", developerKey=my_api_key)


def google_search(service, query_keywords, api_key, cse_id, **kwargs):
    """Run one search request through ``service`` and return the executed response.

    ``query_keywords`` is sent as ``q`` and ``cse_id`` as ``cx``; any extra
    keyword arguments are forwarded unchanged to ``service.cse().list``.
    ``api_key`` is accepted for signature compatibility but is not used here.
    """
    request = service.cse().list(q=query_keywords, cx=cse_id, **kwargs)
    return request.execute()

def google_next_page(service, query_keywords, api_key, cse_id, res):
    """Request the page that follows ``res`` and return the executed response.

    The start index is read from ``res['queries']['nextPage'][0]['startIndex']``
    (a KeyError is raised if ``res`` has no such entry) and ten results are
    requested. ``api_key`` is accepted for signature compatibility but is not
    used here.
    """
    next_start = res['queries']['nextPage'][0]['startIndex']
    request = service.cse().list(q=query_keywords, cx=cse_id, num=10, start=next_start)
    return request.execute()



In [157]:
# Fetch up to `max_page` consecutive result pages for one query and keep every response.
#
# Fixes relative to the previous version of this cell:
# - `q` had a trailing comma, which made it a 1-tuple instead of a string;
# - the response was stored in `results`, but a stale `res` was passed on;
# - fetched pages were discarded (`all_res_list` was never filled);
# - every iteration requested the same first page.
max_page = 4
q = '"Fort Fairfield" AND "LPOE"'
page = 0
all_res_list = []
start = 1  # index of the first result to request; 1 is the first result overall
while page < max_page:
    results = google_search(service, q, my_api_key, my_cse_id, num=10, start=start, gl="us", lr="lang_en")
    all_res_list.append(results)
    page += 1

    # The response advertises the next page under queries.nextPage; stop when it is absent.
    next_page = results.get("queries", {}).get("nextPage")
    if not next_page:
        break
    start = next_page[0]["startIndex"]
        

    



35

In [177]:
# Fetch the first two result pages directly (results starting at 1 and at 11).
# NOTE(review): `query` is not assigned in any visible cell; confirm where it comes from.
first_res = service.cse().list(q=query, cx=my_cse_id, start=1, gl="us", lr="lang_en").execute()
second_res = service.cse().list(q=query, cx=my_cse_id, start=11, gl="us", lr="lang_en").execute()


In [194]:
# Print the first-page "items" of the first entry in q_d only (the loop breaks at once).
# Note: `k` and `val` stay bound to that first entry after the break.
# NOTE(review): `q_d` is not built in any visible cell; confirm where it is defined.
for k, val in q_d.items():
    print(val[0]["items"])
    break

[{'kind': 'customsearch#result', 'title': 'Fort Fairfield Land Port of Entry, Maine | GSA', 'htmlTitle': '<b>Fort Fairfield</b> Land Port of Entry, Maine | GSA', 'link': 'https://www.gsa.gov/about-us/gsa-regions/region-1-new-england/buildings-and-facilities/development-projects/fort-fairfield-land-port-of-entry-maine', 'displayLink': 'www.gsa.gov', 'snippet': '7 days ago ... This project will deliver a new land port complex to supplement the existing LPOE in Fort Fairfield, Maine. Programming for the new LPOE\xa0...', 'htmlSnippet': '7 days ago <b>...</b> This project will deliver a new land port complex to supplement the existing <b>LPOE</b> in <b>Fort Fairfield</b>, Maine. Programming for the new <b>LPOE</b>&nbsp;...', 'formattedUrl': 'https://www.gsa.gov/about.../fort-fairfield-land-port-of-entry-maine', 'htmlFormattedUrl': 'https://www.gsa.gov/about.../<b>fort-fairfield</b>-land-port-of-entry-maine', 'pagemap': {'cse_thumbnail': [{'src': 'https://encrypted-tbn0.gstatic.com/images?q

In [141]:
# Run one search (first 10 results) per query and keep the raw responses.
# After the loop, `q` remains bound to the last query in query_list.
# NOTE(review): the strings in query_list end with the `when:<N>d` suffix built for the
# Google News searches; confirm this search API interprets it as intended.
l_results = []

for q in query_list:
    l_results.append(google_search(service, q, my_api_key, my_cse_id,num=10, gl="us", lr="lang_en"))
    


In [142]:
# One row of search metadata (the "searchInformation" entry) per response in l_results.
df_search_info = pd.DataFrame([response["searchInformation"] for response in l_results])


In [144]:
# Label each metadata row with its query; relies on l_results having been built
# from query_list in the same order.
df_search_info["query"] = query_list

In [75]:
# Single search with no extra request parameters.
# NOTE(review): uses whatever `q` is currently bound to in the kernel; confirm the intended query.
res = google_search(service, q, my_api_key, my_cse_id)

In [155]:
# The previous call passed page=, max_page= and url_items=, which the
# google_next_page defined in this notebook does not accept (TypeError).
# Equivalent for a single follow-up page: fetch the page after `res` and
# combine its items with those of `res` in a new list (res itself is not modified).
# NOTE(review): confirm that a combined item list is what downstream cells expect.
next_page_res = google_next_page(service, '"Fort Fairfield" AND "LPOE"', my_api_key, my_cse_id, res)
res_l = list(res["items"]) + list(next_page_res.get("items", []))


In [198]:
# Show the "items" of the first two result pages held in `val`.
# NOTE(review): relies on `val` left bound by an earlier loop over q_d.items().
[val[0]["items"], val[1]["items"]]

[[{'kind': 'customsearch#result',
   'title': 'General Services Administration Announces $100 Million ...',
   'htmlTitle': '<b>General Services Administration</b> Announces $100 Million ...',
   'link': 'https://www.gsa.gov/about-us/gsa-regions/region-9-pacific-rim/region-9-newsroom/pacific-rim-press-releases/general-services-administration-announces-100-mil-04032024',
   'displayLink': 'www.gsa.gov',
   'snippet': 'Apr 3, 2024 ... Federal investment transforms U.S.- Mexico Border into eco-friendly gateway SAN LUIS, Ariz. ㅡ The U.S. General Services Administration\xa0...',
   'htmlSnippet': 'Apr 3, 2024 <b>...</b> Federal investment transforms U.S.- Mexico Border into eco-friendly gateway <b>SAN LUIS</b>, Ariz. ㅡ The U.S. <b>General Services Administration</b>&nbsp;...',
   'formattedUrl': 'https://www.gsa.gov/.../general-services-administration-announces-100-mil-...',
   'htmlFormattedUrl': 'https://www.gsa.gov/.../<b>general-services-administration</b>-announces-100-mil-...',
   'pa

In [76]:
# NOTE(review): this cell cannot run as written. After the first statement it holds what
# looks like the tail of a function body: the indentation is inconsistent and `return`
# appears outside any function. `url_items` is also not defined in any visible cell.
# Either restore the enclosing function definition or delete the leftover lines.
next_res = service.cse().list(q='"Fort Fairfield" AND "LPOE"', cx=my_cse_id, num=10, start=res['queries']['nextPage'][0]['startIndex'],).execute()
for item in next_res['items']:
        url_items.append(item)
    page += 1
    
    if page == max_page:
        return url_items

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': 'Google Custom Search - "Fort Fairfield" AND "LPOE"',
    'totalResults': '35',
    'searchTerms': '"Fort Fairfield" AND "LPOE"',
    'count': 10,
    'startIndex': 1,
    'inputEncoding': 'utf8',
    'outputEncoding': 'u

In [77]:
# Start index that `res` reports for its next page.
res['queries']['nextPage'][0]['startIndex']

11

In [95]:
# Display the raw search response for inspection.
res

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': 'Google Custom Search - "Fort Fairfield" AND "LPOE"',
    'totalResults': '35',
    'searchTerms': '"Fort Fairfield" AND "LPOE"',
    'count': 10,
    'startIndex': 1,
    'inputEncoding': 'utf8',
    'outputEncoding': 'u