<a href="https://colab.research.google.com/github/ChrisR08/ChrisR08/blob/main/Genersis_Query_Google.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import gspread
import pandas as pd
import time, random
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Mounted at /content/gdrive


In [None]:
#@title Methods
from bs4 import BeautifulSoup
from requests import get

def fetch_results(search_term, number_results, language_code):
  """Download the raw HTML of a Google search results page.

  Args:
    search_term: The text to search for. It is URL-encoded here, so it may
      contain spaces, '&', '#', '+', or non-ASCII characters.
    number_results: How many results are wanted; the request asks Google for
      one more than this via the ``num`` parameter.
    language_code: Value sent as the ``hl`` (interface language) parameter.

  Returns:
    The response body as text.

  Raises:
    Whatever ``get`` raises for connection problems or a timeout, and the
    error raised by ``raise_for_status()`` for an unsuccessful HTTP status.
  """
  # Local import keeps this function self-contained.
  from urllib.parse import urlencode

  usr_agent = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
  }
  # urlencode escapes every reserved character (spaces still become '+'),
  # so a term such as "cats & dogs" can no longer break the query string.
  query_string = urlencode({
      'q': search_term,
      'num': number_results + 1,
      'hl': language_code,
  })
  google_url = 'https://www.google.com/search?' + query_string
  # Without a timeout a stalled connection would hang the notebook forever.
  response = get(google_url, headers=usr_agent, timeout=10)
  response.raise_for_status()
  return response.text

def parse_results(raw_html):
  """Yield ``(href, title_text)`` tuples extracted from a results page.

  Every ``<div class="g">`` element in ``raw_html`` is inspected; a tuple is
  produced only for those containing both an ``<a>`` with an ``href``
  attribute and an ``<h3>`` element.
  """
  document = BeautifulSoup(raw_html, 'html.parser')
  for block in document.find_all('div', attrs={'class': 'g'}):
    anchor = block.find('a', href=True)
    heading = block.find('h3')
    # Skip blocks that lack either the link or the heading.
    if not (anchor and heading):
      continue
    yield (anchor['href'], heading.text)

def pullFirst10GoogleResults(query):
    """Return the parsed Google results for ``query`` as a list.

    Calls ``fetch_results`` asking for 10 results with language code "en" and
    materialises the ``(href, title_text)`` tuples yielded by
    ``parse_results``.
    """
    html = fetch_results(query, 10, "en")
    return list(parse_results(html))

def pullGoogleSerps(query_lst):
  """Collect the results of every query in ``query_lst`` into one flat list.

  The queries are processed in order, and after each one the function sleeps
  for a random whole number of seconds between 2 and 5 (inclusive).
  """
  collected = []
  for query in query_lst:
    collected.extend(pullFirst10GoogleResults(query))
    # Random pause between consecutive requests.
    time.sleep(random.randint(2,5))
  return collected

def cleanText(text):
    """Split ``text`` into normalised word tokens.

    The text is split on whitespace, each token is lower-cased, tokens that
    are not purely alphabetic are dropped, and the remaining tokens are
    lemmatized as verbs with the WordNet lemmatizer.

    Returns:
        A list of the lemmatized tokens, in their original order.
    """
    # The Porter Stemmer had poor results compared to the Lemmatizer, both for
    # verbs and nouns/adjectives.
    # Build the lemmatizer once per call rather than once per word.
    lemmatizer = WordNetLemmatizer()
    lowered = (token.lower() for token in text.split())
    return [lemmatizer.lemmatize(word, 'v') for word in lowered if word.isalpha()]

def matchesIntention(kw_lst, query_lst):
    """Return True when ``kw_lst`` and ``query_lst`` share at least one item.

    Items are compared for exact equality, so both arguments must contain
    hashable values.
    """
    keywords = set(kw_lst)
    # "Not disjoint" is the same as "the intersection is non-empty".
    return not keywords.isdisjoint(set(query_lst))

def findSearchIntent(query):
    """Classify ``query`` into one or more search-intent labels.

    The query is tokenised with ``cleanText`` and compared against a keyword
    list per intent. Single-word keywords must equal a token; multi-word
    keywords must appear as a run of consecutive whole tokens.

    Returns:
        The matching labels joined with " & " in the fixed order
        Transactional, Research, Informational, Question, Sticky, or
        "Generic/Brand" when no keyword matches.
    """
    intent_keywords = (
        ### Transactional Phase: The user is actively looking to make a purchase
        ("Transactional",
         ['buy','sale','let','rent','get','coupon','discount','deal','shipping','ship', 'purchase', 'acquire', 'transaction']),
        ### Research Phase: Decided that they are purchasing something and trying to find more information about it
        ("Research",
         ['review','best', 'good', 'bad', 'worst', 'awesome', 'amazing', 'top','cheap','affordable','comparison', 'vs', "research"]),
        ### Informational Phase: The users are trying to build things themselves
        ("Informational",
         ['intro','diy','tutorial','guide','explain','demo','way to','ways to','i need to', 'tips','tricks',"information","know","manual", "explanation"]),
        ### Question Phase: The user is at the beginning of the buying cycle and navigates by asking questions
        ("Question",
         ['how','what','who','when','whose','why','where']),
        ### Sticky Phase: Unlikely to convert now or ever
        ("Sticky",
         ['free', 'torrent', 'download', 'hack']),
        ### Generic/Brand: Don't know how to distinguish Brand Names presently.
    )

    cleanQuery = cleanText(query)
    # Space-padded so a multi-word keyword only matches on whole-token
    # boundaries (e.g. "way to" must not match inside "highway tolls").
    padded_query = " {} ".format(" ".join(cleanQuery))

    matched_labels = []
    for label, keywords in intent_keywords:
        # Token comparison can never hit a keyword containing a space, so
        # phrases are checked against the re-joined query instead.
        phrase_hit = any(
            " {} ".format(kw) in padded_query for kw in keywords if " " in kw
        )
        if phrase_hit or matchesIntention(keywords, cleanQuery):
            matched_labels.append(label)

    if not matched_labels:
        return "Generic/Brand"
    return " & ".join(matched_labels)

def prepareFinalDf(serp_lst):
  """Build the prioritised, intent-labelled DataFrame of search results.

  Args:
    serp_lst: Iterable of ``(url, title)`` pairs.

  Returns:
    A DataFrame with columns URL, Title, Priority and Intent. Priority is 1
    when the title contains any entry of the global ``Priority_Keywords``
    (case-insensitive substring match) and 2 otherwise. Intent comes from
    ``findSearchIntent`` applied to the title. Rows are sorted by Priority
    ascending, Intent descending, then URL ascending.
  """
  # Lower-case once up front; blank keywords are ignored because an empty
  # string is a substring of every title.
  wanted = [kw.lower() for kw in Priority_Keywords if kw.strip()]

  priority_lst = []
  basic_lst    = []
  for serp in serp_lst:
    title = serp[1].lower()
    # Each result goes into exactly one bucket. Deciding per keyword used to
    # put a result that matched only some keywords into both buckets, and
    # dropped everything when no keywords were configured.
    if any(kw in title for kw in wanted):
      priority_lst.append(serp)
    else:
      basic_lst.append(serp)

  priority_df = pd.DataFrame(priority_lst, columns=["URL", "Title"]).drop_duplicates()
  priority_df["Priority"] = 1
  basic_df = pd.DataFrame(basic_lst, columns=["URL", "Title"]).drop_duplicates()
  basic_df["Priority"] = 2
  tot_df = pd.concat([priority_df, basic_df])
  # Series.map also copes with an empty frame, unlike a row-wise apply.
  tot_df["Intent"] = tot_df["Title"].map(findSearchIntent)
  tot_df.sort_values(["Priority", "Intent","URL"], ascending=[True, False, True], inplace=True)
  return tot_df


In [None]:
# Load the search queries from the CSV on the mounted Google Drive.
queries_df  = pd.read_csv("/content/gdrive/MyDrive/queries.csv")
# Plain Python list of the values in the CSV's "query" column.
queries_lst = queries_df["query"].tolist()

In [None]:
# Fetch the Google results for every query; pullGoogleSerps sleeps a few
# seconds after each query, so this cell takes a while for long lists.
serp_lst    = pullGoogleSerps(queries_lst)

In [None]:
#@markdown ---
#@markdown Please enter the comma-separated keywords you wish to target; results whose titles contain any of them are given priority.<br>
Priority_Keywords = 'dog, pet, cat'  #@param {type: "string"}
# Turn the comma-separated form value into a list of trimmed keywords.
# Blank entries (e.g. from a trailing comma) are dropped: an empty keyword
# is a substring of every title and would mark every result as priority.
Priority_Keywords = [x.strip() for x in Priority_Keywords.split(",") if x.strip()]

In [None]:
# Build the final table; prepareFinalDf reads the global Priority_Keywords,
# so the keyword cell must have been run before this one.
tot_df = prepareFinalDf(serp_lst)

In [None]:
# Write the four report columns to target_urls.csv (without the index column),
# then pass the file to google.colab's files.download.
tot_df[["Priority", "Intent", "Title", "URL"]].to_csv('target_urls.csv', index=False)
files.download('target_urls.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>