importing libraries

In [None]:
import csv
import os
import random
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from datetime import datetime
from typing import Dict

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent
from tqdm import tqdm

# Master switch for the scraping cells: they only send requests to Google when this is True.
scrape_paa = False

Helper functions

In [None]:
# Fixed desktop-Chrome User-Agent string sent with every search request.
# NOTE(review): this is a Chrome 84 string, so it is not actually the "latest" one - refresh it if needed.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
# Header dict passed as `headers=` to every session.get call in this notebook.
header = {"User-Agent": USER_AGENT}


def get_link(s): #extract link
    """Return the URL of the first result anchor in ``s``, without its fragment.

    A result anchor is an ``<a>`` tag whose ``href`` starts with ``http`` and
    that contains an ``<h3>`` or ``<h2>`` heading.

    Raises:
        AttributeError: if ``s`` contains no such anchor.
    """
    def is_result_anchor(tag):
        # Guard clauses mirror the short-circuit order of a chained ``and``.
        if tag.name != "a":
            return False
        if not tag.has_attr("href"):
            return False
        if not tag["href"].startswith("http"):
            return False
        return (tag.h3 or tag.h2) is not None

    anchor = s.find(is_result_anchor)
    href = anchor.get("href")
    # Everything from the first "#" onwards is a page fragment; keep what precedes it.
    return href.split("#")[0]


def get_title(s): #extract title
    """Return the stripped heading text of the first result anchor in ``s``.

    A result anchor is an ``<a>`` tag whose ``href`` starts with ``http`` and
    that contains an ``<h3>`` or ``<h2>`` heading. The title is read from the
    anchor's own heading (``<h3>`` preferred, ``<h2>`` otherwise) rather than
    from whichever ``<h3>`` happens to come next in the document, so an anchor
    that only has an ``<h2>`` can no longer pick up the title of a later,
    unrelated result.

    Args:
        s: parsed page (BeautifulSoup-like object exposing ``find``).

    Returns:
        The heading text with surrounding whitespace removed.

    Raises:
        AttributeError: if ``s`` contains no result anchor.
    """
    def is_result_anchor(tag):
        return (
            tag.name == "a"
            and tag.has_attr("href")
            and tag["href"].startswith("http")
            and (tag.h3 or tag.h2) is not None
        )

    anchor = s.find(is_result_anchor)
    if anchor is None:
        # Same exception type as before, but with a message that explains the cause.
        raise AttributeError("no result link with an <h3>/<h2> heading found")
    heading = anchor.h3 if anchor.h3 is not None else anchor.h2
    return heading.text.strip()


def return_data(paa, seed, text, link, title): #return data as dict
    """Bundle one scraped answer into a dict keyed by the output column names.

    The keys, in order, are "PAA Title", "Parent", "Text", "URL" and "URL Title".
    """
    columns = ("PAA Title", "Parent", "Text", "URL", "URL Title")
    values = (paa, seed, text, link, title)
    return dict(zip(columns, values))


def get_ul(s, paa, seed): #getting the ul element
    """Build the output row for an answer rendered as an unordered list.

    Takes the first ``<ul>`` that follows the level-3 heading ``<div>`` in
    ``s``, concatenates the stripped text of its ``<li>`` items (no separator)
    and combines it with the result link and title.

    Raises:
        AttributeError: if the heading or the ``<ul>`` is missing.
    """
    heading = s.find("div", {"role": "heading", "aria-level": "3"})
    bullet_list = heading.find_next("ul")
    items = bullet_list.find_all("li")
    text = "".join(item.text.strip() for item in items)
    link = get_link(s)
    title = get_title(s)
    return return_data(paa, seed, text, link, title)


def get_ol(s, paa, seed): #getting the ol element
    """Build the output row for an answer rendered as an ordered list.

    Takes the first ``<ol>`` that follows the level-3 heading ``<div>`` in
    ``s``, concatenates the stripped text of its ``<li>`` items (no separator)
    and combines it with the result link and title.

    Raises:
        AttributeError: if the heading or the ``<ol>`` is missing.
    """
    list_items = (
        s.find("div", {"role": "heading", "aria-level": "3"})
        .find_next("ol")
        .find_all("li")
    )
    pieces = []
    for item in list_items:
        pieces.append(item.text.strip())
    # Arguments are evaluated left to right: text first, then link, then title.
    return return_data(paa, seed, "".join(pieces), get_link(s), get_title(s))


def get_snippet(s, paa, seed): #getting normal snippet text
    """Build the output row for a plain-text answer snippet.

    Reads the first ``<span class="hgKElc">`` that follows the level-3 heading
    ``<div>`` in ``s`` and combines its stripped text with the result link and
    title.

    Raises:
        AttributeError: if the heading or the snippet span is missing.
    """
    heading = s.find("div", {"role": "heading", "aria-level": "3"})
    answer_span = heading.find_next("span", class_="hgKElc")
    text = answer_span.text.strip()
    # Arguments are evaluated left to right: link first, then title.
    return return_data(paa, seed, text, get_link(s), get_title(s))


# Serializes appends coming from the scraper's worker threads, so the
# "file is still empty -> write the header" check and the row write cannot interleave.
_CSV_WRITE_LOCK = threading.Lock()


def write_to_csv(df: Dict, filename, csv_header): #for writing to csv file
    """Append one row to ``<filename>.csv``, writing the header first if the file is empty.

    Args:
        df: mapping of column name -> value for the single row to write.
        filename: output path without the ``.csv`` extension.
        csv_header: ordered list of column names.

    The file is encoded as ISO-8859-1. Characters that do not exist in that
    encoding (e.g. curly quotes) are written as ``?`` instead of raising
    ``UnicodeEncodeError``, so a single exotic character no longer loses the row.

    Raises:
        ValueError: if ``df`` contains a key that is not in ``csv_header``.
    """
    with _CSV_WRITE_LOCK:
        with open(
            f"{filename}.csv", "a+", newline="", encoding="ISO-8859-1", errors="replace"
        ) as f:
            writer = csv.DictWriter(f, fieldnames=csv_header)
            # Jump to the end explicitly: an offset of 0 there means the file is empty.
            f.seek(0, 2)
            if f.tell() == 0:
                writer.writeheader()
            writer.writerow(df)

Getting the level 1 PAA ("People Also Ask") questions for each seed keyword

In [None]:
session = requests.Session()
def get_paa(seed, level, paa, max_retries=5, timeout=10):
    """Search Google for ``seed`` and append its related questions to ``level_<level>.csv``.

    Args:
        seed: search query; also stored as the ``parent`` of every question found.
        level: number used to build the output file name ``level_<level>``.
        paa: accepted for call compatibility; not used by this function.
        max_retries: maximum number of request attempts before giving up.
        timeout: per-request timeout in seconds.

    The request is retried with an exponentially growing, jittered pause on
    network errors and non-200 responses. After ``max_retries`` failed attempts
    a message is printed and nothing is written, instead of looping forever
    without any delay.
    """
    URL = "https://www.google.com/search"
    params = {"q": seed, "gl": "us"}
    failure = "no attempt was made"
    for attempt in range(1, max_retries + 1):
        try:
            response = session.get(URL, params=params, headers=header, timeout=timeout)
            if response.status_code == 200:
                break
            failure = f"HTTP {response.status_code}"
        except requests.RequestException as exc:
            failure = repr(exc)
        if attempt < max_retries:
            # Back off (2s, 4s, 8s, ... plus jitter) before the next attempt.
            time.sleep(2 ** attempt + random.random())
    else:
        print(f"get_paa: giving up on {seed!r} after {max_retries} attempts ({failure})")
        return
    s = bs(response.text, "html5lib")
    # getting the paa questions: keep only the text that precedes the "Search for:" suffix
    questions = [
        div.text.split("Search for:")[0]
        for div in s.find_all("div", class_="related-question-pair")
    ]
    paa_file = f"level_{level}"
    csv_header = ["paa", "parent"]
    for question in questions:
        write_to_csv({"paa": question, "parent": seed}, paa_file, csv_header)
# Seed keywords whose related questions start the crawl; edit this list to scrape other topics.
seeds_keywords = ["world cup"]
# Requests are only sent when scraping has been enabled through `scrape_paa`.
if scrape_paa:
    for seed in seeds_keywords:
        # level=1 -> questions are appended to level_1.csv; the seed is passed as both query and `paa`.
        get_paa(seed, 1, seed)

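This function collects the answer data (snippet text, source URL and page title) for each PAA question and stores the question's own related questions for the next level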

In [None]:
def get_paa_data(paa, level, seed, filename=None, max_retries=5, timeout=10):
    """Scrape the answer for one PAA question and queue its related questions.

    Args:
        paa: the question to search for.
        level: level number used for the file (``level_<level>.csv``) that
            receives the related questions found on the result page.
        seed: parent question/keyword of ``paa``; stored in the "Parent" column.
        filename: answer output path without ``.csv``. Defaults to the
            module-level ``fileName`` set by the scraping driver.
        max_retries: maximum number of request attempts before giving up.
        timeout: per-request timeout in seconds.

    Questions that are not non-empty strings, or that contain an excluded
    keyword (matched case-insensitively), are skipped. For the result page,
    each answer shape found after the level-3 heading (``<ul>``, ``<ol>``,
    ``<span class="hgKElc">``) is extracted and appended as one row. A failure
    in one extractor is reported and does not prevent the others from running.

    Raises:
        NameError: if ``filename`` is omitted and ``fileName`` is not defined.
    """
    excluded_keywords = ["health", "cancer"]  #excluded keywords
    # Missing values read back from CSV arrive as float NaN; there is nothing to search for.
    if not isinstance(paa, str) or not paa.strip():
        return
    paa_lower = paa.lower()
    if any(keyword in paa_lower for keyword in excluded_keywords):
        return
    if filename is None:
        # Fail loudly here if the driver never set it, rather than silently dropping every row.
        filename = fileName

    csv_header = ["PAA Title", "Parent", "Text", "URL", "URL Title"]
    URL = "https://www.google.com/search"
    params = {"q": paa, "gl": "us"}
    failure = "no attempt was made"
    for attempt in range(1, max_retries + 1):
        try:
            r = session.get(URL, params=params, headers=header, timeout=timeout)
            if r.status_code == 200:
                break
            failure = f"HTTP {r.status_code}"
        except requests.RequestException as exc:
            failure = repr(exc)
        if attempt < max_retries:
            # Back off (2s, 4s, 8s, ... plus jitter) before the next attempt.
            time.sleep(2 ** attempt + random.random())
    else:
        print(f"get_paa_data: giving up on {paa!r} after {max_retries} attempts ({failure})")
        return
    sp = bs(r.text, "html5lib")

    # Answer box: look the heading up once, then try each supported answer shape.
    heading = sp.find("div", {"role": "heading", "aria-level": "3"})
    if heading is not None:
        candidates = (
            (heading.find_next("ul"), get_ul),  # ul list
            (heading.find_next("ol"), get_ol),  # ol list
            (heading.find_next("span", class_="hgKElc"), get_snippet),  # plain snippet
        )
        for marker, extract in candidates:
            if marker is None:
                continue
            try:
                write_to_csv(extract(sp, paa, seed), filename, csv_header)
            except Exception as exc:
                # Best effort: report the problem and carry on with the next shape.
                print(f"get_paa_data: {extract.__name__} failed for {paa!r}: {exc!r}")

    # Related questions feed the next level; keep only the text before "Search for:".
    paa_file = f"level_{level}"
    questions_header = ["paa", "parent"]
    for div in sp.find_all("div", class_="related-question-pair"):
        question = div.text.split("Search for:")[0]
        write_to_csv({"paa": question, "parent": paa}, paa_file, questions_header)
paa_level = 4 #change your level here
if scrape_paa:
    # Timestamped base name (without ".csv") of the answers file for this run.
    fileName = f"google_ppa_{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"

    def run(my_iter):
        """Call get_paa_data for every item of ``my_iter`` on a 5-thread pool with a progress bar."""
        with ThreadPoolExecutor(max_workers=5) as executor:
            # list(...) drains the lazy map so all tasks finish (and any exception surfaces) here.
            list(tqdm(executor.map(lambda f: get_paa_data(**f), my_iter), total = len(my_iter)))

    for i in range(1, paa_level + 1): #you can change the level here.
        level_path = f"level_{i}.csv"
        if not os.path.exists(level_path):
            # No questions were collected for this level, so there is nothing deeper to crawl.
            print(f"{level_path} not found - stopping at level {i}")
            break
        # The level files are written as ISO-8859-1, so read them back with the same encoding.
        level = pd.read_csv(level_path, encoding="ISO-8859-1") #reading the level csv file
        level = level.dropna(subset=["paa"]) #rows without a question cannot be searched
        level = level.drop_duplicates(subset=["paa"], keep="first") #removing duplicates
        level_dict = level.to_dict(orient="records")
        items = [
            {"paa": row["paa"], "level": i + 1, "seed": row["parent"]}
            for row in level_dict
        ]  #items list for threadpool
        run(items)

    # Remove the intermediate level files because every run regenerates them.
    # Always covers level_1..level_9 and extends further when paa_level is larger.
    highest_level = max(paa_level + 1, 9)
    files_to_delete = [f"level_{n}.csv" for n in range(1, highest_level + 1)]
    for f in files_to_delete:
        if os.path.exists(f):
            os.remove(f)
            print(f)


The next section is for keyword clustering

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cluster
import pandas as pd
import numpy as np
import nltk
# Download the NLTK "stopwords" corpus, which stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

In [None]:
stemmer = PorterStemmer()
sw = stopwords.words('english')
def tokenizer(keyword):
    """Split ``keyword`` on whitespace and return the Porter stem of each word, in order."""
    stems = []
    for word in keyword.split():
        stems.append(stemmer.stem(word))
    return stems

Read the latest file from directory

In [None]:
import glob
import os

# Candidate inputs: every CSV in the working directory except clustering outputs
# (names ending in "_clustered_file.csv", as written by the saving cell), so that a
# re-run does not pick up its own previous result as the newest file.
list_of_files = [
    path for path in glob.glob('*.csv')
    if not path.endswith('_clustered_file.csv')
]
if not list_of_files:
    raise FileNotFoundError(
        "No scraped CSV file found in the current directory; "
        "run the scraping cells with scrape_paa = True first."
    )
# Most recently created/changed file wins.
latest_file = max(list_of_files, key=os.path.getctime)
print(latest_file)

In [None]:
# Load the scraped answers; set `latest_file` to another path to cluster a different file.
keywords_df = pd.read_csv(latest_file, encoding= "ISO-8859-1")
keywords_df = (
    keywords_df
    .dropna(subset=['PAA Title'])                         # rows without a question cannot be clustered
    .drop_duplicates(subset=['PAA Title'], keep='first')
    .reset_index(drop=True)                               # contiguous 0..n-1 index, aligned with `keywords`
)
# astype(str) guards against titles that pandas parsed as numbers.
keywords = [k.replace("?", "").strip() for k in keywords_df['PAA Title'].astype(str)]

In [None]:
# TF-IDF matrix: one row per keyword, one column per stemmed term.
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words=sw)
X = pd.DataFrame(tfidf.fit_transform(keywords).toarray(),
                 index=keywords, columns=tfidf.get_feature_names_out())
# A fixed random_state makes the cluster assignment reproducible from run to run.
c = cluster.AffinityPropagation(random_state=0) # For prediction
# fit_predict runs on the pure feature matrix; the labels are attached afterwards.
X['pred'] = c.fit_predict(X) #adding to dataframe

Saving to file

In [None]:
# Reduce X to (keyword, cluster label). Selecting 'pred' before resetting the index
# avoids a clash when one of the TF-IDF feature columns is itself named "index".
X = X[['pred']].rename_axis('paa_title').reset_index()
# Assign by position, not by index label: keywords_df keeps the row labels left over
# from drop_duplicates, while X is numbered 0..n-1, so label alignment would shift
# or blank out the clusters. Both objects hold the keywords in the same row order.
keywords_df['paa_title'] = X['paa_title'].to_numpy()
keywords_df['clusters'] = X['pred'].to_numpy()
# Name the output after the file that was actually clustered; `fileName` only exists
# when the scraping cells ran in this session.
output_stem = os.path.splitext(latest_file)[0]
keywords_df.to_csv(f'{output_stem}_clustered_file.csv', index=False, encoding='ISO-8859-1')