In [5]:
from bs4 import BeautifulSoup
import requests
import re
from collections import Counter
from string import punctuation
import math
import pandas as pd
from ipynb.fs.full.functions import *

## Helper functions

In [6]:
# remove punctuation
def CleanText(text):
    text = str(text)
    forbidden = [r'\n', r'.', r'?', r'!', r'(', r')', r'\r']
    for i in forbidden:
        text.replace(i, ' ')
    return text

# returns amount of a specific word on a page (complete page)
def WordCount(webpage, word):
    words = ''.join([t for t in webpage.find_all(text=True)])
    words = CleanText(words.lower())
    words = words.split()
    return words.count(word.lower())

# returns amount of a specific word on a page (body only)
def WordCountBody(webpage, word):
    words = ''.join([t for t in webpage.body.find_all(text=True)])
    words = CleanText(words.lower())
    words = words.split()
    return words.count(word.lower())

# returns the total amount of words on a page
def TotalWordCount(website):
    # strip all script and style elements
    for script in website(["script", "style"]):
        script.extract()

    text = website.get_text()
    
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    
    # calculate amount of seperate words
    total_word_count = len(text.split())
    return total_word_count

# returns keyword density = keyword to total words ratio
def KeywordDensity(keyword, website):
    total_length = TotalWordCount(website)
    keyword_amount = WordCount(website,keyword)
    return keyword_amount / total_length

def ToPercentage(number):
    return round(number * 100, 2)

## Get rankingfactor data function

In [7]:
# returns dataframe with the keyword density as percentage
def GetKeywordDensity(keyword, df):
    rows_list = []
    urls = df["Ur"]
    
    for key, value in urls.iteritems():
        try:
            r = requests.get(value, timeout=5)
            soup = BeautifulSoup(r.content)   
            results = {"keyword_density_percentage":ToPercentage(KeywordDensity(keyword, soup))}
            rows_list.append(results)
        except requests.exceptions.Timeout:
            continue    
    
    return pd.DataFrame(rows_list)

In [8]:
d = {'Ur': ["https://www.bouwmaat.com/bouwmaterialen", "https://www.bouwbestel.nl/bouwmaterialen.html"], 'Iets': [3, 4]}
df = pd.DataFrame(data=d)
GetKeywordDensity("bouwmaterialen", df)

Unnamed: 0,keyword_density_percentage
0,0.17
1,0.28
