In [22]:
import csv
import datetime
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import Counter
from stop_words import get_stop_words

In [23]:
# Translation table that deletes every punctuation character clean_word strips:
# ! ? . : , ; ) ( - and the dash character '—'.
_PUNCTUATION_TABLE = str.maketrans("", "", "!?.:,;)(-—")


def clean_word(word):
    """Return ``word`` with a fixed set of punctuation characters removed.

    The characters ``! ? . : , ; ) ( -`` and ``—`` are deleted wherever they
    occur in the string, not only at its edges, so ``"well-known"`` becomes
    ``"wellknown"``. Any other character (quotes, ``^``, digits, ...) is kept.

    Args:
        word: The token to clean.

    Returns:
        The token with every listed character deleted. The result is an empty
        string when the token consisted solely of those characters.
    """
    # One translate pass replaces the former chain of str.replace calls. The
    # old explicit "--" replacement could never match, because every "-" had
    # already been removed by the step before it, so dropping it is a no-op.
    return word.translate(_PUNCTUATION_TABLE)

def clean_up_words(words):
    """Normalise raw tokens and drop stop words and empty tokens.

    Each token is lower-cased and passed through ``clean_word``. The cleaned
    token is discarded when it is empty or when it appears either in the
    English list returned by ``get_stop_words('en')`` or in the small
    hand-written list below.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list holding the surviving cleaned tokens in their original
        order; duplicates are kept so the caller can count them.
    """
    # Merge both stop-word sources into one set, built once per call, so each
    # token needs a single constant-time membership test.
    stop_words = set(get_stop_words('en'))
    stop_words.update(['the', 'a', 'an', 'in', 'is', 'are', 'and', 'or', 'if'])

    new_words = []
    for word in words:
        cleaned_word = clean_word(word.lower())
        # A token made only of stripped punctuation (e.g. "--") cleans down to
        # "", which is not a word and must not be counted as one.
        if not cleaned_word or cleaned_word in stop_words:
            continue
        new_words.append(cleaned_word)
    return new_words

def create_csv_path(csv_path):
    """Ensure ``csv_path`` exists, creating it with only a header row if not.

    When the file is already present it is left untouched. Otherwise any
    missing parent directories are created and a new file is written whose
    single row is the header ``word,count,timestamp``.

    Args:
        csv_path: Path of the CSV file to initialise.

    Returns:
        None.
    """
    if os.path.exists(csv_path):
        return

    # open() does not create directories, so a path such as 'csv/site.csv'
    # would raise FileNotFoundError when the 'csv' folder is missing.
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline='' lets the csv module control row endings itself; without it
    # Windows writes '\r\r\n' and readers see blank rows.
    with open(csv_path, 'w', newline='') as csvfile:
        header_columns = ['word', 'count', 'timestamp']
        writer = csv.DictWriter(csvfile, fieldnames=header_columns)
        writer.writeheader()

In [24]:
# Maps a domain to the CSS class of the <div> to read text from; any domain
# not listed here falls back to the page's <body>.
# NOTE(review): the lookup below uses the URL's netloc (e.g. "en.wikipedia.org"),
# so the placeholder keys "1" and "2" never match a real URL — replace them
# with actual domain names to make this mapping take effect.
saved_domains = {
    "1": "main-container",
    "2": "content-area"
}

# Strip stray whitespace from a pasted URL before it is parsed and requested.
my_url = input("Enter any url to scrape: ").strip()

print("Grabbing...", my_url)
domain = urlparse(my_url).netloc
print("via domain", domain)

# requests has no default timeout; without one an unresponsive server would
# block this cell indefinitely.
response = requests.get(my_url, timeout=30)
print("Status is", response.status_code)

if response.status_code != 200:
    print("You can't scrape this", response.status_code)
else:
    print("Scraping..")
    html = response.text
    soup = BeautifulSoup(html, "html.parser")

    # find() returns None when nothing matches, so fall back step by step:
    # configured <div> -> <body> -> the whole parsed document.
    body_ = None
    if domain in saved_domains:
        div_class = saved_domains[domain]
        body_ = soup.find("div", {"class": div_class})
    if body_ is None:
        body_ = soup.find("body")
    if body_ is None:
        body_ = soup

    words = body_.text.split()
    clean_words = clean_up_words(words)
    word_counts = Counter(clean_words)
    # Compute the top 30 once and reuse it for both the printout and the CSV.
    top_words = word_counts.most_common(30)
    print(top_words)

    # One CSV per domain, e.g. "en.wikipedia.org" -> "csv/en-wikipedia-org.csv".
    filename = domain.replace(".", "-") + '.csv'
    path = 'csv/' + filename
    # A single timestamp is shared by every row written in this run.
    timestamp = datetime.datetime.now()
    create_csv_path(path)
    # newline='' is what the csv module expects for files it writes, and UTF-8
    # keeps non-ASCII scraped words from failing under a narrower default
    # locale encoding.
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        header_columns = ['word', 'count', 'timestamp']
        writer = csv.DictWriter(csvfile, fieldnames=header_columns)
        for word, count in top_words:
            writer.writerow({
                "count": count,
                "word": word,
                "timestamp": timestamp
            })

Enter any url to scrape: https://en.wikipedia.org/wiki/2022_Russian_invasion_of_Ukraine
Grabbing... https://en.wikipedia.org/wiki/2022_Russian_invasion_of_Ukraine
via domain en.wikipedia.org
Status is 200
Scraping..
[('2022', 1437), ('february', 1067), ('^', 614), ('retrieved', 494), ('march', 469), ('archived', 426), ('original', 424), ('russian', 351), ('ukraine', 338), ('24', 261), ('war', 226), ('27', 195), ('russia', 192), ('26', 190), ('1', 185), ('25', 181), ('invasion', 143), ('ukrainian', 127), ('28', 112), ('military', 97), ('ukraine"', 94), ('2', 87), ('news', 84), ('putin', 80), ('3', 79), ('–', 76), ('4', 75), ('2014', 73), ('forces', 65), ('times', 61)]
