## Import packages

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## Function definitions

In [None]:
def replace_by_dict(map, text):
    """
    Apply every substitution listed in a dictionary to a piece of text.

    The entries are processed one after another in the dictionary's iteration
    order. Each pass swaps all occurrences of the key for its value, so a later
    entry also sees the text produced by the earlier ones.

    Parameters:
        map (dict): Substrings to look for (keys) and what to put in their place (values).
        text (str): The text to run the substitutions on.

    Returns:
        str: The text after all substitutions have been applied.
    """
    result = text
    for old in map:
        new = map[old]
        result = result.replace(old, new)
    return result

In [None]:
def get_blog_text(url, timeout=30):
    """
    Retrieve the main text content from a blog post.

    This function sends a GET request to the specified URL, parses the response body
    with BeautifulSoup and collects the text of the paragraph elements (<p>) returned
    by a search restricted with ``class_=''`` (meant to select paragraphs that carry
    no CSS class). The paragraph texts are joined into a single string, one
    paragraph per line.

    The HTTP status code is not checked: whatever body the server returns is parsed.

    Parameters:
        url (str): The URL of the blog post to retrieve the text from.
        timeout (float): Seconds to wait for the server before giving up. Without a
            timeout, requests may wait indefinitely on a stalled connection.

    Returns:
        str: The concatenated text content of the blog post.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content)
    paragraphs = soup.find_all('p', class_='')
    return '\n'.join(paragraph.text for paragraph in paragraphs)

In [None]:
def get_date_and_tags(text):
    """
    Extract the date and tags from a given text string.

    The text is expected to contain a date, then a note count such as ``1 note`` or
    ``3 notes``, then tags prefixed by '#'. The text is cut at the first note count:
    everything before it is parsed as the date, everything after it is split on '#'.

    Parameters:
        text (str): The input text string containing the date, note count, and tags.

    Returns:
        tuple: A tuple containing:
            - date (datetime.date): The extracted date.
            - tags (list of str): A list of extracted tags (empty strings removed).

    Raises:
        ValueError: If no note count is found in the text.
    """
    # Split directly on the note count (once) instead of substituting a '|' sentinel
    # and splitting on it, so a '|' or a second "N notes" inside the tags is harmless.
    # NOTE(review): only ONE digit is consumed as the count, as before. This is not
    # widened to \d+ because when the date is glued to the count (e.g. "...20233 notes")
    # that would swallow the year. Counts of 10 or more still leave their leading
    # digits in the date part -- confirm the input format before changing this.
    parts = re.split(r'\d notes?', text, maxsplit=1)
    if len(parts) != 2:
        raise ValueError(f'no note count found in {text!r}')
    date_part, tag_part = parts

    date = pd.to_datetime(date_part).date()
    tags = [tag for tag in tag_part.split('#') if tag]
    return date, tags


In [None]:
def _last_srcset_url(srcset):
    """Return the URL of the last candidate in an HTML ``srcset`` value, or '' if there is none."""
    # Each comma-separated candidate is "<url> [descriptor]"; splitting on any
    # whitespace makes this work with or without a space after the comma.
    candidates = [candidate.split() for candidate in srcset.split(',')]
    candidates = [tokens for tokens in candidates if tokens]
    return candidates[-1][0] if candidates else ''


def get_images(url, letter_dict, timeout=30):
    """
    Download images from a given URL and save them locally.

    This function fetches the HTML content from the specified URL and collects one image
    address per <img> element, skipping images whose 'alt' attribute is missing, empty or
    'Avatar'. The address is the last candidate of the element's 'srcset'; if there is no
    usable 'srcset', the 'src' attribute is used instead, and elements with neither are
    skipped. Every image is downloaded and written to the current working directory as
    '<title>_<index>.jpg', where the title is the last segment of the URL with the
    replacements from `letter_dict` applied. The '.jpg' extension is used regardless of
    the actual image format.

    Parameters:
        url (str): The URL from which to download images.
        letter_dict (dict): Replacements applied to the last URL segment to build the
            file name prefix (passed to `replace_by_dict`).
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        list: A list of URLs of the downloaded images.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    title = replace_by_dict(letter_dict, url.split('/')[-1])
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content)

    image_urls = []
    for img in soup.find_all('img'):
        # A missing 'alt' is treated like an empty one instead of raising KeyError.
        if img.get('alt', '') in ('Avatar', ''):
            continue
        source = _last_srcset_url(img.get('srcset', '')) or img.get('src', '')
        if source:
            image_urls.append(source)

    for index, image_url in enumerate(image_urls):
        image = requests.get(image_url, timeout=timeout)
        with open(f'{title}_{index}.jpg', 'wb') as handler:
            handler.write(image.content)

    return image_urls

## Set up variables

In [None]:
# Percent-encoded UTF-8 sequences mapped to the accented Hungarian letters they
# encode. Used to turn the last segment of a post URL into a readable title.
letter_dict = {
    # lowercase
    '%C3%A1': 'á',
    '%C3%A9': 'é',
    '%C3%AD': 'í',
    '%C3%B3': 'ó',
    '%C3%B6': 'ö',
    '%C5%91': 'ő',
    '%C3%BA': 'ú',
    '%C3%BC': 'ü',
    '%C5%B1': 'ű',
    # uppercase
    '%C3%81': 'Á',
    '%C3%89': 'É',
    '%C3%8D': 'Í',
    '%C3%93': 'Ó',
    '%C3%96': 'Ö',
    '%C5%90': 'Ő',
    '%C3%9A': 'Ú',
    '%C3%9C': 'Ü',
    '%C5%B0': 'Ű',
}

## Execute 

Read the downloaded Tumblr archive HTML file and convert it to a BeautifulSoup object.

For the code to work, you need to download the archive view of your Tumblr blog and paste the path to the downloaded HTML file below.
- The download is needed so that the file contains all the posts.
- Your archive view URL should look something like this:
    - https://username.tumblr.com/archive

In [None]:
# Path to the locally saved archive view of the Tumblr blog
# (the page at https://username.tumblr.com/archive, saved as an HTML file).
# The saved copy is used so that the file contains all the posts.
tumblr_archive_html = ''

# Fail with an actionable message when the path above has not been filled in,
# instead of the bare "No such file or directory: ''" that open('') produces.
if not tumblr_archive_html:
    raise FileNotFoundError(
        'Set tumblr_archive_html to the path of the downloaded Tumblr archive HTML file.'
    )

# Read the saved page and parse it into a BeautifulSoup object.
with open(tumblr_archive_html, 'r', encoding='utf-8') as file:
    html_content = file.read()
bs = BeautifulSoup(html_content)

- Extract all the links of the specific class
- Extract the URLs from the links
- Extract the titles from the URLs and convert them (the conversion replaces percent-encoded sequences so that the titles use Hungarian letters)

In [None]:
# Percent-encoded UTF-8 sequences mapped to the accented Hungarian letters they
# encode. Used to turn the last segment of a post URL into a readable title.
letter_dict = {
    # lowercase
    '%C3%A1': 'á',
    '%C3%A9': 'é',
    '%C3%AD': 'í',
    '%C3%B3': 'ó',
    '%C3%B6': 'ö',
    '%C5%91': 'ő',
    '%C3%BA': 'ú',
    '%C3%BC': 'ü',
    '%C5%B1': 'ű',
    # uppercase
    '%C3%81': 'Á',
    '%C3%89': 'É',
    '%C3%8D': 'Í',
    '%C3%93': 'Ó',
    '%C3%96': 'Ö',
    '%C5%90': 'Ő',
    '%C3%9A': 'Ú',
    '%C3%9C': 'Ü',
    '%C5%B0': 'Ű',
}

In [None]:
# Collect the post links from the parsed archive page: the anchors found
# when searching with the class string 'oKaff QmZ0e'.
all_links = bs.find_all('a', class_='oKaff QmZ0e')

# The address of each post is the link's href attribute.
all_urls = [link['href'] for link in all_links]

# The title is the last path segment of the post URL, with the percent-encoded
# sequences listed in letter_dict swapped for the letters they stand for.
all_titles = [
    replace_by_dict(letter_dict, post_url.rsplit('/', 1)[-1])
    for post_url in all_urls
]

Execute the functions to scrape all data
- text
- dates
- tags
- download images

In [None]:
# Run the scraping functions over every post:
#   - text of each post (one HTTP request per URL)
#   - date and tags, both parsed from the text of the archive link
#   - images, downloaded to the current working directory
all_text = [get_blog_text(url) for url in all_urls]
print('all text done')

# Parse each link text once and split the (date, tags) pairs afterwards,
# instead of running get_date_and_tags twice per link.
dates_and_tags = [get_date_and_tags(link.text) for link in all_links]
all_dates = [date for date, _ in dates_and_tags]
print('all dates done')
all_tags = [tags for _, tags in dates_and_tags]
print('all tags done')

# Plain loop: get_images is called for its side effect (writing files),
# so there is no reason to build and discard a list of its return values.
for url in all_urls:
    get_images(url, letter_dict)
print('all images done')

Convert the collected data to a pandas dataframe and write it to a csv and a json file

In [None]:
# Assemble the collected data into a pandas DataFrame (one row per post)
# and write it to a CSV and a JSON file.
data = pd.DataFrame({'title': all_titles,
                     'urls': all_urls,
                     'text': all_text,
                     'date': all_dates,
                     'tags': all_tags})
data.to_csv('erasmus_blog_content.csv', index=False)
# to_json's default orient for a DataFrame is 'columns', which rejects
# index=False with a ValueError. orient='records' writes one object per row
# and carries no index, which is what index=False was meant to achieve.
data.to_json('erasmus_blog_content.json', orient='records')
data.head(2)