# Datascraping und Optical Character Recognition


## Datamanagement und -scraping

In [None]:
# Mount Google Drive so the project files under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

import zipfile
import json
import os
import pandas as pd

# EasyOCR is installed at a pinned version (1.2.1) before it is imported below.
!pip -q install easyocr==1.2.1

import easyocr
# Module-level OCR reader created with the language codes 'de' and 'en'.
reader = easyocr.Reader(['de','en'])

In [None]:
# Semicolon-separated post export, read from the mounted Drive.
four_cat_file_path = (
    "/content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl.csv"
)

df = pd.read_csv(filepath_or_buffer=four_cat_file_path, sep=";")

In [None]:
# Parse the timestamp strings so they can be compared against the date bounds.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')

start_date = '2023-03-31'
end_date = '2023-10-09'

# Keep only rows whose timestamp lies in [start_date, end_date]; both bounds
# are inclusive. .copy() detaches the result from the unfiltered frame.
df = df.loc[df['timestamp'].between(start_date, end_date)].copy()

In [None]:
# Archive that contains the downloaded images together with a .metadata.json file.
zip_file_path = '/content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl.zip'
output_zip_file_path = ''  # NOTE(review): not referenced anywhere in the visible notebook

# Local folder the archive is unpacked into.
four_cat_folder = "cat-export/"

# Target folder for the renamed files (despite the name, it receives .jpg images).
video_path = "media/images"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(four_cat_folder)

print(f"Files extracted to {four_cat_folder}")

# os.path.join avoids the doubled slash an f-string produces here, because
# four_cat_folder already ends with "/".
metadata_file_path = os.path.join(four_cat_folder, '.metadata.json')

# Read the JSON metadata with an explicit encoding instead of the platform default.
with open(metadata_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# exist_ok=True replaces the separate existence check and makes re-runs safe.
os.makedirs(video_path, exist_ok=True)

# Move every successfully downloaded file to media/images/<post_id>.jpg,
# using the first post ID listed for that file.
for item in data.values():
    if not item.get('success', False):
        continue

    post_ids = item.get('post_ids') or []
    if not post_ids:
        # Without a post ID there is no target name; report it instead of
        # failing with an IndexError in the middle of the loop.
        print(f"Skipping {item.get('filename')}: no post ID in metadata")
        continue

    post_id = post_ids[0]
    filename = item['filename']
    print(f"Processing Post ID: {post_id}, Filename: {filename}")
    source_path = os.path.join(four_cat_folder, filename)
    destination_path = os.path.join(video_path, f"{post_id}.jpg")
    os.rename(source_path, destination_path)

In [None]:
# Shell command: recursively zip the local "media" folder into heisl_images_ref.zip on Drive.
!zip -r /content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl_images_ref.zip media

In [None]:
# Image path per post, derived from the 'id' column: media/images/<id>.jpg
df['image_file'] = df['id'].map(lambda post_id: f"media/images/{post_id}.jpg")

In [None]:
# Drop rows whose image path is "media/images/nan.jpg" (an id that formats as "nan").
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df = df[df['image_file'] != "media/images/nan.jpg"].copy()

## Optical Character Recognition

In [None]:
def run_ocr(image_path):
    """Run OCR on one image and return the recognised text as a single string.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text fragments returned by ``reader.readtext`` joined with single
        spaces. An empty string is returned (and a notice printed) when
        ``image_path`` is a local path that does not point to an existing file.
    """
    # Only plain local paths are checked; anything else is handed to the
    # reader unchanged.
    # NOTE(review): readtext may also accept URLs or in-memory images - confirm.
    is_local_path = isinstance(image_path, str) and not image_path.startswith(
        ("http://", "https://")
    )
    if is_local_path and not os.path.isfile(image_path):
        # A single missing image would otherwise abort an OCR run applied
        # over a whole column of paths.
        print(f"Image not found, skipping OCR: {image_path}")
        return ""

    # detail = 0 is passed through unchanged; the result is joined as a list of strings.
    ocr_result = reader.readtext(image_path, detail = 0)
    return " ".join(ocr_result)

# Run OCR once per image path and store the recognised text alongside each post.
df['ocr_text'] = df['image_file'].map(run_ocr)

In [None]:
# Persist the posts including the OCR text; the row index is not written.
# Note: this uses pandas' default separator, while the input CSV was read with sep=";".
ocr_csv_path = '/content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl_posts_01_04-08_10_ocr.csv'
df.to_csv(ocr_csv_path, index=False)

In [None]:
# Number of rows currently in df.
len(df)

94

## Datasaving

In [None]:
# Reference archive created earlier, plus the final archive and folder to produce.
original_zip_path = '/content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl_images_ref.zip'
output_zip_path = '/content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl_images_final.zip'
output_folder = '/content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl_images_final'

os.makedirs(output_folder, exist_ok=True)

# Open the reference archive once for the whole loop instead of re-opening
# (and re-reading its directory) for every single image.
with zipfile.ZipFile(original_zip_path, 'r') as zip_ref:
    for filename in df['id']:
        # Each value of df['id'] names one image inside the archive.
        file_path_in_zip = f"media/images/{filename}.jpg"
        output_file_path = os.path.join(output_folder, f"{filename}.jpg")

        # Copy the archive member into the output folder under a flat name.
        with zip_ref.open(file_path_in_zip) as source, open(output_file_path, 'wb') as target:
            target.write(source.read())

# Pack everything found in the output folder into the final archive, storing
# paths relative to that folder.
with zipfile.ZipFile(output_zip_path, 'w') as zipf:
    for root, _, files in os.walk(output_folder):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, output_folder))

# The reference archive is deleted once the final archive has been written.
os.remove(original_zip_path)

print("Extraktion abgeschlossen. Die extrahierten Dateien befinden sich im Zip-Ordner:", output_zip_path)

Extraktion abgeschlossen. Die extrahierten Dateien befinden sich im Zip-Ordner: /content/drive/MyDrive/Projekt_Mobilisierung/Data/CSU/Kandidierende/heisl/heisl_images_final.zip


Quellen:

Achmann-Denkler, M. (2024). michaelachmann/social-media-lab: DOI Release (v0.0.12). Zenodo. https://doi.org/10.5281/zenodo.10618621

Achmann-Denkler, M. (2023). “OCR.” November 27, 2023. https://doi.org/10.5281/zenodo.10039756.

Peeters, S. (2023). Zeeschuimer (v1.8.0). Zenodo. https://doi.org/10.5281/zenodo.8399900.

Peeters, S., Hagen, S., & Wahl, D. (2023). 4CAT Capture and Analysis Toolkit (v1.36). Zenodo. https://doi.org/10.5281/zenodo.8139174.
