# CSV processing for Redezeit's dashboard scraped data

This notebook's purpose is to process the scraped csv files from Redezeit's Looker Studio dashboard and rename the csv files with a more meaningful name.
We preferred a nb instead of a .py file to have a controlled view of each line of code.

In [25]:
import pandas as pd
import os


### Load files

In [26]:
# Setup paths and file names
BASE_DIR = BASE_DIR = os.getcwd()  # Gets the current working directory in a notebook
folder_path = os.path.join(BASE_DIR, '..', 'data')       #  Build relative path
folder_path = os.path.normpath(folder_path)     #  Clean up '..' for cross-platform compatibility

file_names = [
    'landingpage.csv',
    'user_behaviors.csv',
    'what_devices_used_chart.csv',
    'what_did_user_do.csv',
    'where_did_they_come_from.csv',
    'where_new_visitors_come_from_chart.csv',
    'who_was_visiting_chart.csv'
]

# create output folder for reworked CSVs
parent_folder = os.path.dirname(folder_path)  # go up one level
clean_folder = os.path.join(folder_path, "clean")
os.makedirs(clean_folder, exist_ok=True)

###  1) Precleaning:

Replace csv separator ',' with ';', and '.' to ',' to avoid formatting issues (',' is used as tex format for decimals)

In [27]:
#  pre-cleaning function (replace ',' with ';' and '.' with ',')

def pre_clean_file(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as infile:
        content = infile.read()

    #  step by step replace: 1) comma separation to semicolon
    content = content.replace(',', ';')

    #  2) decimal dots to commas
    content = content.replace ('.', ',')

    #  Save

    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write(content)

### 2)  Industry standard file naming

In [28]:
load_renames = {
    "cleaned_landingpage.csv":                "landing_page_views",
    "cleaned_user_behaviors.csv":             "user_sessions",
    "cleaned_what_devices_used_chart.csv":    "device_usage",
    "cleaned_what_did_user_do.csv":           "user_events",
    "cleaned_where_did_they_come_from.csv":   "traffic_sources",
    "cleaned_where_they_come_from_chart.csv": "traffic_source_chart",
    "cleaned_who_was_visiting_chart.csv":     "daily_visitors_chart",
}

### 3) Apply pre-cleaning and save

In [29]:
# Apply pre-cleaning and save with new names
cleaned_files = []
for file in file_names:
    raw_path = os.path.join(folder_path, file)
    cleaned_name = f"cleaned_{file}"

    # Use renamed filename if available
    final_name = load_renames.get(cleaned_name, cleaned_name) + ".csv"
    clean_path = os.path.join(clean_folder, final_name)

    pre_clean_file(raw_path, clean_path)
    cleaned_files.append(clean_path)

print("Files prepared and saved with renamed filenames.")

Files prepared and saved with renamed filenames.


### 5) Load CSV's into df

By loading everything in df's, we make sure things are smooth and working, ready to load into the DB. Converting types here makes sense because this way we can catch errors before we load everything in Postgres

In [30]:
# Load cleaned files using new names
dataframes = {}
for fname in os.listdir(clean_folder):
    if fname.endswith(".csv"):
        path = os.path.join(clean_folder, fname)
        key = load_renames.get(fname, fname.replace(".csv", ""))
        df = pd.read_csv(path, sep=';', encoding='utf-8')
        dataframes[key] = df
        print(f"{key} loaded from {fname}: {df.shape}")

cleaned_where_new_visitors_come_from_chart loaded from cleaned_where_new_visitors_come_from_chart.csv.csv: (7302, 3)
daily_visitors_chart loaded from daily_visitors_chart.csv: (636, 3)
device_usage loaded from device_usage.csv: (2293, 3)
landing_page_views loaded from landing_page_views.csv: (16804, 4)


ParserError: Error tokenizing data. C error: Expected 6 fields in line 7526, saw 11


### 6)  Cleaning function

In [14]:
def clean_all_columns(df):
    """
Cleans and standardizes all columns in a pandas DataFrame by converting dates, durations,percentages, and localized numeric formats into consistent types.

Args:
    df: A pandas DataFrame containing raw, string-based or mixed-type columns.

Returns:
    -Pandas DataFrame with cleaned columns.
    -Datetime columns are parsed, duration columns are converted to time strings and numeric values
    -Percentages are normalized to floats.
    -German-style numeric strings are converted to float values.
    -All other string columns are preserved as-is.

Raises:
    None explicitly, but columns with unsupported formats may result in NaNs due to coercion.
"""
    # Define fields to skip numeric conversion
    text_fields = {"name des events", "even_label", "kategorie", "quelle", "source"}

    for col in df.columns:
        is_obj = df[col].dtype == "object"
        sample = df[col].dropna().astype(str) if is_obj else None

        # 1) Date conversion
        if "datum" in col.lower():
            df[col] = pd.to_datetime(df[col], errors="coerce")
            print(f"📅 Converted to datetime: {col}")
            continue

        # 2) Time durations hh:mm:ss
        if is_obj and sample.str.match(r"^\d{2}:\d{2}:\d{2}$").all():
            td = pd.to_timedelta(df[col], errors="coerce")
            # human-readable HH:MM:SS
            df[col] = td.astype(str).str[-8:]
            # numeric columns
            df[f"{col}_seconds"] = td.dt.total_seconds()
            df[f"{col}_days"]    = td.dt.total_seconds() / 86400
            print(f"⏱️ Processed time column: {col}")
            continue

        # 3) Percentage conversion (only pure percent strings)
        if is_obj and sample.str.match(r"^[\d\.\,]+\s*%$").all():
            df[col] = (
                sample.str.replace("%", "", regex=False)
                      .str.replace(",", ".", regex=False)
                      .astype(float) / 100
            )
            print(f"📊 Converted percentage: {col}")
            continue

        # 4) German-style numeric conversion
        if is_obj and col.lower() not in text_fields:
            # check pure numeric patterns
            if sample.str.match(r"^[\d\.\,]+$").all():
                cleaned = sample.str.replace(".", "", regex=False)  # remove thousands
                cleaned = cleaned.str.replace(",", ".", regex=False)  # decimal comma
                df[col] = pd.to_numeric(cleaned, errors="coerce")
                print(f"🔢 Converted to numeric: {col}")
                continue

        # 5) Preserve any other object columns as strings
        if is_obj:
            df[col] = sample
            print(f"🔤 Preserved string column: {col}")

    return df


### 7) Run them Jewells!

In [15]:
for name, df in dataframes.items():
    print(f"Cleaning DataFrame: {name}")
    dataframes[name] = clean_all_columns(df)

Cleaning DataFrame: daily_visitors_chart
📅 Converted to datetime: Datum
🔤 Preserved string column: Kategorie
Cleaning DataFrame: device_usage
📅 Converted to datetime: Datum
🔤 Preserved string column: Kategorie
Cleaning DataFrame: landing_page_views
📅 Converted to datetime: Datum
🔤 Preserved string column: Seitentitel
Cleaning DataFrame: traffic_sources
📅 Converted to datetime: Datum
🔤 Preserved string column: Quelle
Cleaning DataFrame: traffic_source_chart
📅 Converted to datetime: Datum
🔤 Preserved string column: Kategorie
Cleaning DataFrame: user_events
📅 Converted to datetime: Datum
🔤 Preserved string column: Name des Events
🔤 Preserved string column: even_label
Cleaning DataFrame: user_sessions
📅 Converted to datetime: Datum
⏱️ Processed time column: Durchschn, Zeit auf der Seite


### 8) Datatype check for each df

In [16]:
for name, df in dataframes.items():
    print(f"'{name}' data types:")
    print(df.dtypes)
    print("-" * 40)

'daily_visitors_chart' data types:
Datum        datetime64[ns]
Kategorie            object
Wert                  int64
dtype: object
----------------------------------------
'device_usage' data types:
Datum        datetime64[ns]
Kategorie            object
Wert                float64
dtype: object
----------------------------------------
'landing_page_views' data types:
Datum          datetime64[ns]
EID                   float64
Seitentitel            object
Aufrufe               float64
dtype: object
----------------------------------------
'traffic_sources' data types:
Datum                  datetime64[ns]
EID                           float64
Quelle                         object
Sitzungen                       int64
Aufrufe                       float64
Aufrufe pro Sitzung           float64
dtype: object
----------------------------------------
'traffic_source_chart' data types:
Datum        datetime64[ns]
Kategorie            object
Wert                  int64
dtype: object
------

### 9) Head check for each df

In [17]:
for name, df in dataframes.items():
    print(f"'{name}' head:")
    print(df.head())
    print("-" * 60)

'daily_visitors_chart' head:
       Datum Kategorie  Wert
0 2023-01-12    female    15
1 2023-01-23    female    12
2 2023-01-31    female    11
3 2023-03-25    female    15
4 2023-03-27    female    11
------------------------------------------------------------
'device_usage' head:
       Datum Kategorie  Wert
0 2023-02-12    mobile  49.0
1 2023-02-12   desktop  48.0
2 2023-02-13   desktop  64.0
3 2023-02-13    mobile  36.0
4 2023-02-14   desktop  63.0
------------------------------------------------------------
'landing_page_views' head:
       Datum  EID                                        Seitentitel  Aufrufe
0 2023-01-01  1.0  REDEZEIT FÜR DICH Start DE | REDEZEIT FÜR DICH...     11.0
1 2023-01-01  2.0  Ich biete Redezeit, | REDEZEIT FÜR DICH #virtu...      2.0
2 2023-01-01  3.0  Lesezeit – das Redezeit Blog, | REDEZEIT FÜR D...      2.0
3 2023-01-01  4.0  SEO WER SIND WIR DE | REDEZEIT FÜR DICH #virtu...      2.0
4 2023-01-01  5.0  Du suchst REDEZEIT für Dich DE | REDEZEIT FÜ

In [23]:
with open(path, encoding='utf-8') as file:
    lines = file.readlines()
    print(lines[16634])  # Python uses 0-based indexing

2025-06-20;19,;Über uns | REDEZEIT FÜR DICH #virtualsupporttalks;02025-06-21;1,;Willkommen | REDEZEIT FÜR DICH #virtualsupporttalks;14



In [31]:
with open(path, encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i == 7525:  # line number is zero-indexed
            print(line)

2025-06-27;9,;duckduckgo;1;0;02025-06-28;1,;google;31;30;0,97

