In [3]:
# staging.ipynb
# ----------------------
# 1. Подключение к PostgreSQL
# ----------------------
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
from torgstat.db import get_engine, DB_SCHEMA

# Load environment variables
load_dotenv()

# Create the SQLAlchemy engine shared by every read and write in this notebook.
# NOTE(review): get_engine() presumably takes its connection settings from the
# environment loaded just above, so load_dotenv() must run first — confirm in torgstat.db.
engine = get_engine()

def read_table(table_name: str):
    """Load every row and column of one table from the DB_SCHEMA schema.

    Args:
        table_name: Name of the table inside DB_SCHEMA. It is interpolated into
            the SQL text verbatim (no quoting or escaping), so pass only
            trusted, hard-coded identifiers.

    Returns:
        Whatever ``pd.read_sql`` returns for ``SELECT *`` on that table, read
        through the module-level ``engine``.
    """
    return pd.read_sql(f'SELECT * FROM {DB_SCHEMA}.{table_name};', engine)

# Status message for the reader of the notebook.
# NOTE(review): it is printed unconditionally, and no statement in this cell visibly
# opens a connection — confirm whether get_engine() actually connects before trusting it.
print("Подключение к PostgreSQL успешно установлено.")

Подключение к PostgreSQL успешно установлено.


In [None]:
# ----------------------
# 2. Read the raw tables
# ----------------------

df_users = read_table("users")
df_sessions = read_table("sessions")
df_subs = read_table("subscriptions")
df_invoices = read_table("invoices")
df_plans = read_table("plans")
df_events = read_table("events")

# Label -> frame mapping: every inspection step below iterates over it, so all
# six tables are reported in the same order without one hand-written line per table.
raw_tables = {
    "Users": df_users,
    "Sessions": df_sessions,
    "Subscriptions": df_subs,
    "Invoices": df_invoices,
    "Plans": df_plans,
    "Events": df_events,
}

# Quick look: (rows, columns) of each table
for label, raw_df in raw_tables.items():
    print(f"{label}:", raw_df.shape)

# Dimensions of the users table.
# Print the shape tuple itself, not type(shape), which is always <class 'tuple'>.
print(f"Размер DataFrame: {df_users.shape}")
print(f"Количество строк: {df_users.shape[0]}")
print(f"Количество столбцов: {df_users.shape[1]}")

# General information about each table (DataFrame.info)
for raw_df in raw_tables.values():
    raw_df.info()


# Preview the first rows of each table.
# NOTE(review): these previews may render real user data; clear the cell outputs
# before sharing or committing the notebook.
for raw_df in raw_tables.values():
    display(raw_df.head())

In [None]:
# ----------------------
# 3. Build the staging tables
# ----------------------
# Each staging frame is derived from its raw frame in a single expression;
# assign() works on a copy, so the raw df_* frames are left untouched.

# STG_USERS: parse signup_date, then keep one row per user_id
df_stg_users = (
    df_users
    .assign(signup_date=lambda raw: pd.to_datetime(raw['signup_date']))
    .drop_duplicates(subset='user_id')
)

# STG_SESSIONS: parse session_date, then keep one row per session_id
df_stg_sessions = (
    df_sessions
    .assign(session_date=lambda raw: pd.to_datetime(raw['session_date']))
    .drop_duplicates(subset='session_id')
)

# STG_SUBSCRIPTIONS: parse start_date and label missing statuses as 'unknown'
df_stg_subs = df_subs.assign(
    start_date=lambda raw: pd.to_datetime(raw['start_date']),
    status=lambda raw: raw['status'].fillna('unknown'),
)

# STG_INVOICES: parse the three date columns and cast amount to float
df_stg_invoices = df_invoices.assign(
    period_start=lambda raw: pd.to_datetime(raw['period_start']),
    period_end=lambda raw: pd.to_datetime(raw['period_end']),
    invoice_date=lambda raw: pd.to_datetime(raw['invoice_date']),
    amount=lambda raw: raw['amount'].astype(float),
)

# STG_EVENTS: parse event_date
df_stg_events = df_events.assign(
    event_date=lambda raw: pd.to_datetime(raw['event_date']),
)

# ----------------------
# 4. Write the staging tables back to the database (schema DB_SCHEMA)
# ----------------------
staging_tables = {
    "stg_users": df_stg_users,
    "stg_sessions": df_stg_sessions,
    "stg_subscriptions": df_stg_subs,
    "stg_invoices": df_stg_invoices,
    "stg_events": df_stg_events,
}

# Write every table through one connection inside a single transaction.
# if_exists='replace' drops and recreates each table; engine.begin() commits only
# if all writes succeed and rolls back if any of them raises, so the schema never
# ends up with a mix of refreshed and stale staging tables.
# NOTE(review): relies on `engine` being a SQLAlchemy Engine and on the database
# supporting transactional DDL (PostgreSQL does) — confirm against get_engine().
with engine.begin() as connection:
    for table_name, df in staging_tables.items():
        df.to_sql(table_name, connection, schema=DB_SCHEMA, if_exists='replace', index=False)

# Report only after the transaction has committed, so the message is never
# printed for a table whose write was later rolled back.
for table_name in staging_tables:
    print(f"[INFO] Таблица {DB_SCHEMA}.{table_name} сохранена.")