In [1]:
# staging.ipynb
# ----------------------
# 1. Подключение к PostgreSQL
# ----------------------
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
from torgstat.db import get_engine, SCHEMA

# Загружаем переменные окружения
load_dotenv()

# Создаём движок SQLAlchemy
engine = get_engine()

# Функция для чтения таблицы из схемы analytics
def read_table(table_name: str):
    query = f'SELECT * FROM {SCHEMA}.{table_name};'
    return pd.read_sql(query, engine)

ImportError: cannot import name 'SCHEMA' from 'torgstat.db' (/Users/admin/torgstat-analytics-case/src/torgstat/db.py)

In [27]:
# ----------------------
# 2. Читаем сырые таблицы
# ----------------------

df_users = read_table("users")
df_sessions = read_table("sessions")
df_subs = read_table("subscriptions")
df_invoices = read_table("invoices")
df_plans = read_table("plans")
df_events = read_table("events")

# Быстрый просмотр
print("Users:", df_users.shape)
print("Sessions:", df_sessions.shape)
print("Subscriptions:", df_subs.shape)
print("Invoices:", df_invoices.shape)
print("Plans:", df_plans.shape)
print("Events:", df_events.shape)

# Размер данных
print(f"Размер DataFrame: {type(df_users.shape)}")
print(f"Количество строк: {df_users.shape[0]}")
print(f"Количество столбцов: {df_users.shape[1]}")

# Общая информация
df_users.info()
df_sessions.info()
df_subs.info()
df_invoices.info()
df_plans.info()
df_events.info()


# Просмотр первых и случайных строк

display(df_users.head())
display(df_sessions.head())
display(df_subs.head())
display(df_invoices.head())
display(df_plans.head())
display(df_events.head())

Users: (2000, 3)
Sessions: (5914, 7)
Subscriptions: (658, 5)
Invoices: (2167, 9)
Plans: (3, 4)
Events: (16251, 3)
Размер DataFrame: <class 'tuple'>
Количество строк: 2000
Количество столбцов: 3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      2000 non-null   int64 
 1   signup_date  2000 non-null   object
 2   region       2000 non-null   object
dtypes: int64(1), object(2)
memory usage: 47.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5914 entries, 0 to 5913
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   session_id        5914 non-null   object
 1   user_id           5914 non-null   int64 
 2   session_date      5914 non-null   object
 3   utm_source        5914 non-null   object
 4   utm_medium        5914 non-null   object
 5   utm_campaig

Unnamed: 0,user_id,signup_date,region
0,1,2024-04-12,Казань
1,2,2024-03-15,Москва
2,3,2024-02-22,Новосибирск
3,4,2024-02-27,Казань
4,5,2024-01-15,Москва


Unnamed: 0,session_id,user_id,session_date,utm_source,utm_medium,utm_campaign,is_first_session
0,sess_1_1,1,2024-04-12,google,cpc,generic,True
1,sess_1_2,1,2024-05-03,google,cpc,generic,False
2,sess_1_3,1,2024-07-04,google,cpc,generic,False
3,sess_2_1,2,2024-03-15,google,cpc,brand,True
4,sess_2_2,2,2024-03-18,google,cpc,brand,False


Unnamed: 0,subscription_id,user_id,plan_id,start_date,status
0,1001,1,2,2024-04-16,active
1,1002,2,2,2024-03-19,churned
2,1003,5,1,2024-01-26,churned
3,1004,10,1,2024-03-05,churned
4,1005,13,2,2024-04-04,churned


Unnamed: 0,invoice_id,subscription_id,user_id,period_start,period_end,invoice_date,amount,paid,is_initial
0,50001,1001,1,2024-04-16,2024-05-16,2024-04-16,999.0,True,True
1,50002,1001,1,2024-05-16,2024-06-15,2024-05-16,999.0,True,False
2,50003,1001,1,2024-06-15,2024-07-15,2024-06-15,999.0,True,False
3,50004,1001,1,2024-07-15,2024-08-14,2024-07-15,999.0,True,False
4,50005,1001,1,2024-08-14,2024-09-13,2024-08-14,999.0,True,False


Unnamed: 0,plan_id,plan_name,period,price
0,1,Basic,monthly,499.0
1,2,Pro,monthly,999.0
2,3,Business,monthly,1999.0


Unnamed: 0,user_id,event_date,event_name
0,1,2024-04-12,app_open
1,1,2024-04-13,app_open
2,1,2024-04-13,feature_b
3,1,2024-04-15,app_open
4,1,2024-04-17,app_open


In [None]:
# ----------------------
# 3. Создаём staging-таблицы
# ----------------------
# STG_USERS: очистка и нормализация
df_stg_users = df_users.copy()
df_stg_users['signup_date'] = pd.to_datetime(df_stg_users['signup_date'])
df_stg_users = df_stg_users.drop_duplicates(subset='user_id')

# STG_SESSIONS: проверяем дубликаты, типы данных
df_stg_sessions = df_sessions.copy()
df_stg_sessions['session_date'] = pd.to_datetime(df_stg_sessions['session_date'])
df_stg_sessions = df_stg_sessions.drop_duplicates(subset='session_id')

# STG_SUBSCRIPTIONS: статус и даты
df_stg_subs = df_subs.copy()
df_stg_subs['start_date'] = pd.to_datetime(df_stg_subs['start_date'])
df_stg_subs['status'] = df_stg_subs['status'].fillna('unknown')

# STG_INVOICES: даты и суммы
df_stg_invoices = df_invoices.copy()
df_stg_invoices['period_start'] = pd.to_datetime(df_stg_invoices['period_start'])
df_stg_invoices['period_end'] = pd.to_datetime(df_stg_invoices['period_end'])
df_stg_invoices['invoice_date'] = pd.to_datetime(df_stg_invoices['invoice_date'])
df_stg_invoices['amount'] = df_stg_invoices['amount'].astype(float)

# STG_EVENTS: даты и события
df_stg_events = df_events.copy()
df_stg_events['event_date'] = pd.to_datetime(df_stg_events['event_date'])

# ----------------------
# 4. Записываем staging-таблицы обратно в БД (analytics schema)
# ----------------------
staging_tables = {
    "stg_users": df_stg_users,
    "stg_sessions": df_stg_sessions,
    "stg_subscriptions": df_stg_subs,
    "stg_invoices": df_stg_invoices,
    "stg_events": df_stg_events
}

for table_name, df in staging_tables.items():
    df.to_sql(table_name, engine, schema=SCHEMA, if_exists='replace', index=False)
    print(f"[INFO] Таблица {SCHEMA}.{table_name} сохранена.")