## Импорты, подключения и сбор датасетов

In [None]:
import json
import pandas as pd
from sqlalchemy import create_engine
import pathlib
import os

# Project root. An ETL_ROOT that is already set in the environment wins; the
# developer's local checkout is only a fallback. (Assigning unconditionally
# would make the environment variable impossible to configure from outside.)
os.environ.setdefault('ETL_ROOT', r'C:\Users\Roman\Desktop\ETL-PIPELINE')
BASE_DIR = pathlib.Path(os.environ['ETL_ROOT'])
RAW_DIR = BASE_DIR / 'raw_data'
PROC_DIR = BASE_DIR / 'processed'

# Input files.
plugin_path = RAW_DIR / 'tim_export_plugin.csv'
gitlab_path = RAW_DIR / 'gitlab_export_lines.json'
mapping_path = BASE_DIR / 'config' / 'gitlab-plugins_mapping.csv'

# Path inside the processed directory (not referenced by the visible cells).
save_path = PROC_DIR / 'gitlab_transformed.csv'

# Database connections.
# NOTE(security): the fallback URL embeds credentials in source control.
# Set ETL_PG_SERVER_URL (scheme://user:password@host:port, no database part)
# to override it, and consider removing the hard-coded default entirely.
_PG_SERVER_URL = os.environ.get(
    'ETL_PG_SERVER_URL',
    "postgresql+psycopg2://postgres:Q!w2e3r4@192.168.42.188:5430",
)
engine_postgres = create_engine(f"{_PG_SERVER_URL}/postgres")
engine_pluginsdb = create_engine(f"{_PG_SERVER_URL}/pluginsdb")

## Создание датафреймов по источникам данных

In [None]:
# Plugin export: plain CSV read with pandas defaults.
df_plugin = pd.read_csv(plugin_path)

# GitLab export: a JSON document flattened into a table.
with gitlab_path.open(encoding='utf-8') as f:
    gitlab = json.load(f)
df_gitlab = pd.json_normalize(gitlab)

# GitLab <-> plugin mapping. Header names are trimmed first and then cleared
# of byte-order-mark characters so that column lookups by name work.
df_mapping = pd.read_csv(mapping_path, sep=',', encoding='utf-8')
df_mapping.columns = (
    df_mapping.columns
    .str.strip()
    .str.replace('\ufeff', '', regex=False)
)

## Проверка маппинга

In [None]:
# Namespace prefix shared by the filter and the name extraction below.
_REVIT_PREFIX = "plugins/revit/"

# 1. Keep only GitLab projects whose name starts with the Revit prefix.
#    na=False makes rows with a missing name count as "no match"; without it
#    the mask would contain NaN and boolean indexing would raise.
df_revit = df_gitlab[
    df_gitlab["name"].str.startswith(_REVIT_PREFIX, na=False)
].copy()

# 2. Plugin name = project name without the prefix. Every remaining row is
#    known to start with the prefix, so slicing it off is equivalent to an
#    anchored regex replace.
df_revit["plugin_name"] = df_revit["name"].str[len(_REVIT_PREFIX):]

# 3. Exact, case-sensitive comparison against the names already in the mapping.
known_plugins = df_mapping["gitlab_name"].tolist()
df_new = df_revit[~df_revit["plugin_name"].isin(known_plugins)].copy()

# 4. Prepare rows for insertion: gitlab_id becomes a string such as "123.0".
df_new_to_add = df_new[["plugin_name", "id"]].rename(columns={
    "plugin_name": "gitlab_name",
    "id": "gitlab_id",
})
df_new_to_add["gitlab_id"] = df_new_to_add["gitlab_id"].apply(
    lambda x: f"{int(x)}.0"
)

In [None]:
import gspread
from google.oauth2.service_account import Credentials


# Service-account key file and the API scope requested for it.
SERVICE_ACCOUNT_FILE = BASE_DIR / 'config' / 'revitmaterials-4c3f80dae9f5.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# Authorise and open the 'gitlab-plugins' worksheet of the target spreadsheet.
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
ws = (
    gspread.authorize(creds)
    .open_by_key('19ZDWnS0Ft8bLVCbVyHsOatTTzidv55r5Rj7Woi9mNck')
    .worksheet('gitlab-plugins')
)

# Number of rows currently in the sheet (not used by the visible cells;
# kept because it is a module-level name).
current_rows = len(ws.get_all_values())
rows_to_append = df_new_to_add.astype(str).values.tolist()

# Send all new rows in one request instead of one request per row: this uses
# far less write quota and avoids a half-appended sheet if a later call fails.
# Nothing is sent when there are no new plugins.
if rows_to_append:
    ws.append_rows(rows_to_append, value_input_option='USER_ENTERED')


In [None]:
# Attach tim_guid to every GitLab row through the mapping table. The mapping's
# own key column is only needed for the join and is dropped afterwards.
df_gitlab = df_gitlab.merge(
    df_mapping[["gitlab_id", "tim_guid"]],
    how="left",
    left_on="id",
    right_on="gitlab_id",
).drop(columns=["gitlab_id"])

# Attach display_name and developer from the plugin export in ONE join.
# Joining df_plugin twice on the same key did redundant work and, if a plugin
# id were ever duplicated, would multiply the matching rows quadratically.
# Both frames have an "id" column, so pandas suffixes them: id_x is the GitLab
# id (kept), id_y is the plugin id (dropped).
df_gitlab = df_gitlab.merge(
    df_plugin[["id", "display_name", "developer"]],
    how="left",
    left_on="tim_guid",
    right_on="id",
).drop(columns=["id_y"])

# Final column names; id_x goes straight to its final name gitlab_id.
df_gitlab = df_gitlab.rename(columns={
    "id_x": "gitlab_id",
    "name": "gitlab_name",
    "chosen_branch": "gitlab_branch",
})

## Пишем в БД

In [None]:
from sqlalchemy import text
import pandas as pd

# Target table, named once so the calls and the log message cannot drift apart.
_TARGET_SCHEMA = "datalake"
_TARGET_TABLE = "ext_scripts_gitlab"

# Connectivity check: print the database this engine is connected to.
with engine_postgres.begin() as conn:
    db_name = pd.read_sql(text("SELECT current_database()"), conn)
    print("🔎 Подключен к базе:", db_name.iloc[0, 0])

# Recreate the table and load the data in a single transaction, so a failed
# load rolls back the recreation instead of leaving an empty table behind.
with engine_postgres.begin() as conn:
    # Step 1: recreate the table structure from the (empty) DataFrame header.
    df_gitlab.head(0).to_sql(
        _TARGET_TABLE,
        conn,
        schema=_TARGET_SCHEMA,
        if_exists="replace",  # drops and recreates the table with the DataFrame's columns
        index=False,
    )
    # The message previously named an unrelated table (datalake.test_lake).
    print(f"🛠 Структура таблицы {_TARGET_SCHEMA}.{_TARGET_TABLE} пересоздана из DataFrame.")

    # Step 2: load the rows into the freshly created table.
    df_gitlab.to_sql(
        _TARGET_TABLE,
        conn,
        schema=_TARGET_SCHEMA,
        if_exists="append",
        index=False,
    )
print(f"✅ Загружено строк: {len(df_gitlab)}")