# Scoring

## Импорт библиотек

In [1]:
import pandas as pd, pyarrow
import numpy as np
import matplotlib.pyplot as plt
import os, sys
from pathlib import Path
import datetime

import re
# --- Path and sys.path setup ---
# The parent of the current working directory is treated as the project root and is
# appended to sys.path (only once) so that the custom `src.*` modules can be imported.
PROJECT_ROOT = Path.cwd().parent
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.append(_project_root_str)

from src.config import config
from src.logger import logger
from src.database import clickhouse_engine, postgres_engine, ipdr_engine    


Configuration loaded successfully.


2025-10-27 16:15:10,697 | my_logger - INFO - ✅ PostgreSQL engine создан | /data/aturov/scoring/src/database.py:21
2025-10-27 16:15:10,739 | my_logger - INFO - ✅ ClickHouse engine создан | /data/aturov/scoring/src/database.py:36
2025-10-27 16:15:10,740 | my_logger - INFO - ✅ IPDR ClickHouse engine создан | /data/aturov/scoring/src/database.py:46


## Данные

### Данные о кредитах

In [2]:
# Read the processed credit dataset from parquet and log its dimensions.
credit_data_path = f"{config.environment.data_processed_path}/credit_data_2025-10-17.parquet"
df = pd.read_parquet(credit_data_path)
logger.info(f"Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
df.head()

2025-10-27 16:15:11,028 | my_logger - INFO - Data loaded: 87696 rows, 29 columns | /tmp/ipykernel_4130472/3231231003.py:2


Unnamed: 0,trust_phone,phone_1,phone_2,phone_3,inn,passport,who_give,date_give,sex,date_birth,...,name_object_credit,contract_date_open,summa,interest_on_credit,contract_length,prev_credit_count,sum_of_prev_credits,overdue_max,total_overdue,status
0,,0772529356 0557,,,10101194902445,AN4279692,МКК50-19,2015-09-25,Женщина,1949-01-01,...,Торгово-закупочная деятельность,2023-03-29,1000000.0,0.0,36.0,0.0,0.0,0.0,0.0,Одобрено
1,996700200000.0,0700205385,,,10101195302746,AN0798149,ИИМ50-10,2008-04-10,Женщина,1953-01-01,...,Растениеводство,2022-08-25,85000.0,6.0,17.0,0.0,0.0,3.0,3.0,Одобрено
2,996700200000.0,0700205385,,,10101195302746,AN0798149,ИИМ50-10,2008-04-10,Женщина,1953-01-01,...,Животноводство,2022-08-25,85000.0,6.0,17.0,0.0,0.0,14.0,14.0,Одобрено
3,,0990288871,,,10101195303139,AN4751110,МКК50-37,2016-12-05,Женщина,1953-01-01,...,Животноводство,2014-12-22,500000.0,25.0,36.0,0.0,0.0,2438.0,2438.0,Одобрено
4,,0704608121,,,10101195400720,AN2737017,МКК50-14,2012-02-27,Женщина,1954-01-01,...,Животноводство,2023-04-12,300000.0,6.0,36.0,0.0,0.0,0.0,0.0,Одобрено


In [3]:
# Earliest birth date among rejected and latest among approved applications.
# date_birth is still text at this point, so min()/max() compare strings.
(
    df.loc[df['status'] == "Отказано", 'date_birth'].min(),
    df.loc[df['status'] == "Одобрено", 'date_birth'].max(),
)

('1953-03-20 00:00:00', '2006-10-26')

In [4]:
# Mean interest rate for rejected vs. approved applications.
(
    df.loc[df['status'] == "Отказано", 'interest_on_credit'].mean(),
    df.loc[df['status'] == "Одобрено", 'interest_on_credit'].mean(),
)

(np.float64(19.901098013886646), np.float64(16.086343048757232))

Файл содержит данные о кредитах Элдик Банка по состоянию на 03.10.2025 с суммой кредита не более 1 млн сом.

Расшифровка полей:

Данные о заемщике:
- trust_phone - наиболее "свежий" номер телефона из "mib"
- phone_1 - телефон заемщика из данных, указанных в кредитном договоре
- phone_2 - дополнительный телефон, возможно устаревший
- phone_3 - дополнительный телефон 2, возможно устаревший
- inn  - ПИН заемщика
- passport - номер паспорта заемщика
- who_give - подразделение выдавшее паспорт
- date_give - дата выдачи паспорта
- sex - пол заемщика (1 - жен, 2 - муж; в текущей выгрузке хранится текстом, например «Женщина»)
- date_birth - дата рождения
- birthplace - место рождения
- marital_status - семейный статус (1 - Женат/замужем, 2 - Холост/не замужем, 3 - Разведен/разведена, 4 - Вдовец/вдова; в текущей выгрузке хранится текстом, например «Женат(Замужем)»)

Данные о месте проживания заемщика:

- name_region - область
- city - город
- street - улица и дом

Данные о кредите:

 - id_credit - внутренний идентификационный номер кредита
 - id_filials - номер филиала
 - id_branch_bank - код сберкассы
 - name_code_credit - наименование шифра кредита
 - name_object_credit - наименование объекта кредитования
 - contract_date_open - дата открытия кредитного договора
 - summa - сумма кредита
 - interest_on_credit - процент по кредиту
 - contract_length - срок договора
 - prev_credit_count - количество кредитов в истории на момент взятия данного кредита (если ранее кредитов в нашем банке не было - могут стоять значения 0 или Nan)
 - sum_of_prev_credits - сумма кредитов  в истории на момент взятия текущего кредита (если ранее кредитов в нашем банке не было - сумма будет 0 или Nan)
 - overdue_max - разовая максимальная просрочка
 - total_overdue - суммарная просрочка по кредиту

In [5]:
# List the column labels of the loaded credit frame.
df.columns

Index(['trust_phone', 'phone_1', 'phone_2', 'phone_3', 'inn', 'passport',
       'who_give', 'date_give', 'sex', 'date_birth', 'birthplace',
       'marital_status', 'name_region', 'city', 'street', 'id_credit',
       'id_filials', 'id_branch_bank', 'name_code_credit',
       'name_object_credit', 'contract_date_open', 'summa',
       'interest_on_credit', 'contract_length', 'prev_credit_count',
       'sum_of_prev_credits', 'overdue_max', 'total_overdue', 'status'],
      dtype='object')

In [6]:
# Step 1: drop repeated credit records (same borrower, passport, open date and credit id).
df = df.drop_duplicates(subset=['inn', 'passport', 'contract_date_open', 'id_credit'])
logger.info(f"len data {df.shape[0]}")

# Step 2: parse the contract open date; unparseable values become NaT.
df['contract_date_open'] = pd.to_datetime(df['contract_date_open'], errors='coerce')
logger.info(f"len data {df.shape[0]}")

# Step 3: discard rows whose open date is missing.
df = df.dropna(subset=['contract_date_open'])
logger.info(f"len data {df.shape[0]}")

# Step 4: keep only contracts opened on or after 2020-01-01.
df = df[df['contract_date_open'] >= '2020-01-01']
logger.info(f"len data {df.shape[0]}")


2025-10-27 16:15:11,289 | my_logger - INFO - len data 87696 | /tmp/ipykernel_4130472/1837554212.py:2
2025-10-27 16:15:11,306 | my_logger - INFO - len data 87696 | /tmp/ipykernel_4130472/1837554212.py:4
2025-10-27 16:15:11,353 | my_logger - INFO - len data 87696 | /tmp/ipykernel_4130472/1837554212.py:6
2025-10-27 16:15:11,479 | my_logger - INFO - len data 73957 | /tmp/ipykernel_4130472/1837554212.py:8


In [7]:
# Re-apply the datetime conversion to contract_date_open, then show the frame's shape.
df = df.assign(
    contract_date_open=pd.to_datetime(df['contract_date_open'], errors='coerce')
)
df.shape

(73957, 29)

In [8]:
# Index the frame by borrower id, contract open date, passport and credit id.
df = df.set_index(['inn', 'contract_date_open', 'passport', 'id_credit'])

In [9]:
# Independent copy holding only the four raw phone columns.
data = df[['trust_phone', 'phone_1', 'phone_2', 'phone_3']].copy()

In [10]:
# Summarise dtypes and non-null counts of the phone columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 73957 entries, (np.int64(10101194902445), Timestamp('2023-03-29 00:00:00'), 'AN4279692', '1800000002501') to (np.int64(11403197301092), Timestamp('2025-10-14 00:00:00'), 'ID 880687', '68edbebfd9f085d50dc268b3')
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   trust_phone  14487 non-null  float64
 1   phone_1      73957 non-null  string 
 2   phone_2      688 non-null    string 
 3   phone_3      1019 non-null   string 
dtypes: float64(1), string(3)
memory usage: 10.7+ MB


In [11]:
# Earliest birth date among rejected and latest among approved applications
# (date_birth is still text here, so the comparison is on strings).
(
    df.loc[df['status'] == "Отказано", 'date_birth'].min(),
    df.loc[df['status'] == "Одобрено", 'date_birth'].max(),
)

('1953-03-20 00:00:00', '2006-10-26')

In [12]:
def normalize_phone(val):
    """
    Normalize a raw phone value to the ``996XXXXXXXXX`` form.

    All non-digit characters are removed and the remaining digit string is
    searched, in order of preference, for:

    1. an explicit ``996`` followed by 9 digits;
    2. a ``0`` followed by 9 digits (the leading ``0`` is replaced by ``996``);
    3. the first run of 9 digits (prefixed with ``996``).

    Integral float inputs (e.g. ``996700205385.0``) are converted to ``int``
    before being stringified, so the ``.0`` suffix never contributes a
    spurious trailing digit to the number.

    Parameters
    ----------
    val : Any
        Raw value: a string, a number, or a missing value.

    Returns
    -------
    tuple
        ``(number, status)`` where ``number`` is the normalized string or
        ``pd.NA`` and ``status`` is one of ``'Ok'``, ``'NaN'`` (missing input),
        ``'Not Found'`` (no digits at all) or ``'Not Parsed'`` (fewer than
        9 digits available).
    """
    if pd.isna(val):
        return pd.NA, 'NaN'

    # A float such as 70020538.0 would otherwise stringify to "70020538.0" and
    # its fractional zero would be counted as a ninth phone digit.
    if isinstance(val, float) and val.is_integer():
        val = int(val)

    digits = re.sub(r'\D+', '', str(val))
    if not digits:
        return pd.NA, 'Not Found'

    # 1) explicit 996XXXXXXXXX
    m = re.search(r'996\d{9}', digits)
    if m:
        return m.group(0), 'Ok'

    # 2) local form 0XXXXXXXXX (0 + 9 digits): swap the leading 0 for 996
    m = re.search(r'0\d{9}', digits)
    if m:
        return '996' + m.group(0)[1:], 'Ok'

    # 3) first run of 9 digits; this matches whenever at least 9 digits exist,
    #    so no further fallback is needed
    m = re.search(r'\d{9}', digits)
    if m:
        return '996' + m.group(0), 'Ok'

    return pd.NA, 'Not Parsed'

In [13]:
# Apply the normalization to every phone column that is present.
# The (number, status) tuples are collected once per column and split into two
# object columns; building a pd.Series for every single row is far slower and
# breaks on an empty frame.
phone_cols = ['trust_phone', 'phone_1', 'phone_2', 'phone_3']
for col in phone_cols:
    if col in data.columns:
        parsed = [normalize_phone(raw_value) for raw_value in data[col]]
        data[f'{col}_norm'] = pd.Series(
            [number for number, _status in parsed], index=data.index, dtype='object'
        )
        data[f'{col}_status'] = pd.Series(
            [status for _number, status in parsed], index=data.index, dtype='object'
        )
# Preview the results
data[['trust_phone','trust_phone_norm','phone_1','phone_1_norm','phone_2','phone_2_norm','phone_3','phone_3_norm']].head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,trust_phone,trust_phone_norm,phone_1,phone_1_norm,phone_2,phone_2_norm,phone_3,phone_3_norm
inn,contract_date_open,passport,id_credit,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10101194902445,2023-03-29,AN4279692,1800000002501,,,0772529356 0557,996772529356,,,,
10101195302746,2022-08-25,AN0798149,2000000003107,996700200000.0,996700205385.0,0700205385,996700205385,,,,
10101195302746,2022-08-25,AN0798149,2000000003108,996700200000.0,996700205385.0,0700205385,996700205385,,,,
10101195400720,2023-04-12,AN2737017,1600000002980,,,0704608121,996704608121,,,,
10101195400720,2023-04-12,AN2737017,1600000002981,,,0704608121,996704608121,,,,
10101195400894,2022-05-24,AN3332429,2800000002496,,,0705162722,996705162722,,,,
10101195400894,2024-07-04,AN3332429,2800000003459,,,0705162722,996705162722,,,,
10101195503297,2023-06-15,AN3047147,5700000000974,,,0773060403 0702,996773060403,,,,
10101195600788,2023-03-24,ID0584072,1500000001874,,,0700 545 074 07,996700545074,,,,
10101195700624,2023-11-24,AN3919197,5300000002062,,,0500300589,996500300589,,,,


In [14]:
# Distinct raw phone_2 values whose normalization ended with the "Not Parsed" status.
not_parsed_mask = data['phone_2_status'] == "Not Parsed"
phone_not_parsed = data.loc[not_parsed_mask, 'phone_2'].unique()
logger.info(f"Phones not parsed: {len(phone_not_parsed)}")
list(phone_not_parsed)

2025-10-27 16:15:35,191 | my_logger - INFO - Phones not parsed: 137 | /tmp/ipykernel_4130472/913528703.py:2


['579553',
 '439426',
 '257573',
 '57-03-47',
 '65-01-68',
 '610404',
 '431973',
 '423919',
 '579610',
 '556175',
 '50213',
 '33 13 66 ',
 '511959',
 '28-50-43',
 '486037',
 '384717',
 '62517',
 '3-10-11',
 '630653',
 '426481',
 '590749',
 '512453',
 '635484',
 '54083',
 '469492',
 '40-74-89',
 '671500',
 '53-90-32',
 'Р.545200',
 '682928',
 '659159',
 '412430',
 '623030',
 '657285',
 '427614',
 '666724',
 '644389',
 '428020',
 '437834',
 '61-50-79',
 '933375',
 '336162',
 '472267',
 '248424',
 '44-70-70-',
 '54-36-14',
 '5-50-67',
 '93-97-32',
 '43-51-19',
 '451378',
 '45-92-21',
 '402465',
 '2-42-03',
 '642928',
 '437246',
 '51-06-51',
 '258513',
 '212646',
 '49-66-88',
 '300228',
 '64-58-94',
 '535310',
 '53-11-74',
 '42-82-83',
 '555748',
 '500668',
 '670025',
 '620336',
 '491960',
 '65-31-45',
 '64-84-13',
 '444338',
 '524828',
 '547093',
 '29-08-47',
 '570727',
 '501033',
 '539164',
 '54-09-49',
 '312580',
 '54-37-01',
 '664164',
 '561871',
 '624559',
 '654285',
 '314420',
 '5320

In [15]:
# Column labels of the phone frame, including the *_norm / *_status columns.
data.columns

Index(['trust_phone', 'phone_1', 'phone_2', 'phone_3', 'trust_phone_norm',
       'trust_phone_status', 'phone_1_norm', 'phone_1_status', 'phone_2_norm',
       'phone_2_status', 'phone_3_norm', 'phone_3_status'],
      dtype='object')

In [16]:
# (rows, columns) of the phone frame.
data.shape

(73957, 12)

In [17]:
def select_phones(row, phone_cols=('trust_phone', 'phone_1', 'phone_2', 'phone_3')):
    """
    Collect the distinct, successfully normalized phone numbers of one row.

    For every name in ``phone_cols`` (in the given priority order) the
    ``<name>_status`` and ``<name>_norm`` entries of ``row`` are read; the number
    is kept when the status is ``'Ok'``, the number is not missing, and it has
    not been collected already.

    The previous version looked up ``phone_status`` / ``phone_norm`` (columns
    that are never created) and never looked at ``phone_3_*``, so numbers
    stored in ``phone_3`` were silently lost.

    Parameters
    ----------
    row : Mapping-like
        Anything with a ``.get(key)`` method (a ``pd.Series`` row or a dict).
    phone_cols : sequence of str, optional
        Base column names to inspect, highest priority first.

    Returns
    -------
    list or pd.NA
        Unique phone numbers in priority order, or ``pd.NA`` when none qualify.
    """
    phones = []
    for col in phone_cols:
        status = row.get(f'{col}_status')
        number = row.get(f'{col}_norm')
        # isinstance guard: a missing status (None / pd.NA) must not be compared
        # in a boolean context.
        if not (isinstance(status, str) and status == 'Ok'):
            continue
        if pd.notna(number) and number not in phones:
            phones.append(number)

    return phones if phones else pd.NA

# Row-wise: gather each record's usable phone numbers into a new array_phones column.
data['array_phones'] = data.apply(select_phones, axis='columns')


In [18]:
# Preview the first 50 collected phone lists.
data['array_phones'].head(50)

inn             contract_date_open  passport   id_credit    
10101194902445  2023-03-29          AN4279692  1800000002501                  [996772529356]
10101195302746  2022-08-25          AN0798149  2000000003107                  [996700205385]
                                               2000000003108                  [996700205385]
10101195400720  2023-04-12          AN2737017  1600000002980                  [996704608121]
                                               1600000002981                  [996704608121]
10101195400894  2022-05-24          AN3332429  2800000002496                  [996705162722]
                2024-07-04          AN3332429  2800000003459                  [996705162722]
10101195503297  2023-06-15          AN3047147  5700000000974                  [996773060403]
10101195600788  2023-03-24          ID0584072  1500000001874                  [996700545074]
10101195700624  2023-11-24          AN3919197  5300000002062                  [996500300589]
101011957

In [19]:
# Left-join the credit columns onto the phone lists using the shared index.
df_banking = data.loc[:, ['array_phones']].join(df, how='left')

In [20]:
# Turn the index levels back into ordinary columns.
df_banking = df_banking.reset_index()

In [21]:
# (rows, columns) of the joined frame.
df_banking.shape

(73957, 30)

In [22]:
# Fill gaps with 0 and cast numeric columns.
# The filled Series is assigned back to the column: calling
# `df[col].fillna(..., inplace=True)` operates on an intermediate object, and
# pandas warns that this form stops updating the frame in pandas 3.0.
df_banking['inn'] = df_banking['inn'].astype('int64')
for overdue_col in ('total_overdue', 'overdue_max'):
    df_banking[overdue_col] = df_banking[overdue_col].fillna(0).astype('int64')
for history_col in ('sum_of_prev_credits', 'prev_credit_count'):
    df_banking[history_col] = df_banking[history_col].fillna(0)
df_banking['summa'] = df_banking['summa'].astype('int64')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_banking['total_overdue'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_banking['overdue_max'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [23]:
# Convert the array_phones column to text for the ClickHouse Array(String) field:
# lists/tuples become JSON strings, missing values become "", anything else str().
import json


def _phones_to_text(value):
    """Return the textual form of one array_phones cell."""
    if isinstance(value, (list, tuple)):
        return json.dumps(value, ensure_ascii=False)
    if pd.isna(value):
        return ""
    return str(value)


df_banking['array_phones'] = df_banking['array_phones'].apply(_phones_to_text).astype('string')
df_banking.array_phones.info()

<class 'pandas.core.series.Series'>
RangeIndex: 73957 entries, 0 to 73956
Series name: array_phones
Non-Null Count  Dtype 
--------------  ----- 
73957 non-null  string
dtypes: string(1)
memory usage: 577.9 KB


In [24]:
# Passport issue date: parse to datetime, unparseable values become NaT.
df_banking['date_give'] = pd.to_datetime(df_banking['date_give'], errors='coerce')

# Cast the contract length and previous-credit columns to int64.
for int_col in ('contract_length', 'prev_credit_count', 'sum_of_prev_credits'):
    df_banking[int_col] = df_banking[int_col].astype('int64')

# Option: fill the gaps in id_filials / id_branch_bank with 0 (if acceptable) before casting.
for id_col in ('id_filials', 'id_branch_bank'):
    df_banking[id_col] = df_banking[id_col].fillna(0).astype('int64')

df_banking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73957 entries, 0 to 73956
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   inn                  73957 non-null  int64         
 1   contract_date_open   73957 non-null  datetime64[ns]
 2   passport             73957 non-null  string        
 3   id_credit            73957 non-null  string        
 4   array_phones         73957 non-null  string        
 5   trust_phone          14487 non-null  float64       
 6   phone_1              73957 non-null  string        
 7   phone_2              688 non-null    string        
 8   phone_3              1019 non-null   string        
 9   who_give             73957 non-null  string        
 10  date_give            49185 non-null  datetime64[ns]
 11  sex                  73957 non-null  string        
 12  date_birth           73957 non-null  string        
 13  birthplace           1674 non-n

In [25]:
# Distinct marital_status labels present in the data.
df_banking.marital_status.unique()

<StringArray>
['Вдовец/Вдова', 'Женат(Замужем)', 'Разведен(а)', 'Холост(а)']
Length: 4, dtype: string

In [26]:
# Earliest birth date among rejected and latest among approved applications.
# Both sides now read df_banking; previously the second value came from `df`,
# mixing two frames in one comparison.
(
    df_banking.query('status == "Отказано"')['date_birth'].min(),
    df_banking.query('status == "Одобрено"')['date_birth'].max(),
)

('1953-03-20 00:00:00', '2006-10-26')

In [27]:
# Parse the mixed-format birth-date strings and keep only the calendar date.
birth_dates = pd.to_datetime(df_banking['date_birth'], format='mixed', errors='coerce')
df_banking['date_birth'] = birth_dates.dt.date
df_banking.loc[df_banking['status'] == "Отказано", 'date_birth'].unique()

array([datetime.date(1959, 5, 1), datetime.date(1987, 6, 20),
       datetime.date(2001, 10, 20), ..., datetime.date(1980, 10, 5),
       datetime.date(2002, 11, 18), datetime.date(1973, 3, 14)],
      shape=(8838,), dtype=object)

In [28]:
from sqlalchemy.orm import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float, Date, Boolean, BigInteger
from clickhouse_sqlalchemy.types import Array
from clickhouse_sqlalchemy import engines

# Declarative base on which the ORM model below is registered.
Base = declarative_base()

class CreditsEldik(Base):
    """Table with credit data from Eldik Bank.

    Mapped to ``data_science.credits_eldik`` with a MergeTree engine ordered by
    ``(inn, contract_date_open, id_credit)``. The ``comment=`` strings are
    passed to the column definitions and are therefore left in their original
    language.
    """
    __tablename__ = "credits_eldik"
    __table_args__ = (
        engines.MergeTree(order_by=("inn", "contract_date_open", "id_credit")),
        {"schema": "data_science"},
    )
    # primary keys: inn, contract_date_open, id_credit
    inn = Column(BigInteger, primary_key=True, comment="ИНН клиента")
    contract_date_open = Column(Date, primary_key=True, comment="Дата открытия договора")
    id_credit = Column(String, primary_key=True, comment="ID кредита")
    # phone numbers
    array_phones = Column(Array(String), comment="Массив телефонов")
    # NOTE(review): trust_phone is float64 in the source frame while this column
    # is String — confirm the value is converted as intended on insert.
    trust_phone = Column(String, default=None, nullable=True, comment="Телефон доверенного лица")
    phone_1 = Column(String, default=None, nullable=True, comment="Телефон 1")
    phone_2 = Column(String, default=None, nullable=True, comment="Телефон 2")
    phone_3 = Column(String, default=None, nullable=True, comment="Телефон 3")
    # passport data
    passport = Column(String, default=None, nullable=True, comment="Паспорт")
    who_give = Column(String, default=None, nullable=True, comment="Кто выдал паспорт")
    date_give = Column(Date, default=None, nullable=True, comment="Дата выдачи паспорта")
    sex = Column(String, default=None, nullable=True, comment="пол заемщика")
    date_birth = Column(Date, default=None, nullable=True, comment="Дата рождения клиента")
    birthplace = Column(String, default=None, nullable=True, comment="Место рождения")
    # NOTE(review): the column comment lists numeric codes 1-4, but the data
    # holds text labels (e.g. 'Женат(Замужем)') — confirm which is intended.
    marital_status = Column(String, default=None, nullable=True, comment="семейный статус (1 - Женат/замужем, 2 - Холост/не замужем, 3- Разведен/разведена, 4 - Вдовец/вдова)")
    #name_region = Column(String, default=None, nullable=True, comment="Регион")
    #city = Column(String, default=None, nullable=True, comment="Город")
    #street = Column(String, default=None, nullable=True, comment="Улица")
    #id_filials = Column(Integer, default=0, nullable=False, comment="ID филиала")
    #id_branch_bank = Column(Integer, default=0, nullable=False, comment="ID банка")
    # credit details
    name_code_credit = Column(String, default=None, nullable=True, comment="Код кредита")
    name_object_credit = Column(String, default=None, nullable=True, comment="Наименование объекта кредита")
    summa = Column(BigInteger, default=0, nullable=False, comment="Сумма кредита")
    interest_on_credit = Column(Float, default=None, nullable=True, comment="Процентная ставка")
    contract_length = Column(Integer, default=None, nullable=True, comment="Срок кредита")
    prev_credit_count = Column(Integer, default=0, nullable=False, comment="Количество предыдущих кредитов")
    sum_of_prev_credits = Column(Integer, default=0, nullable=False, comment="Сумма предыдущих кредитов")
    overdue_max = Column(Integer, default=0, nullable=False, comment="Максимальная просрочка")
    total_overdue = Column(Integer, default=0, nullable=False, comment="Общая просрочка")
    status = Column(String, default=None, nullable=True, comment="Статус кредита")
# Create the tables registered on Base through the ClickHouse engine.
Base.metadata.create_all(clickhouse_engine)



In [29]:
from sqlalchemy import text

# Add a server-generated UUID column to the table.
# IF NOT EXISTS keeps this cell re-runnable: without it a second execution
# fails because the column is already present.
sql_after_table = """
ALTER TABLE data_science.credits_eldik
ADD COLUMN IF NOT EXISTS id UUID DEFAULT generateUUIDv4();
"""
with clickhouse_engine.connect() as conn:
    # SQLAlchemy 2.0 requires an Executable (text()) or use exec_driver_sql().
    conn.execute(text(sql_after_table))
    logger.info("Column 'id' added to 'data_science.credits_eldik' table")

2025-10-27 16:15:37,988 | my_logger - INFO - Column 'id' added to 'data_science.credits_eldik' table | /tmp/ipykernel_4130472/1089640877.py:10


In [30]:
import json
from math import isnan
from sqlalchemy import inspect

def parse_array_field(v):
    """
    Convert a stored array value into a Python list, or ``None``.

    Accepted inputs:

    * ``list`` / ``tuple`` - returned as a list;
    * ``str`` - stripped and decoded as JSON; the decoded value is then
      subject to the same rule as a list/tuple input.

    ``None`` is returned for ``None``, blank strings, invalid JSON, empty
    sequences (including the JSON text ``"[]"``) and any decoded value that is
    not a list. Previously ``"[]"`` produced ``[]`` while ``[]`` produced
    ``None``, and scalar JSON such as ``"123"`` leaked through as a non-array
    value.

    Parameters
    ----------
    v : Any
        Raw cell value.

    Returns
    -------
    list or None
    """
    if isinstance(v, str):
        text_value = v.strip()
        if not text_value:
            return None
        try:
            v = json.loads(text_value)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None

    if isinstance(v, (list, tuple)):
        return list(v) if v else None
    return None

def df_to_clickhouse_records(df):
    """
    Convert a DataFrame into a list of plain-Python dict records for insertion.

    Per value:

    * the ``array_phones`` field is passed through ``parse_array_field``
      (which itself maps missing values to ``None``);
    * scalar missing values (``None`` / ``NaN`` / ``NaT`` / ``pd.NA``) become ``None``;
    * ``pd.Timestamp`` becomes ``datetime.datetime``;
    * everything else (including ``datetime`` / ``date`` objects) is kept as is.

    The missing-value test is applied to scalars only: ``pd.isna`` on a list
    returns an array, and using that array in an ``if`` raises
    "truth value ... is ambiguous" for lists with more than one element.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of dict
        One dict per row, keyed by column name.
    """
    records = []
    for row in df.to_dict(orient='records'):
        rec = {}
        for k, v in row.items():
            if k == "array_phones":
                # json/array field: handled first so list values never reach pd.isna
                rec[k] = parse_array_field(v)
            elif pd.api.types.is_scalar(v) and pd.isna(v):
                rec[k] = None
            elif isinstance(v, pd.Timestamp):
                rec[k] = v.to_pydatetime()
            else:
                rec[k] = v
        records.append(rec)
    return records

In [31]:
# Batched insert through Table.insert() (recommended)
table = CreditsEldik.__table__
CHUNK = 2000

records = df_to_clickhouse_records(df_banking)

with clickhouse_engine.connect() as conn:
    insert_stmt = table.insert()
    # Send the records in consecutive slices of at most CHUNK rows.
    for start in range(0, len(records), CHUNK):
        conn.execute(insert_stmt, records[start:start + CHUNK])



In [32]:
# Distinct raw (text) birth dates of rejected applications in the original frame.
df.loc[df['status'] == "Отказано", 'date_birth'].unique()

<StringArray>
['1959-05-01 00:00:00', '1987-06-20 00:00:00', '2001-10-20 00:00:00',
 '1960-02-05 00:00:00', '1994-10-14 00:00:00', '1969-05-29 00:00:00',
 '1998-12-10 00:00:00', '1984-07-17 00:00:00', '1995-10-22 00:00:00',
 '1971-03-19 00:00:00',
 ...
 '1958-09-04 00:00:00', '1972-10-20 00:00:00', '1967-07-21 00:00:00',
 '1999-10-07 00:00:00', '1989-06-22 00:00:00', '1968-11-15 00:00:00',
 '1973-04-15 00:00:00', '1980-10-05 00:00:00', '2002-11-18 00:00:00',
 '1973-03-14 00:00:00']
Length: 8838, dtype: string

In [33]:
# Re-inspect dtypes and non-null counts of df_banking.
df_banking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73957 entries, 0 to 73956
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   inn                  73957 non-null  int64         
 1   contract_date_open   73957 non-null  datetime64[ns]
 2   passport             73957 non-null  string        
 3   id_credit            73957 non-null  string        
 4   array_phones         73957 non-null  string        
 5   trust_phone          14487 non-null  float64       
 6   phone_1              73957 non-null  string        
 7   phone_2              688 non-null    string        
 8   phone_3              1019 non-null   string        
 9   who_give             73957 non-null  string        
 10  date_give            49185 non-null  datetime64[ns]
 11  sex                  73957 non-null  string        
 12  date_birth           73957 non-null  object        
 13  birthplace           1674 non-n

In [34]:
# Persist the prepared frame as parquet, without writing the index.
banking_parquet_path = f"{config.environment.data_processed_path}/credit_data_031025.parquet"
df_banking.to_parquet(banking_parquet_path, index=False)

In [35]:
# Distinct parsed birth dates of rejected applications.
df_banking.loc[df_banking['status'] == "Отказано", 'date_birth'].unique()

array([datetime.date(1959, 5, 1), datetime.date(1987, 6, 20),
       datetime.date(2001, 10, 20), ..., datetime.date(1980, 10, 5),
       datetime.date(2002, 11, 18), datetime.date(1973, 3, 14)],
      shape=(8838,), dtype=object)

In [36]:
sql = """ 
select * from dict.dict_cont_status dcs
"""

# Read the dict.dict_cont_status table from ClickHouse and save a CSV snapshot of it.
dict_csv_path = f"{config.environment.data_raw_path}/dict_cont_status_081025.csv"
df_dict = pd.read_sql(sql, clickhouse_engine)
df_dict.to_csv(dict_csv_path, index=False)
df_dict.head()

Unnamed: 0,id,status
0,0,Terminated
1,1,Active
2,4,Suspended
3,3,Idle
4,2,New
