# Анализ данных проекта "Модель кредитного риск-менеджмента"

### *В этом ноутбуке мы реализуем три стадии машинного обучения:*

* Business understanding (данный процесс представлен данными .pdf файла задания);
* Data understanding (изучим предоставленные данные)
* Data preparation (подготовим данные для проведения обучения моделей)

**Цели и задачи ноутбука**:  

* ознакомление с предоставленными датасетами и описаниями представленных атрибутов;
* оценка чистоты и полноты данных;  
* приведение данных в удобный вид для дальнейшей работы;
* проведение базовой чистки (дубликаты, пустые значения, типизация данных, ненужные атрибуты;
* подготовка плана по форматированию данных.  
 ---

## Импорт необходимых библиотек и модулей

In [4]:
import os
import re
import gc
import pandas as pd
import psutil
import time
from tqdm.notebook import tqdm
from glob import glob
from sklearn.preprocessing import OneHotEncoder

In [5]:
pd.set_option('display.max_columns', None) # настройка pandas на показ всех колонок в датасете без огранечений

## Загрузка данных и просмотр данных
Осмотрим один из 12 представленных файлов train_data

In [6]:
df_preview = pd.read_parquet('/home/jupyter/project/train_data/train_data_0.pq')
df_preview.head(20)

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,pre_loans_outstanding,pre_loans_total_overdue,pre_loans_max_overdue_sum,pre_loans_credit_cost_rate,pre_loans5,pre_loans530,pre_loans3060,pre_loans6090,pre_loans90,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,pre_util,pre_over2limit,pre_maxover2limit,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,enc_paym_0,enc_paym_1,enc_paym_2,enc_paym_3,enc_paym_4,enc_paym_5,enc_paym_6,enc_paym_7,enc_paym_8,enc_paym_9,enc_paym_10,enc_paym_11,enc_paym_12,enc_paym_13,enc_paym_14,enc_paym_15,enc_paym_16,enc_paym_17,enc_paym_18,enc_paym_19,enc_paym_20,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,0,1,18,9,2,3,16,10,11,3,3,0,2,11,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,3,3,3,3,3,3,3,3,3,4,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,4,1,0,0
1,0,2,18,9,14,14,12,12,0,3,3,0,2,11,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,4,1,3,4,1,0,0
2,0,3,18,9,4,8,1,11,11,0,5,0,2,8,6,16,5,4,8,1,1,1,1,1,15,2,17,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,4,1,2,3,1,1,1
3,0,4,4,1,9,12,16,7,12,2,3,0,2,4,6,16,5,4,8,0,1,1,1,1,16,2,17,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,1,1,0,0
4,0,5,5,12,15,2,11,12,10,2,3,0,2,4,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,0,0,0,0,0,3,3,3,3,4,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,4,1,0,0
5,0,6,5,0,11,8,12,11,4,2,3,0,2,4,6,16,5,4,8,1,1,1,1,1,9,5,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,4,3,3,3,4,1,2,3,1,0,1
6,0,7,3,9,1,2,12,14,15,5,3,0,2,3,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,0,0,0,0,0,0,3,3,3,4,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,4,1,0,0
7,0,8,2,9,2,3,12,14,15,5,3,0,2,13,6,16,5,4,8,1,1,1,1,1,16,2,17,1,1,1,0,0,3,3,3,3,3,3,3,3,3,4,3,3,3,3,3,3,3,3,4,3,3,3,4,1,3,4,1,0,0
8,0,9,1,9,11,13,14,8,2,5,1,0,2,11,6,16,5,4,8,1,1,1,1,1,1,2,17,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,3,3,3,3,3,4,3,3,3,4,1,2,4,1,0,0
9,0,10,7,9,2,10,8,8,16,4,2,0,2,11,6,16,5,4,8,1,1,1,1,1,15,2,17,0,1,1,0,0,0,0,0,0,3,3,3,3,3,4,3,3,3,3,3,3,3,3,4,3,3,3,4,1,2,4,1,0,0


In [8]:
print(df_preview.shape)
print(df_preview.info())
print(df_preview.describe())
print(f'Пропуски датасета: {df_preview.isna().sum().sort_values(ascending=False)}')
print(f'Дубликаты датасета: {df_preview.duplicated().sum()}')

(1974724, 61)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1974724 entries, 0 to 1974723
Data columns (total 61 columns):
 #   Column                         Dtype
---  ------                         -----
 0   id                             int64
 1   rn                             int64
 2   pre_since_opened               int64
 3   pre_since_confirmed            int64
 4   pre_pterm                      int64
 5   pre_fterm                      int64
 6   pre_till_pclose                int64
 7   pre_till_fclose                int64
 8   pre_loans_credit_limit         int64
 9   pre_loans_next_pay_summ        int64
 10  pre_loans_outstanding          int64
 11  pre_loans_total_overdue        int64
 12  pre_loans_max_overdue_sum      int64
 13  pre_loans_credit_cost_rate     int64
 14  pre_loans5                     int64
 15  pre_loans530                   int64
 16  pre_loans3060                  int64
 17  pre_loans6090                  int64
 18  pre_loans90                 

In [9]:
del df_preview
gc.collect

<function gc.collect(generation=2)>

* Все признаки переведены в числа. Пропусков и дубликатов не обнаружили. Данные достаточно чистые. Удаляем данный фрагмент для экономии памяти.

## Определяем функцию для чтения датасета почанково

In [4]:
def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
    """
    читает num_parts_to_read партиций, преобразовывает их к pd.DataFrame и возвращает
    :param path_to_dataset: путь до директории с партициями
    :param start_from: номер партиции, с которой нужно начать чтение
    :param num_parts_to_read: количество партиций, которые требуется прочитать
    :param columns: список колонок, которые нужно прочитать из партиции
    :return: pd.DataFrame
    """
    parquet_dir = os.path.join(path_to_dataset, 'train_data')

    res = []
    dataset_paths = sorted(
            [os.path.join(parquet_dir, f) for f in os.listdir(parquet_dir) if f.startswith('train')],
            key=lambda x: int(x.split('_')[-1].replace('.pq','')))

    print(dataset_paths)

    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]
    if verbose:
        print('Reading chunks:\n')
        for chunk in chunks:
            print(chunk)
    for chunk_path in tqdm(chunks, desc="Reading dataset with pandas"):
        print('chunk_path', chunk_path)
        chunk = pd.read_parquet(chunk_path,columns=columns)
        res.append(chunk)

    return pd.concat(res).reset_index(drop=True)

## Определяем функцию дата инжиниринга для создания дополнительных фич

In [5]:
def feature_engineer_one_block(df):
    df = df.copy()

    # A. Признаки по кредитам
    # total_past_dues - суммарные просрочки по buckets, почему: комплексный индикатор истории просрочек
    overdue_cols = ['pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90']
    if all(col in df.columns for col in overdue_cols):
        df['total_past_dues'] = df[overdue_cols].sum(axis=1)
    # has_current_overdue - флаг текущей просрочки, почему: критический сигнал дефолта
    if 'pre_loans_total_overdue' in df.columns:
        df['has_current_overdue'] = (df['pre_loans_total_overdue'] > 0).astype(int)

    # B. История платежей
    paym_cols = [col for col in df.columns if col.startswith('enc_paym_')]
    if paym_cols:
        df['paym_bad_count'] = (df[paym_cols] > 0).sum(axis=1)  # Кол-во плохих платежей
        df['paym_last_status'] = df['enc_paym_0']  # Последний статус


    # C. Другие (флаги активных/закрытых кредитов)
    # Почему: различает активные и закрытые кредиты, активные могут иметь более высокий риск
    if 'fclose_flag' in df.columns:
        df['is_active_loan'] = (df['fclose_flag'] == 0).astype(int)


    return df

In [10]:
### Версия с разделением колонок. Неудачная судя по обучению. Метрики меньше
#    def prepare_transactions_dataset(path_to_dataset: str, 
                                 num_parts_to_preprocess_at_once: int = 1,
                                 num_parts_total: int = 2,
                                 save_to_path: str = 'processed_data_1',
                                 log_to_file: bool = True,
                                 verbose: bool = False):
    """
    Итеративно читает parquet-файлы, создаёт новые признаки, кодирует фичи и агрегирует по id.
    """

    def log(msg: str):
        print(msg)
        if log_to_file:
            with open(log_path, 'a', encoding='utf-8') as f:
                f.write(msg + '\n')

    save_dir_train = os.path.join(path_to_dataset, save_to_path, 'train')
    os.makedirs(save_dir_train, exist_ok=True)

    process = psutil.Process(os.getpid())
    start_time = time.time()
    log_path = os.path.join(path_to_dataset, save_to_path, 'process.log')

    numeric_cols = [
        'pre_loans_credit_limit','pre_loans_next_pay_summ','pre_loans_outstanding',
        'pre_loans_total_overdue','pre_loans_max_overdue_sum','pre_loans_credit_cost_rate',
        'pre_util','pre_over2limit','pre_maxover2limit','pre_since_opened','pre_since_confirmed',
        'pre_pterm','pre_fterm','pre_till_pclose','pre_till_fclose'
    ]

    flag_cols = [
        'is_zero_loans5','is_zero_loans530','is_zero_loans3060','is_zero_loans6090','is_zero_loans90',
        'is_zero_util','is_zero_over2limit','is_zero_maxover2limit','pclose_flag','fclose_flag'
    ]

    cat_cols = [
        'enc_loans_account_holder_type',
        'enc_loans_credit_status',
        'enc_loans_account_cur',
        'enc_loans_credit_type'
    ]

    preprocessed_frames = []

    for step in tqdm(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                   desc="Transforming transactions data"):
        step_start = time.time()

        # === 1. Чтение ===
        transactions_frame = read_parquet_dataset_from_local(
            path_to_dataset, step, num_parts_to_preprocess_at_once,
            columns=None,  # читаем все колонки
            verbose=verbose
        )
        log(f'Прочитано строк: {len(transactions_frame)}, уникальных id: {len(transactions_frame["id"].unique())}')

        # === 2. Фичи ===
        transactions_frame = feature_engineer_one_block(transactions_frame)

        # --- Downcast ---
        for c in transactions_frame.columns:
            if pd.api.types.is_integer_dtype(transactions_frame[c]):
                transactions_frame[c] = pd.to_numeric(transactions_frame[c], downcast='unsigned')
            elif pd.api.types.is_float_dtype(transactions_frame[c]):
                transactions_frame[c] = pd.to_numeric(transactions_frame[c], downcast='float')

        # === 3. Кодирование ===
        cat_cols_existing = [c for c in cat_cols if c in transactions_frame.columns]
        ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
        encoded = ohe.fit_transform(transactions_frame[cat_cols_existing])
        encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(cat_cols_existing))
        log(f'Размер encoded после OHE: {encoded_df.shape}')

        # === 4. Объединяем ===
        data_preprocessed = pd.concat([
            transactions_frame['id'].reset_index(drop=True),  # точно одномерная колонка
            transactions_frame[[c for c in transactions_frame.columns 
                                if c not in cat_cols_existing + ['rn', 'id']]].reset_index(drop=True),
            encoded_df.reset_index(drop=True)
        ], axis=1)

        log(f'Размер блока после конкатенации: {data_preprocessed.shape}')

        del transactions_frame, encoded, encoded_df
        gc.collect()

        # === 5. Агрегация ===
        agg_dict = {col: 'sum' for col in data_preprocessed.columns if col != 'id'}
        data_grouped = data_preprocessed.groupby('id', as_index=False).agg(agg_dict)
        log(f'После агрегации: {data_grouped.shape}')

        del data_preprocessed
        gc.collect()

        # === 6. Сохранение ===
        block_str = f"{step:03d}"
        data_grouped.to_parquet(os.path.join(save_dir_train, f'processed_chunk_{block_str}.parquet'))
        log(f'💾 Train chunk сохранён: {save_dir_train}/processed_chunk_{block_str}.parquet')

        preprocessed_frames.append(data_grouped)
        del data_grouped
        gc.collect()

        # === 7. Мониторинг ===
        elapsed = time.time() - step_start
        mem_mb = process.memory_info().rss / (1024 * 1024)
        log(f'⏱ Время: {elapsed:.2f} сек | 🧠 Память: {mem_mb:.1f} МБ')

    # === 8. Финальный merge ===
    final_df = pd.concat(preprocessed_frames).fillna(0.)
    targets = pd.read_csv('/home/jupyter/project/train_target.csv')
    final_df = final_df.merge(targets, how='inner', on='id')

    final_path = os.path.join(save_dir_train, 'final_train.parquet')
    final_df.to_parquet(final_path)
    log(f'💾 Итоговый train сохранён: {final_path}, размер: {len(final_df)} строк')

    total_time = time.time() - start_time
    mem_final = process.memory_info().rss / (1024 * 1024)
    log(f'\n🎯 Завершено! Время: {total_time/60:.1f} мин | Память: {mem_final:.1f} МБ')



IndentationError: unindent does not match any outer indentation level (<tokenize>, line 26)

## Определяем функцию которая, с помощью функций выше, читает файлы, создает новые фичи, кодирует фичи, производит аггрегацию, мерджит с train_target.csv и сохраняет данные почанково

In [6]:
def prepare_transactions_dataset(path_to_dataset: str, 
                                 num_parts_to_preprocess_at_once: int = 1,
                                 num_parts_total: int = 2,
                                 save_to_path: str = 'processed_data_1',
                                 log_to_file: bool = True,
                                 verbose: bool = False):
    """
    Итеративно читаем parquet-файлы, создаём новые признаки, кодируем фичи (OHE на всех кроме flag_cols),
    агрегируем по id, мержим с train_target.csv и сохраняем чанки.
    """

    def log(msg: str):
        print(msg)
        if log_to_file:
            with open(log_path, 'a', encoding='utf-8') as f:
                f.write(msg + '\n')

    save_dir_train = os.path.join(path_to_dataset, save_to_path, 'train')
    os.makedirs(save_dir_train, exist_ok=True)

    process = psutil.Process(os.getpid())
    start_time = time.time()
    log_path = os.path.join(path_to_dataset, save_to_path, 'process.log')

    flag_cols = [
        'is_zero_loans5','is_zero_loans530','is_zero_loans3060','is_zero_loans6090','is_zero_loans90',
        'is_zero_util','is_zero_over2limit','is_zero_maxover2limit','pclose_flag','fclose_flag'
    ]

    # === Заранее загружаем таргеты ===
    targets = pd.read_csv('/home/jupyter/project/train_target.csv')
    log(f'📂 Загружен train_target.csv: {targets.shape[0]} строк')

    for step in tqdm(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                     desc="Transforming transactions data"):
        step_start = time.time()

        # === 1. Чтение ===
        transactions_frame = read_parquet_dataset_from_local(
            path_to_dataset, step, num_parts_to_preprocess_at_once,
            columns=None, verbose=verbose
        )
        log(f'Прочитано строк: {len(transactions_frame)}, уникальных id: {transactions_frame["id"].nunique()}')

        # === 2. Фичи ===
        transactions_frame = feature_engineer_one_block(transactions_frame)

        # --- Downcast ---
        for c in transactions_frame.columns:
            if pd.api.types.is_integer_dtype(transactions_frame[c]):
                transactions_frame[c] = pd.to_numeric(transactions_frame[c], downcast='unsigned')
            elif pd.api.types.is_float_dtype(transactions_frame[c]):
                transactions_frame[c] = pd.to_numeric(transactions_frame[c], downcast='float')

        # === 3. OHE ===
        ohe_cols = [c for c in transactions_frame.columns if c not in flag_cols + ['id', 'rn']]
        ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
        encoded = ohe.fit_transform(transactions_frame[ohe_cols])
        encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(ohe_cols))
        log(f'Размер encoded после OHE: {encoded_df.shape}')

        # === 4. Объединяем ===
        data_preprocessed = pd.concat([
            transactions_frame['id'].reset_index(drop=True),
            transactions_frame[flag_cols].reset_index(drop=True),
            encoded_df.reset_index(drop=True)
        ], axis=1)
        log(f'Размер блока после конкатенации: {data_preprocessed.shape}')

        del transactions_frame, encoded, encoded_df
        gc.collect()

        # === 5. Агрегация ===
        agg_dict = {col: 'sum' for col in data_preprocessed.columns if col != 'id'}
        data_grouped = data_preprocessed.groupby('id', as_index=False).agg(agg_dict)
        log(f'После агрегации: {data_grouped.shape}')
        del data_preprocessed
        gc.collect()

        # === 6. Merge с таргетом ===
        data_merged = data_grouped.merge(targets, how='inner', on='id')
        log(f'После merge с таргетом: {data_merged.shape}')

        del data_grouped
        gc.collect()

        # === 7. Сохранение чанка ===
        block_str = f"{step:03d}"
        chunk_path = os.path.join(save_dir_train, f'processed_chunk_{block_str}.parquet')
        data_merged.to_parquet(chunk_path)
        log(f'💾 Train chunk сохранён: {chunk_path}')

        del data_merged
        gc.collect()

        # === 8. Мониторинг ===
        elapsed = time.time() - step_start
        mem_mb = process.memory_info().rss / (1024 * 1024)
        log(f'⏱ Время: {elapsed:.2f} сек | 🧠 Память: {mem_mb:.1f} МБ')

    # === 9. Финальный лог ===
    total_time = time.time() - start_time
    mem_final = process.memory_info().rss / (1024 * 1024)
    log(f'\n🎯 Завершено! Время: {total_time/60:.1f} мин | Память: {mem_final:.1f} МБ')


## Запускаем данную функцию, указав путь к проекту. Обрабатывать будем по 1 чанку одновременно все 12 чанков.

In [7]:
DATA_ROOT = '/home/jupyter/project'


df_result = prepare_transactions_dataset(
    path_to_dataset=DATA_ROOT,
    num_parts_to_preprocess_at_once=1,  # сколько чанков одновременно
    num_parts_total=12,                  # обработать 12
    save_to_path='processed_data_1',    # папка для сохранения
    log_to_file=True,                   # вести лог
    verbose=True                        # выводить информацию в консоль
)


📂 Загружен train_target.csv: 3000000 строк


Transforming transactions data:   0%|          | 0/12 [00:00<?, ?it/s]

['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/train_data/train_data_11.pq']
Reading chunks:

/home/jupyter/project/train_data/train_data_0.pq


Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_0.pq
Прочитано строк: 1974724, уникальных id: 250000
Размер encoded после OHE: (1974724, 385)
Размер блока после конкатенации: (1974724, 396)
После агрегации: (250000, 396)
После merge с таргетом: (250000, 397)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_000.parquet
⏱ Время: 89.14 сек | 🧠 Память: 506.4 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_1.pq
Прочитано строк: 2107305, уникальных id: 250000
Размер encoded после OHE: (2107305, 387)
Размер блока после конкатенации: (2107305, 398)
После агрегации: (250000, 398)
После merge с таргетом: (250000, 399)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_001.parquet
⏱ Время: 87.17 сек | 🧠 Память: 508.0 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_2.pq
Прочитано строк: 2080508, уникальных id: 250000
Размер encoded после OHE: (2080508, 386)
Размер блока после конкатенации: (2080508, 397)
После агрегации: (250000, 397)
После merge с таргетом: (250000, 398)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_002.parquet
⏱ Время: 86.30 сек | 🧠 Память: 538.5 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_3.pq
Прочитано строк: 2112592, уникальных id: 250000
Размер encoded после OHE: (2112592, 389)
Размер блока после конкатенации: (2112592, 400)
После агрегации: (250000, 400)
После merge с таргетом: (250000, 401)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_003.parquet
⏱ Время: 85.33 сек | 🧠 Память: 539.7 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_4.pq
Прочитано строк: 2064110, уникальных id: 250000
Размер encoded после OHE: (2064110, 390)
Размер блока после конкатенации: (2064110, 401)
После агрегации: (250000, 401)
После merge с таргетом: (250000, 402)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_004.parquet
⏱ Время: 84.73 сек | 🧠 Память: 541.0 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_5.pq
Прочитано строк: 2150908, уникальных id: 250000
Размер encoded после OHE: (2150908, 394)
Размер блока после конкатенации: (2150908, 405)
После агрегации: (250000, 405)
После merge с таргетом: (250000, 406)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_005.parquet
⏱ Время: 87.70 сек | 🧠 Память: 539.5 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_6.pq
Прочитано строк: 2176452, уникальных id: 250000
Размер encoded после OHE: (2176452, 398)
Размер блока после конкатенации: (2176452, 409)
После агрегации: (250000, 409)
После merge с таргетом: (250000, 410)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_006.parquet
⏱ Время: 90.43 сек | 🧠 Память: 537.4 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_7.pq
Прочитано строк: 2222245, уникальных id: 250000
Размер encoded после OHE: (2222245, 393)
Размер блока после конкатенации: (2222245, 404)
После агрегации: (250000, 404)
После merge с таргетом: (250000, 405)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_007.parquet
⏱ Время: 90.23 сек | 🧠 Память: 600.2 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_8.pq
Прочитано строк: 2242615, уникальных id: 250000
Размер encoded после OHE: (2242615, 401)
Размер блока после конкатенации: (2242615, 412)
После агрегации: (250000, 412)
После merge с таргетом: (250000, 413)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_008.parquet
⏱ Время: 91.51 сек | 🧠 Память: 631.2 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_9.pq
Прочитано строк: 2284256, уникальных id: 250000
Размер encoded после OHE: (2284256, 403)
Размер блока после конкатенации: (2284256, 414)
После агрегации: (250000, 414)
После merge с таргетом: (250000, 415)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_009.parquet
⏱ Время: 92.78 сек | 🧠 Память: 575.2 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project/

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_10.pq
Прочитано строк: 2296372, уникальных id: 250000
Размер encoded после OHE: (2296372, 399)
Размер блока после конкатенации: (2296372, 410)
После агрегации: (250000, 410)
После merge с таргетом: (250000, 411)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_010.parquet
⏱ Время: 93.96 сек | 🧠 Память: 604.8 МБ
['/home/jupyter/project/train_data/train_data_0.pq', '/home/jupyter/project/train_data/train_data_1.pq', '/home/jupyter/project/train_data/train_data_2.pq', '/home/jupyter/project/train_data/train_data_3.pq', '/home/jupyter/project/train_data/train_data_4.pq', '/home/jupyter/project/train_data/train_data_5.pq', '/home/jupyter/project/train_data/train_data_6.pq', '/home/jupyter/project/train_data/train_data_7.pq', '/home/jupyter/project/train_data/train_data_8.pq', '/home/jupyter/project/train_data/train_data_9.pq', '/home/jupyter/project/train_data/train_data_10.pq', '/home/jupyter/project

Reading dataset with pandas:   0%|          | 0/1 [00:00<?, ?it/s]

chunk_path /home/jupyter/project/train_data/train_data_11.pq
Прочитано строк: 2450630, уникальных id: 250000
Размер encoded после OHE: (2450630, 403)
Размер блока после конкатенации: (2450630, 414)
После агрегации: (250000, 414)
После merge с таргетом: (250000, 415)
💾 Train chunk сохранён: /home/jupyter/project/processed_data_1/train/processed_chunk_011.parquet
⏱ Время: 100.83 сек | 🧠 Память: 614.2 МБ

🎯 Завершено! Время: 18.0 мин | Память: 614.2 МБ


## Собираем обработаные чанки и сохраняем в единый файл

In [8]:
path_to_dataset = '/home/jupyter/project/processed_data_1/train'
final_path = os.path.join(path_to_dataset, 'final_train.parquet')
target_csv_path = os.path.join(path_to_dataset, 'train_target.csv')

# === Находим все чанк-файлы ===
chunk_files = sorted([
    os.path.join(path_to_dataset, f)
    for f in os.listdir(path_to_dataset)
    if f.startswith('processed_chunk_') and f.endswith('.parquet')
])

print(f'📦 Найдено {len(chunk_files)} чанк(ов) для объединения.')

# === Сборка финального train ===
dfs = []
for f in tqdm(chunk_files, desc='Чтение чанков'):
    df = pd.read_parquet(f)
    dfs.append(df)

final_df = pd.concat(dfs, ignore_index=True)
print(f'✅ Итоговый train собран: {final_df.shape}')

# === Сохраняем финальный parquet ===
final_df.to_parquet(final_path)
print(f'💾 Сохранён: {final_path}')

📦 Найдено 12 чанк(ов) для объединения.


Чтение чанков:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Итоговый train собран: (3000000, 430)
💾 Сохранён: /home/jupyter/project/processed_data_1/train/final_train.parquet


## Создаем отдельный тестовый набор данных для проверки на моделях 

In [11]:
from sklearn.model_selection import train_test_split
save_dir_test = os.path.join('/home/jupyter/project/processed_data_1/test')
df_train, df_test = train_test_split(
            final_df,
            train_size=0.8,
            random_state=42,
            stratify=final_df['flag']
        )
print(f'📊 Train shape: {df_train.shape}, Test shape: {df_test.shape}')
test_path = os.path.join(save_dir_test, 'test.parquet')
df_test.to_parquet(test_path)
print(f'💾 Test сохранён: {test_path}')

📊 Train shape: (2400000, 430), Test shape: (600000, 430)
💾 Test сохранён: /home/jupyter/project/processed_data_1/test/test.parquet
