In [1]:
pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [19]:
import os
import tqdm
import pyarrow
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import dill as pickle

from datetime import datetime
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [43]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
path = 'data_python/final_work_2/train_data/'

In [4]:
save_to_path = 'data_python/final_work_2/res/'

In [5]:
targets = pd.read_csv('data_python/final_work_2/train_target.csv')

In [6]:
def prepare_transactions_dataset(path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                                 save_to_path=None, verbose: bool=False):

    preprocessed_frames = []

    def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:


        res = []
        dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
                                  if filename.startswith('train')])

        start_from = max(0, start_from)
        chunks = dataset_paths[start_from: start_from + num_parts_to_read]
        if verbose:
            print('Reading chunks:\n')
            for chunk in chunks:
                print(chunk)
        for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):
            print('chunk_path', chunk_path)
            chunk = pd.read_parquet(chunk_path,columns=columns)
            res.append(chunk)

        return pd.concat(res).reset_index(drop=True)
    
    for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),
                                   desc="Transforming transactions data"):
        transactions_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once,
                                                             verbose=verbose)

        if save_to_path:
            block_as_str = str(step)
            if len(block_as_str) == 1:
                block_as_str = '00' + block_as_str
            else:
                block_as_str = '0' + block_as_str
            transactions_frame.to_parquet(os.path.join(save_to_path, f'processed_chunk_{block_as_str}.parquet'))

        preprocessed_frames.append(transactions_frame)
    return pd.concat(preprocessed_frames)

In [7]:
df = prepare_transactions_dataset(path, num_parts_to_preprocess_at_once=12, num_parts_total=2,
                                  save_to_path=save_to_path)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm.tqdm_notebook(range(0, num_parts_total, num_parts_to_preprocess_at_once),


Transforming transactions data:   0%|          | 0/1 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for chunk_path in tqdm.tqdm_notebook(chunks, desc="Reading dataset with pandas"):


Reading dataset with pandas:   0%|          | 0/12 [00:00<?, ?it/s]

chunk_path data_python/final_work_2/train_data/train_data_0.pq
chunk_path data_python/final_work_2/train_data/train_data_1.pq
chunk_path data_python/final_work_2/train_data/train_data_10.pq
chunk_path data_python/final_work_2/train_data/train_data_11.pq
chunk_path data_python/final_work_2/train_data/train_data_2.pq
chunk_path data_python/final_work_2/train_data/train_data_3.pq
chunk_path data_python/final_work_2/train_data/train_data_4.pq
chunk_path data_python/final_work_2/train_data/train_data_5.pq
chunk_path data_python/final_work_2/train_data/train_data_6.pq
chunk_path data_python/final_work_2/train_data/train_data_7.pq
chunk_path data_python/final_work_2/train_data/train_data_8.pq
chunk_path data_python/final_work_2/train_data/train_data_9.pq


In [8]:
def transformation(df):
    for m in ['pre_since_confirmed',
          'pre_pterm',
          'pre_fterm',
          'pre_till_fclose',
          'pre_loans_credit_limit']:
        data_val = df[m].value_counts()
        for a, b in zip(data_val.index, data_val.values):    
            if b < data_val.sum()/(len(data_val)*2):
                df[m] = df[m].replace(a, a+1)
            
    for m in df.iloc[:, 30:55].columns:
        data_val = df[m].value_counts()
        for a, b in zip(data_val.index, data_val.values):    
            f = data_val.max()

            if data_val.index[0] == 0:
                if b == f:
                    max_0 = a
                else:
                    df[m] = df[m].replace(a, 1)
            else:
                if b == f:
                    max_1 = a
                else:
                    df[m] = df[m].replace(a, 0)

        for a, b in zip(data_val.index, data_val.values):
            f = data_val.max()
            if b == f and a != 0:
                df[m] = df[m].replace(a, 1)
    
    df = df.drop(['rn', 'pre_loans_total_overdue'], axis=1)
    
    df_encode_1 = df['id']
    columns_l = list(df.columns.values[1:6])
    df_1 = pd.get_dummies(df[columns_l], columns=columns_l, drop_first = True)
    dummy_signs = df_1.columns.values
    df_concat = pd.concat([df_encode_1, df_1], axis=1)
    df_encode_1 = df_concat.groupby("id")[dummy_signs].sum().reset_index(drop=False)
    
    df_encode_2 = df['id']
    columns_l = list(df.columns.values[6:13])
    df_2 = pd.get_dummies(df[columns_l], columns=columns_l, drop_first = True)
    dummy_signs = df_2.columns.values
    df_concat = pd.concat([df_encode_2, df_2], axis=1)
    df_encode_2 = df_concat.groupby("id")[dummy_signs].sum().reset_index(drop=False)
    
    df_encode_3 = df['id']
    columns_l = list(df.columns.values[13:25])
    df_3 = pd.get_dummies(df[columns_l], columns=columns_l, drop_first = True)
    dummy_signs = df_3.columns.values
    df_concat = pd.concat([df_encode_3, df_3], axis=1)
    df_encode_3 = df_concat.groupby("id")[dummy_signs].sum().reset_index(drop=False)
    
    df_encode_4 = df['id']
    columns_l = list(df.columns.values[25:59])
    df_4 = pd.get_dummies(df[columns_l], columns=columns_l, drop_first = True)
    dummy_signs = df_4.columns.values
    df_concat = pd.concat([df_encode_4, df_4], axis=1)
    df_encode_4 = df_concat.groupby("id")[dummy_signs].sum().reset_index(drop=False)
    
    df_result = pd.concat([df_encode_1, df_encode_2, df_encode_3, df_encode_4, targets], axis=1).drop(columns='id')
    
#     print(f'Число столбцов в df - {len(df_result.columns)}')
    
    return df_result

In [9]:
def train_test(df_result):
    
    X, y = df_result.drop(['flag'], axis=1), df_result['flag']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    return X_train, X_test, y_train, y_test

In [10]:
class MyDataset(Dataset):
    def __init__(self, X, y):

        self.X = torch.from_numpy(X.to_numpy().astype(np.float32))
        self.y = torch.from_numpy(y.to_numpy().astype(np.float32))

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return self.X.shape[0]
    
class MyNet(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.hidden_1 = nn.Linear(303, int(303*2))
        self.f1 = nn.ReLU()
        self.hidden_2 = nn.Linear(int(303*2), int(303*2)//4)
        self.f2 = nn.ReLU()
        self.hidden_3 = nn.Linear(int(303*2)//4, (int(303*2)//4)//4)
        self.f3 = nn.ReLU()
        self.hidden_4 = nn.Linear((int(303*2)//4)//4, ((int(303*2)//4)//4)//4)        
        self.f4 = nn.ReLU()        
        self.output = nn.Linear(((int(303*2)//4)//4)//4, 1)
        self.f5 = nn.Sigmoid()
    
    def forward(self, x):
        x = self.f1(self.hidden_1(x))
        x = self.f2(self.hidden_2(x))
        x = self.f3(self.hidden_3(x))
        x = self.f4(self.hidden_4(x))
        x = self.f5(self.output(x))
        return x
     
    def fit(self, X_train, lr=0.001, num_epochs=100):

        loss_fn = nn.BCELoss()
        optimizer = torch.optim.SGD(my_net.parameters(), lr=0.001)

        train_dataloader = DataLoader(
            MyDataset(X_train[0], X_train[2]),
            batch_size=128,
            shuffle=True) 
        
        for epoch in range(num_epochs):    
            for X, y in train_dataloader:
                pred = my_net(X)
                loss = loss_fn(pred, y.unsqueeze(-1))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        print(f'ROC AUC train - {roc_auc_score(X_train[2], my_net(torch.from_numpy(X_train[0].to_numpy().astype(np.float32))).detach().numpy())}')        
        return my_net

    def predict(self, X_train):
        with torch.no_grad():
            y_pred = my_net(torch.from_numpy(X_train[1].to_numpy().astype(np.float32)))
        print(f'ROC AUC test - {roc_auc_score(X_train[3], y_pred.detach().numpy())}')
        return y_pred

my_net = MyNet()

In [11]:
model = MyNet()

In [12]:
final_pipe = Pipeline([
    ('transformation', FunctionTransformer(transformation)),
    ('train_test', FunctionTransformer(train_test)),
    ('nn_model', model)
])

In [13]:
y_pred_train = final_pipe.fit(df)

ROC AUC train - 0.7686966092687515


In [14]:
y_pred_test = final_pipe.predict(df)

ROC AUC test - 0.7593837363087456


In [44]:
with open('credit_risk_model.pkl', 'wb') as file:
    pickle.dump({
        'model': final_pipe,
        'metadata': {
            'name': "credit_risk_model",
            'author': 'Denis Shkaraburov',
            'version': 1,
            'date': datetime.now()
        }
    }, file, recurse=True)

In [39]:
y_pred = pd.DataFrame(y_pred_test.detach().numpy(), columns=["y_pred"])
y_pred.head()

Unnamed: 0,y_pred
0,0.003342
1,0.051365
2,0.037902
3,0.053005
4,0.01591


In [42]:
y_pred.to_csv('y_pred.csv')