**Важно**


Bitwise properties:

float16 : 1 sign bit, 5 exponent bit, 10-bit significand (fractional part).

float32 : 1 sign bit, 8 exponent bit, and 23-bit significand (fractional part).

float64 : 1 sign bit, 11 exponent bits, and 52 fraction bits.

float128 : 1 sign bit, 15 exponent bits, and 112 fraction bits.

In [28]:
import numpy as np
import decimal

import pandas as pd
pd.set_option('display.float_format', lambda x: '{:.2f}'.format(x))

from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import time

In [29]:
train_df = pd.read_parquet("../data/raw/train_data.pqt")
test_df = pd.read_parquet("../data/raw/test_data.pqt")


cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]


train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

### Объединение датасета 

In [30]:
df = pd.concat([train_df, test_df])

In [31]:
df[:len(train_df)]

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.74,0.71,1.29,0.75,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.95,0.57,0.90,0.55,0.77,0.94,0.30,0.97,"{α, γ}",{other}
1,0,month_2,1.05,0.83,2.46,1.05,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.95,0.50,0.79,0.55,0.70,0.99,0.30,0.95,"{α, γ}",{other}
2,0,month_3,0.69,0.74,0.43,0.70,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.95,0.44,0.88,0.55,0.66,0.81,0.29,0.96,"{α, γ}",{other}
3,1,month_1,-0.08,-0.09,-0.11,-0.08,channel_code_2,city_14,city_type_0,,...,0.95,0.41,0.37,0.57,0.79,-0.18,0.25,0.46,{other},{other}
4,1,month_2,-0.09,-0.10,-0.12,-0.09,channel_code_2,city_14,city_type_0,,...,0.95,0.43,0.07,0.56,0.70,-0.18,0.26,0.50,{other},{other}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,199998,month_2,-0.16,-0.20,-0.13,-0.16,channel_code_9,city_14,city_type_0,,...,,,-0.17,,,-0.20,,,{},{}
599996,199998,month_3,-0.16,-0.20,-0.13,-0.16,channel_code_9,city_14,city_type_0,,...,0.94,0.38,-0.17,0.55,0.41,-0.20,0.25,0.37,{α},{}
599997,199999,month_1,-0.16,-0.20,-0.13,-0.16,channel_code_14,city_1876,city_type_0,index_city_code_195,...,,,-0.17,,,-0.20,,,{},{}
599998,199999,month_2,-0.16,-0.20,-0.13,-0.16,channel_code_14,city_1876,city_type_0,index_city_code_195,...,,,-0.17,,,-0.20,,,{},{}


### Преобразование типов

In [32]:
d = decimal.Decimal(float(df['sum_deb_h_oper_3m'][0].values[0]))
d.as_tuple().exponent

-52

In [33]:
df

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.74,0.71,1.29,0.75,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.95,0.57,0.90,0.55,0.77,0.94,0.30,0.97,"{α, γ}",{other}
1,0,month_2,1.05,0.83,2.46,1.05,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.95,0.50,0.79,0.55,0.70,0.99,0.30,0.95,"{α, γ}",{other}
2,0,month_3,0.69,0.74,0.43,0.70,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.95,0.44,0.88,0.55,0.66,0.81,0.29,0.96,"{α, γ}",{other}
3,1,month_1,-0.08,-0.09,-0.11,-0.08,channel_code_2,city_14,city_type_0,,...,0.95,0.41,0.37,0.57,0.79,-0.18,0.25,0.46,{other},{other}
4,1,month_2,-0.09,-0.10,-0.12,-0.09,channel_code_2,city_14,city_type_0,,...,0.95,0.43,0.07,0.56,0.70,-0.18,0.26,0.50,{other},{other}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290115,299998,month_5,,,,,,,,,...,,,-0.17,,,-0.20,,,{},
290116,299998,month_6,-0.16,-0.20,-0.13,-0.16,channel_code_9,city_25,city_type_0,index_city_code_30,...,0.94,0.38,-0.17,0.55,0.41,-0.20,0.25,0.37,,
290117,299999,month_4,-0.15,-0.17,-0.13,-0.15,channel_code_9,city_6,city_type_0,index_city_code_34,...,0.94,0.38,-0.10,0.55,0.57,-0.10,0.25,0.44,{α},
290118,299999,month_5,-0.14,-0.16,-0.12,-0.14,channel_code_9,city_6,city_type_0,index_city_code_34,...,0.94,0.38,-0.05,0.56,0.70,-0.03,0.25,0.47,{α},


In [10]:
def count_decimal_places(num):
    d = decimal.Decimal(float(num))
    res = d.as_tuple().exponent
    if 'n' in str(res):
        print(res)
    return res



In [11]:
nums_after_point = []
for column in df.select_dtypes(include='number'):
    max_decimal_places = df[column].dropna().apply(count_decimal_places)
    nums_after_point.append(max_decimal_places.max())




KeyboardInterrupt



In [37]:
print(nums_after_point) # Колличество знаков после запятой 

[0, -32, -35, -37, -37, -44, -50, -38, -39, -40, -46, -35, -34, -36, -37, -47, -37, -46, -37, -43, -38, -45, -34, -39, -37, -42, -47, -34, -41, -48, -36, -43, -45, -39, -49, -46, -36, -45, -43, -40, -45, -47, -33, -39, -48, -36, -42, -44, -36, -47, -37, -48, -38, -41, -37, -44, -35, -45, -36, -43, -45, -35, -39, -45, -36, -46, -42, -37, -45, -47, -37, -44, -45, -35, -41, -45, -37, -41, -44, -36, -44, -46]


In [6]:
(df['id'].astype('int32') == df['id'].astype('int64')).value_counts()

id
True    890120
Name: count, dtype: int64

### Float32

In [21]:
# int16 от -2,147,483,648 до 2,147,483,647
# float32 
df['id'] = df['id'].astype('int32')

float64_columns = df.select_dtypes(include=['float64']).columns
df[float64_columns] = df[float64_columns].astype('float32')

### Float16

In [34]:
# int16 от -2,147,483,648 до 2,147,483,647
# float16 от -65504 до 65504
df['id'] = df['id'].astype('int32')

float64_columns = df.select_dtypes(include=['float64']).columns
df[float64_columns] = df[float64_columns].astype('float16')

### Разеделние датасета

In [36]:
new_train_df = df.iloc[:len(train_df)]
new_test_df = df.iloc[len(train_df):]

### Проверка потери

In [37]:
new_train_df.equals(train_df), new_test_df.equals(test_df)

(False, False)

### Сохранение датасета

In [39]:
new_train_df.to_parquet("../data/interim/train_data_lm_16.pqt")
new_test_df.to_parquet("../data/interim/test_data_lm_16.pqt")