In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

In [2]:
def evraz_metric(y_true: pd.DataFrame, y_pred: np.array):
    """
    Model quality metric proposed by the EVRAZ organisers.

    A row counts as a hit for ``C`` when the absolute error is strictly below
    0.02, and as a hit for ``TST`` when the absolute error is strictly below 20.

    :param y_true: pd.DataFrame with the real target values in columns
        ``'C'`` and ``'TST'``.
    :param y_pred: predicted target values; passed to
        ``pd.DataFrame(..., columns=['C', 'TST'])``, so for array input the
        first column is read as ``C`` and the second as ``TST``.
    :return: tuple ``(overall, c_rate, tst_rate)``: the two per-target hit
        fractions, preceded by their average.
    """
    predicted = pd.DataFrame(data=y_pred, columns=['C', 'TST'])

    def _hits(column, tolerance):
        # 1 where the prediction is within the tolerance, 0 otherwise.
        error = np.abs(np.array(y_true[column]) - np.array(predicted[column]))
        return np.int64(error < tolerance)

    carbon_hits = _hits('C', 0.02)        # carbon content in the metal
    temperature_hits = _hits('TST', 20)   # metal temperature
    n_rows = np.size(y_true['C'])

    overall = np.sum(carbon_hits + temperature_hits) / 2 / n_rows
    return overall, np.sum(carbon_hits) / n_rows, np.sum(temperature_hits) / n_rows

In [3]:
def get_seconds(value, format):
    """
    Convert a timestamp string into whole seconds elapsed since 2001-01-01 00:00:00.

    :param value: timestamp text to parse.
    :param format: ``datetime.strptime`` format string describing ``value``.
    :return: int, the elapsed seconds rounded to one decimal place and then
        truncated toward zero by ``int``.
    """
    reference = datetime(2001, 1, 1)
    elapsed = (datetime.strptime(value, format) - reference).total_seconds()
    # Rounding to 0.1 s happens before truncation, so 10.96 s yields 11 while
    # 10.40 s yields 10.
    return int(round(elapsed, 1))

## Building the training dataset

In [4]:
# Directory that holds the competition CSV files. Every training table below
# is read from here, so running on another machine means changing one line.
DATA_DIR = '/home/varsey-fabuza/Downloads/evrazdata'

# Target values for the training set.
target_train = pd.read_csv(f'{DATA_DIR}/target_train.csv')

# Tables that are joined on NPLV later in the notebook.
lom_train = pd.read_csv(f'{DATA_DIR}/lom_train.csv')
plavki_train = pd.read_csv(f'{DATA_DIR}/plavki_train.csv')

# Remaining raw training tables. gas/produv/chronom get a numeric `timed`
# column in the following cells.
chugun_train = pd.read_csv(f'{DATA_DIR}/chugun_train.csv')
sip_train = pd.read_csv(f'{DATA_DIR}/sip_train.csv')
gas_train = pd.read_csv(f'{DATA_DIR}/gas_train.csv')
produv_train = pd.read_csv(f'{DATA_DIR}/produv_train.csv')
chronom_train = pd.read_csv(f'{DATA_DIR}/chronom_train.csv')

In [5]:
%%time

# Add an integer `timed` column (seconds since 2001-01-01, see get_seconds) to
# both frames. The gas timestamps carry fractional seconds, the produv ones do
# not, hence the two format strings.
# NOTE(review): this row-by-row strptime is the slow step timed by this cell.
produv_train['timed'] = produv_train['SEC'].apply(
    get_seconds, args=("%Y-%m-%d %H:%M:%S",)
)
gas_train['timed'] = gas_train['Time'].apply(
    get_seconds, args=("%Y-%m-%d %H:%M:%S.%f",)
)

CPU times: user 1min 38s, sys: 1.3 s, total: 1min 40s
Wall time: 1min 40s


In [6]:
# Reload the chronometry table and add `timed`: the VR_NACH timestamp
# converted to integer seconds since 2001-01-01 (see get_seconds).
# NOTE(review): the test-set counterpart of this cell derives `timed` from
# VR_KON instead of VR_NACH -- confirm the difference is intentional.
chronom_train = pd.read_csv('/home/varsey-fabuza/Downloads/evrazdata/chronom_train.csv')
chronom_train = chronom_train.assign(
    timed=chronom_train['VR_NACH']
    .map(lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S"))
    .astype('int64')
)

In [7]:
# One row per heat (NPLV): the first 'Повалка' record of each heat, reduced to
# the heat id, its VR_KON value and the numeric `timed` key.
# NOTE(review): the test-set counterpart filters on NOP == 'Продувка' instead
# -- confirm that train and test are meant to use different operations.
operations = (
    chronom_train
    .loc[chronom_train['NOP'] == 'Повалка']
    .drop_duplicates(subset=['NPLV', 'VR_NACH'])
    .drop_duplicates(subset=['NPLV'])
    .loc[:, ['NPLV', 'VR_KON', 'timed']]
)

In [8]:
# For every gas row, attach the latest produv row whose `timed` is at or
# before the gas row's `timed` (backward as-of join). Both inputs must be
# sorted on the key, hence the sort_values calls.
# NOTE(review): rows are matched on time only, with no `by=` grouping --
# verify that readings from different heats cannot be paired.
merged_data = pd.merge_asof(
    left=gas_train.sort_values('timed'),
    right=produv_train.sort_values('timed'),
    on='timed',
    direction='backward',
)

In [9]:
# Keep only the gas/produv readings whose `timed` equals, to the second, the
# `timed` of a selected operation (default inner join). Heats with no reading
# at that exact second drop out here.
essenntials = operations.sort_values(by='timed').merge(
    merged_data.sort_values(by='timed'),
    left_on='timed',
    right_on='timed',
)

In [10]:
# Heat ids that have a target value, and heat ids that ended up with features.
tar_nplv = list(target_train['NPLV'].unique())
merg_mplv = list(essenntials['NPLV'].unique())

In [11]:
# Heats that have a target but no feature row after the merges above.
set(tar_nplv) - set(merg_mplv)

{510157,
 510177,
 510414,
 510695,
 510829,
 510948,
 511031,
 511119,
 511173,
 511571}

In [12]:
# Same set difference as displayed above, materialised as a list. The element
# order follows set iteration order and is therefore not sorted.
lst = list(set(tar_nplv) - set(merg_mplv))

In [13]:
# One-column frame holding the heats that are missing from the features.
# pd.DataFrame(list) names its only column 0, which is renamed to 'NPLV'.
brak = pd.DataFrame(lst).rename(columns={0: 'NPLV'})

In [14]:
# Give every missing heat the column-wise mean of the available feature rows.
# `timed` is averaged too, so for these rows it is not a real timestamp.
brak = brak.assign(**{
    clmn: essenntials[clmn].mean()
    for clmn in [
        'V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
        'O2_pressure', 'timed', 'RAS', 'POL',
    ]
})

In [15]:
# Stack the real feature rows and the mean-filled rows, renumbering the index
# from 0.
features = pd.concat([essenntials, brak], axis=0, ignore_index=True)

In [16]:
# Attach the targets to every feature row (left join on NPLV) and keep the
# model columns only.
# NOTE(review): the next cell reassigns `dataset` from `essenntials`, so this
# result (which includes the mean-filled heats) is never saved.
dataset = pd.merge(
    features, target_train, how='left', left_on='NPLV', right_on='NPLV'
).loc[:, [
    'V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
    'O2_pressure', 'timed', 'RAS', 'POL', 'NPLV', 'TST', 'C',
]]

In [17]:
# Attach the targets to the real feature rows only (left join on NPLV) and
# keep the model columns.
# NOTE(review): this overwrites the `dataset` built from `features` in the
# previous cell, so the mean-filled heats are excluded from the training file.
# Confirm which of the two cells is intended and delete the other.
dataset = pd.merge(
    essenntials, target_train, how='left', left_on='NPLV', right_on='NPLV'
).loc[:, [
    'V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
    'O2_pressure', 'timed', 'RAS', 'POL', 'NPLV', 'TST', 'C',
]]

In [18]:
# Persist the training table to the working directory, without the index.
dataset.to_csv('train_dataset.csv', index=False)

## Building the test dataset

In [19]:
# Submission template; its NPLV column is used below as the list of heats the
# test set must contain and as their row order.
sample = pd.read_csv('/home/varsey-fabuza/Downloads/evrazdata/sample_submission.csv')

In [20]:
# Raw test tables that receive a numeric `timed` column in the next cell.
gas_test = pd.read_csv('/home/varsey-fabuza/Downloads/evrazdata/gas_test.csv')
produv_test = pd.read_csv('/home/varsey-fabuza/Downloads/evrazdata/produv_test.csv')

In [21]:
# Add an integer `timed` column (seconds since 2001-01-01, see get_seconds) to
# both test frames. The gas timestamps carry fractional seconds, the produv
# ones do not, hence the two format strings.
produv_test['timed'] = produv_test['SEC'].apply(
    get_seconds, args=("%Y-%m-%d %H:%M:%S",)
)
gas_test['timed'] = gas_test['Time'].apply(
    get_seconds, args=("%Y-%m-%d %H:%M:%S.%f",)
)

In [22]:
# For every test gas row, attach the latest produv row whose `timed` is at or
# before it (backward as-of join on sorted keys).
# This rebinds `merged_data`, replacing the training-set frame of the same name.
merged_data = pd.merge_asof(
    left=gas_test.sort_values('timed'),
    right=produv_test.sort_values('timed'),
    on='timed',
    direction='backward',
)

In [23]:
# Load the test chronometry table and add `timed`: the VR_KON timestamp
# converted to integer seconds since 2001-01-01 (see get_seconds).
# NOTE(review): the training counterpart derives `timed` from VR_NACH --
# confirm the difference is intentional.
chronom_test = pd.read_csv('/home/varsey-fabuza/Downloads/evrazdata/chronom_test.csv')
chronom_test = chronom_test.assign(
    timed=chronom_test['VR_KON']
    .map(lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S"))
    .astype('int64')
)

In [24]:
# One row per heat (NPLV): the first 'Продувка' record of each heat, reduced
# to the heat id, its VR_KON value and the numeric `timed` key.
# NOTE(review): the training counterpart filters on NOP == 'Повалка' --
# confirm that train and test are meant to use different operations.
operations = (
    chronom_test
    .loc[chronom_test['NOP'] == 'Продувка']
    .drop_duplicates(subset=['NPLV', 'VR_KON'])
    .drop_duplicates(subset=['NPLV'])
    .loc[:, ['NPLV', 'VR_KON', 'timed']]
)

In [25]:
# Keep only the test gas/produv readings whose `timed` equals, to the second,
# the `timed` of a selected operation (default inner join). Heats without such
# a reading drop out and are re-added with mean values further down.
essenntials = operations.sort_values(by='timed').merge(
    merged_data.sort_values(by='timed'),
    left_on='timed',
    right_on='timed',
)

In [26]:
# Compare the number of feature rows with the number of rows the submission
# template expects.
len(essenntials.NPLV), len(sample.NPLV),

(771, 780)

In [27]:
# Heat ids required by the submission template, and heat ids that ended up
# with features.
tar_nplv = list(sample['NPLV'].unique())
merg_mplv = list(essenntials['NPLV'].unique())

In [28]:
# Heats required by the submission template that have no feature row.
set(tar_nplv) - set(merg_mplv)

{512454, 512882, 512936, 512942, 513029, 513091, 513193, 513244, 513285}

In [29]:
# Same set difference as displayed above, materialised as a list. The element
# order follows set iteration order and is therefore not sorted.
lst = list(set(tar_nplv) - set(merg_mplv))
lst

[513091, 513029, 512454, 513285, 512936, 513193, 512942, 512882, 513244]

In [30]:
# One-column frame holding the test heats that are missing from the features.
# pd.DataFrame(list) names its only column 0, which is renamed to 'NPLV'.
brak = pd.DataFrame(lst).rename(columns={0: 'NPLV'})

In [31]:
# Give every missing test heat the column-wise mean of the available test
# feature rows. `timed` is averaged too, so for these rows it is not a real
# timestamp.
brak = brak.assign(**{
    clmn: essenntials[clmn].mean()
    for clmn in [
        'V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
        'O2_pressure', 'timed', 'RAS', 'POL',
    ]
})

In [32]:
# Stack the real test feature rows and the mean-filled rows, renumbering the
# index from 0.
features = pd.concat([essenntials, brak], axis=0, ignore_index=True)

In [33]:
# Row count after adding the mean-filled heats; compare it with the submission
# template length shown earlier.
len(features.NPLV)

780

In [34]:
# Re-order the feature rows to follow the submission template: left-join the
# features onto the template's NPLV column, then keep the model columns only.
features = pd.merge(
    sample[['NPLV']], features, how='left', left_on='NPLV', right_on='NPLV'
).loc[:, [
    'V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
    'O2_pressure', 'timed', 'RAS', 'POL', 'NPLV',
]]

In [35]:
# Persist the test table to the working directory, without the index.
features.to_csv('test_dataset.csv', index=False)

## TEST RUN

In [36]:
# Reload the two tables written above.
# From here on `target_train` holds the training features plus targets (no
# longer the raw target file) and `essenntials` holds the test features.
target_train = pd.read_csv('train_dataset.csv')
essenntials = pd.read_csv('test_dataset.csv')

# Build the datasets used for prediction in production:

In [37]:
# Enrich the test features with the plavki and lom tables, joined on NPLV.
# NOTE(review): both merges use the default inner join, so a heat absent from
# plavki_test or lom_test is dropped -- confirm the row count still matches
# the submission template afterwards.
plavki_test = pd.read_csv('/home/varsey-fabuza/Downloads/evrazdata/plavki_test.csv')
lom_test = pd.read_csv('/home/varsey-fabuza/Downloads/evrazdata/lom_test.csv')
essenntials = essenntials.merge(plavki_test, left_on='NPLV', right_on='NPLV')
# The lom join can yield several rows per heat; only the first row of each
# (NPLV, RAS) pair is kept.
essenntials = essenntials.merge(lom_test, left_on='NPLV', right_on='NPLV').drop_duplicates(subset=['NPLV', 'RAS'])
# `index` is documented as a bool; pass False explicitly, as the other to_csv
# calls in this notebook do, instead of relying on None being falsy.
essenntials.to_csv('test_ds_cleaned.csv', index=False)

In [38]:
# Enrich the training table with the lom and plavki tables, joined on NPLV.
# After each join only the first row of every (NPLV, RAS) pair is kept, so
# extra rows produced by the join are discarded.
# NOTE(review): both merges use the default inner join, so a heat absent from
# lom_train or plavki_train is dropped.
target_train = target_train.merge(lom_train, left_on='NPLV', right_on='NPLV')
target_train = target_train.drop_duplicates(subset=['NPLV', 'RAS'])
target_train = target_train.merge(plavki_train, left_on='NPLV', right_on='NPLV')
target_train = target_train.drop_duplicates(subset=['NPLV', 'RAS'])
# `index` is documented as a bool; pass False explicitly, as the other to_csv
# calls in this notebook do, instead of relying on None being falsy.
target_train.to_csv('train_ds_cleaned.csv', index=False)

In [39]:
# Treat exact zeros in the feature columns as missing and overwrite them with
# the column mean.
# NOTE(review): the mean is computed with the zeros still included, and NaN
# values are left untouched -- confirm both are acceptable.
for clm in (
    'V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
    'POL', 'O2_pressure', 'RAS',
):
    column_mean = target_train[clm].mean()
    target_train[clm] = target_train[clm].replace(0, column_mean)

In [40]:
# Same zero replacement for the test features, using the test set's own
# column means.
# NOTE(review): train and test are filled with different means (each with its
# own) -- confirm this is intended rather than reusing the training means.
for clm in (
    'V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
    'POL', 'O2_pressure', 'RAS',
):
    column_mean = essenntials[clm].mean()
    essenntials[clm] = essenntials[clm].replace(0, column_mean)

In [41]:
# Fill missing TST targets with the mean of the known ones, then display
# whether any missing value remains.
# NOTE(review): this imputes the *target*; dropping such rows may be
# preferable -- confirm.
tst_mean = target_train['TST'].mean()
target_train['TST'] = target_train['TST'].fillna(value=tst_mean)
target_train['TST'].isnull().values.any()

False

In [42]:
# Fill missing C targets with the mean of the known ones, then display
# whether any missing value remains.
# NOTE(review): this imputes the *target*; dropping such rows may be
# preferable -- confirm.
c_mean = target_train['C'].mean()
target_train['C'] = target_train['C'].fillna(value=c_mean)
target_train['C'].isnull().values.any()

False

In [43]:
# Show the number of columns and their names in the enriched training table.
target_train.shape[1], target_train.columns

(27,
 Index(['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR', 'O2_pressure', 'timed',
        'RAS', 'POL', 'NPLV', 'TST', 'C', 'VDL', 'NML', 'VES', 'plavka_VR_NACH',
        'plavka_VR_KON', 'plavka_NMZ', 'plavka_NAPR_ZAD', 'plavka_STFUT',
        'plavka_TIPE_FUR', 'plavka_ST_FURM', 'plavka_TIPE_GOL',
        'plavka_ST_GOL'],
       dtype='object'))

In [45]:
# Save the final train/test tables. Unlike the *_cleaned.csv files written
# earlier, these include the zero replacement and, for train, the target fill.
target_train.to_csv('train_ds_cleaned_limited.csv', index=False)
essenntials.to_csv('test_ds_cleaned_limited.csv', index=False)