In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

In [None]:
def evraz_metric(y_true: pd.DataFrame, y_pred: np.array):
    """
    Model quality metric proposed by the EVRAZ organisers.

    A row is a "hit" for carbon when |C_true - C_pred| < 0.02 and a "hit" for
    temperature when |TST_true - TST_pred| < 20.

    :param y_true: pd.DataFrame with the actual target values in columns 'C' and 'TST'.
    :param y_pred: predicted target values; wrapped into a DataFrame with
        columns ['C', 'TST'] (a 2-column array or a DataFrame holding these columns).
    :return: tuple (mean of both hit rates, carbon hit rate, temperature hit rate).
    """
    predicted = pd.DataFrame(data=y_pred, columns=['C', 'TST'])
    # Absolute-error tolerance per target: carbon content and metal temperature.
    tolerances = {'C': 0.02, 'TST': 20}
    hits = {
        name: (np.abs(np.asarray(y_true[name]) - np.asarray(predicted[name])) < limit).astype(np.int64)
        for name, limit in tolerances.items()
    }
    n_rows = np.size(y_true['C'])
    return (
        np.sum(hits['C'] + hits['TST']) / 2 / n_rows,
        np.sum(hits['C']) / n_rows,
        np.sum(hits['TST']) / n_rows,
    )

In [None]:
def get_seconds(value, format):
    """
    Convert a timestamp string to whole seconds elapsed since 2001-01-01 00:00:00.

    :param value: timestamp string to parse.
    :param format: strptime format describing ``value``.
    :return: int, elapsed seconds (rounded to one decimal, then truncated by int()).
    """
    elapsed = datetime.strptime(value, format) - datetime(2001, 1, 1)
    return int(round(elapsed.total_seconds(), 1))

## Making the train dataset

In [None]:
# Load the raw training tables; all of them live in the same local data directory.
target_train = pd.read_csv('Downloads/evrazdata/target_train.csv')

lom_train = pd.read_csv('Downloads/evrazdata/lom_train.csv')
# Read from the same 'Downloads/evrazdata/' directory as every other table
# (an absolute '/evrazdata/...' path would point at the filesystem root).
plavki_train = pd.read_csv('Downloads/evrazdata/plavki_train.csv')

chugun_train = pd.read_csv('Downloads/evrazdata/chugun_train.csv')
sip_train = pd.read_csv('Downloads/evrazdata/sip_train.csv')
gas_train = pd.read_csv('Downloads/evrazdata/gas_train.csv')
produv_train = pd.read_csv('Downloads/evrazdata/produv_train.csv')
chronom_train = pd.read_csv('Downloads/evrazdata/chronom_train.csv')

In [None]:
%%time

# Turn the timestamp strings into integer seconds (see get_seconds) so both
# tables carry a numeric 'timed' key.
produv_train['timed'] = produv_train['SEC'].map(
    lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S")
)
gas_train['timed'] = gas_train['Time'].map(
    lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S.%f")
)

In [None]:
# Reload the chronometry table and key it by the operation start time (VR_NACH)
# expressed as int64 seconds.
chronom_train = pd.read_csv('Downloads/evrazdata/chronom_train.csv')
chronom_train['timed'] = (
    chronom_train['VR_NACH']
    .map(lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S"))
    .astype('int64')
)

In [None]:
# Keep one row per NPLV among the chronometry rows whose NOP is 'Повалка'
# (the first such row in table order survives the de-duplication).
operations = (
    chronom_train.loc[chronom_train['NOP'] == 'Повалка']
    .drop_duplicates(subset=['NPLV', 'VR_NACH'])
    .drop_duplicates(subset=['NPLV'])
    [['NPLV', 'VR_KON', 'timed']]
)

In [None]:
# As-of join on time: each gas row gets the latest produv row whose 'timed'
# is less than or equal to its own. Both inputs must be sorted by the key.
merged_data = pd.merge_asof(
    left=gas_train.sort_values('timed'),
    right=produv_train.sort_values('timed'),
    on='timed',
    direction='backward',
)

In [None]:
# Inner join: keep the gas/produv readings whose 'timed' exactly matches the
# selected operation timestamp of each NPLV.
essenntials = operations.sort_values('timed').merge(
    merged_data.sort_values('timed'),
    on='timed',
)

In [None]:
# Attach the targets (TST, C) by NPLV and keep only the modelling columns.
dataset = essenntials.merge(target_train, how='left', on='NPLV')[
    ['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR', 'O2_pressure',
     'timed', 'RAS', 'POL', 'NPLV', 'TST', 'C']
]

In [None]:
# Persist the assembled training table (features plus TST/C) without the index column.
dataset.to_csv('dataset.csv', index=False)

## Making the test dataset

In [None]:
# Sample submission; its NPLV column defines which rows the test dataset must contain.
# NOTE(review): this path ('evrazdata/...') lacks the 'Downloads/' prefix used by
# the other reads in this notebook — confirm it resolves from the working directory.
sample = pd.read_csv('evrazdata/sample_submission.csv')

In [None]:
# Raw test-period tables: oxygen blow records and gas-analysis records.
produv_test = pd.read_csv('Downloads/evrazdata/produv_test.csv')
gas_test = pd.read_csv('Downloads/evrazdata/gas_test.csv')

In [None]:
# Same timestamp-to-seconds conversion as for the training tables.
produv_test['timed'] = produv_test['SEC'].map(
    lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S")
)
gas_test['timed'] = gas_test['Time'].map(
    lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S.%f")
)

In [None]:
# As-of join on time for the test tables; note that this rebinds `merged_data`,
# replacing the training-period join result.
merged_data = pd.merge_asof(
    left=gas_test.sort_values('timed'),
    right=produv_test.sort_values('timed'),
    on='timed',
    direction='backward',
)

In [None]:
# Test chronometry keyed by the operation END time (VR_KON) as int64 seconds.
# NOTE(review): the training table is keyed by VR_NACH instead — confirm this
# asymmetry is intended.
chronom_test = pd.read_csv('Downloads/evrazdata/chronom_test.csv')
chronom_test['timed'] = (
    chronom_test['VR_KON']
    .map(lambda stamp: get_seconds(stamp, "%Y-%m-%d %H:%M:%S"))
    .astype('int64')
)

In [None]:
# Keep one row per NPLV among the chronometry rows whose NOP is 'Продувка'
# (the first such row in table order survives the de-duplication).
operations = (
    chronom_test.loc[chronom_test['NOP'] == 'Продувка']
    .drop_duplicates(subset=['NPLV', 'VR_KON'])
    .drop_duplicates(subset=['NPLV'])
    [['NPLV', 'VR_KON', 'timed']]
)

In [None]:
# Inner join of the selected operations with the gas/produv readings on the
# exact 'timed' value. NOTE: a trailing .dropna() is left disabled here.
essenntials = operations.sort_values('timed').merge(
    merged_data.sort_values('timed'),
    on='timed',
)

In [None]:
# Compare how many rows were matched against how many the sample submission expects.
len(essenntials.NPLV), len(sample.NPLV),

In [None]:
# Distinct NPLV values: required by the submission vs. actually present after the join.
tar_nplv = list(sample.NPLV.unique())
merg_mplv = list(essenntials.NPLV.unique())

In [None]:
# NPLV values required by the sample submission that have no row in `essenntials`.
set(tar_nplv) - set(merg_mplv)

In [None]:
# Same set difference as above, materialised as a list for building a DataFrame;
# the list order is whatever the set iteration yields.
lst = list(set(tar_nplv) - set(merg_mplv))
lst

In [None]:
# One-column frame listing the NPLV values that are missing from `essenntials`.
brak = pd.DataFrame(lst).rename(columns={0: 'NPLV'})

In [None]:
# Give every missing NPLV the column means of the matched rows as stand-in features.
brak = brak.assign(**{
    clmn: essenntials[clmn].mean()
    for clmn in ['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR',
                 'O2_pressure', 'timed', 'RAS', 'POL']
})

In [None]:
# Stack matched rows and mean-filled placeholder rows under a fresh 0..n-1 index.
features = pd.concat([essenntials, brak], ignore_index=True)

In [None]:
# Row count after adding the placeholder rows.
len(features.NPLV)

In [None]:
# Re-order the feature rows to follow the sample submission's NPLV order and
# keep only the modelling columns.
features = sample[['NPLV']].merge(features, how='left', on='NPLV')[
    ['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR', 'O2_pressure',
     'timed', 'RAS', 'POL', 'NPLV']
]

In [None]:
# Persist the test feature table without the index column.
features.to_csv('test_dataset.csv', index=False)

## TEST RUN

In [None]:
# Reload the two tables written by the dataset-building cells above
# ('dataset.csv' and 'test_dataset.csv'), using paths relative to the working
# directory instead of a machine-specific absolute path.
# Note: from here on `target_train` holds the assembled training table
# (features + TST/C) and `essenntials` holds the test features.
target_train = pd.read_csv('dataset.csv')
essenntials = pd.read_csv('test_dataset.csv')

# ЗДЕСЬ ПОЛУЧАЕМ ДАТАСЕТЫ ДЛЯ ПРЕДИКТА НА ПРОДЕ:

In [None]:
# Inner-join the test features with the plavki table on NPLV and save the result.
# NOTE(review): 'data/plavki_test.csv' uses a different directory than the other
# reads in this notebook — confirm the path.
plavki_test = pd.read_csv('data/plavki_test.csv')
essenntials = essenntials.merge(plavki_test, on='NPLV')
essenntials.to_csv('test_ds_cleaned.csv', index=None)

In [None]:
# Inner-join the training table with the lom and plavki tables on NPLV; after each
# join collapse the rows back to one per (NPLV, RAS) pair, then save the result.
target_train = (
    target_train
    .merge(lom_train, on='NPLV')
    .drop_duplicates(subset=['NPLV', 'RAS'])
    .merge(plavki_train, on='NPLV')
    .drop_duplicates(subset=['NPLV', 'RAS'])
)
target_train.to_csv('train_ds_cleaned.csv', index=None)

In [None]:
# Keep only the feature columns, the two targets and the NPLV key.
train_columns = ['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR', 'POL',
                 'O2_pressure', 'RAS', 'TST', 'C', 'NPLV']
target_train = target_train[train_columns]

In [None]:
# Keep only the feature columns and the NPLV key (the test table has no targets).
test_columns = ['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR', 'POL',
                'O2_pressure', 'RAS', 'NPLV']
essenntials = essenntials[test_columns]

In [None]:
# Quick look at the first rows of the training table.
target_train.head()

In [None]:
# Replace zeros (and NaNs) in every feature column with that column's mean.
# mask()/fillna() return new Series, so the result has to be assigned back;
# calling them without assignment leaves target_train unchanged.
# The mean is computed on the column as it is before masking (zeros included).
feature_columns = ['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR', 'POL', 'O2_pressure', 'RAS']
target_train = target_train.assign(**{
    clm: target_train[clm].mask(target_train[clm] == 0).fillna(target_train[clm].mean())
    for clm in feature_columns
})

In [None]:
# Replace zeros (and NaNs) in every test feature column with that column's mean.
# mask()/fillna() return new Series, so the result has to be assigned back;
# calling them without assignment leaves essenntials unchanged.
# The mean is computed on the column as it is before masking (zeros included).
feature_columns = ['V', 'T', 'O2', 'N2', 'H2', 'CO2', 'CO', 'AR', 'POL', 'O2_pressure', 'RAS']
essenntials = essenntials.assign(**{
    clm: essenntials[clm].mask(essenntials[clm] == 0).fillna(essenntials[clm].mean())
    for clm in feature_columns
})

In [None]:
# Fill missing TST targets with the mean TST, then confirm no NaN is left.
tst_mean = target_train['TST'].mean()
target_train['TST'] = target_train['TST'].fillna(tst_mean)
target_train['TST'].isna().to_numpy().any()

In [None]:
# Fill missing C targets with the mean C, then confirm no NaN is left.
c_mean = target_train['C'].mean()
target_train['C'] = target_train['C'].fillna(c_mean)
target_train['C'].isna().to_numpy().any()

In [None]:
# Column count and column names of the training table.
target_train.shape[1], target_train.columns

In [None]:
# Shared scaler instance, reused (and refitted) by the modelling cells below.
sc = StandardScaler()
# Alias of the training table used by the modelling cells (same object, not a copy).
dummy_df = target_train
# Number of columns minus three; not referenced by the cells visible below.
last_col = target_train.shape[1] - 3

In [None]:
# Carbon ('C') model on a hold-out split.
# Features: O2, N2 and log(AR); target: log(C) (predictions are exp()-ed back later).
features_c = np.array(pd.concat([dummy_df[['O2', 'N2']],
                                 np.log(dummy_df[['AR']])], axis=1))
target_c = np.log(np.array(dummy_df[['C']]))

features_train_c, features_test_c, target_train_c, target_test_c = train_test_split(features_c,
                                                                                    target_c,
                                                                                    random_state=0)

# Fit the scaler on the training split only and apply the SAME statistics to the
# hold-out split. Refitting on the hold-out split would scale it with its own
# mean/std, so identical raw values would map to different model inputs.
features_train_c = sc.fit_transform(features_train_c)
features_test_c = sc.transform(features_test_c)

baseline_c = CatBoostRegressor(verbose=0, eval_metric='RMSE', random_seed = 1, n_estimators = 110, subsample = 1, max_depth = 1)
baseline_c.fit(features_train_c, target_train_c)

In [None]:
# Temperature ('TST') model on a hold-out split.
# Features: T, H2, CO2, CO; target: TST (no transformation).
features_tst = np.array(dummy_df[['T', 'H2', 'CO2', 'CO',]])
target_tst = np.array(dummy_df[['TST']])

features_train_tst, features_test_tst, target_train_tst, target_test_tst = train_test_split(features_tst,
                                                                                            target_tst,
                                                                                            random_state=0)
# Fit the scaler on the training split only and apply the SAME statistics to the
# hold-out split. Refitting on the hold-out split would scale it with its own
# mean/std, so identical raw values would map to different model inputs.
features_train_tst = sc.fit_transform(features_train_tst)
features_test_tst = sc.transform(features_test_tst)

baseline_tst = CatBoostRegressor(verbose=0, eval_metric='RMSE', random_seed = 1, n_estimators = 75)
baseline_tst.fit(features_train_tst, target_train_tst)

In [None]:
# Both estimators were already fitted in the two cells above; this fits them again
# on the same training splits and binds the values returned by fit() to
# model_tst / model_c, which the evaluation cells below use.
model_tst = baseline_tst.fit(features_train_tst, target_train_tst)
model_c = baseline_c.fit(features_train_c, target_train_c)

In [None]:
# res[0]: hold-out predictions, res[1]: hold-out ground truth.
# Carbon was modelled in log space, so both its predictions and targets are exp()-ed back.
res = (
    pd.concat(
        [
            pd.DataFrame(np.exp(model_c.predict(features_test_c) * 1.00), columns=['C']),
            pd.DataFrame(model_tst.predict(features_test_tst) * 1.00, columns=['TST']),
        ],
        axis=1,
    ),
    pd.concat(
        [
            pd.DataFrame(np.exp(target_test_c), columns=['C']),
            pd.DataFrame(target_test_tst, columns=['TST']),
        ],
        axis=1,
    ),
)

In [None]:
# First rows of the hold-out predictions.
res[0].head()

In [None]:
# evraz_metric expects (y_true, y_pred): res[1] holds the ground truth and res[0]
# the predictions. The metric only uses absolute differences, so the score is the
# same either way, but the call now matches the function's signature.
evraz_metric(res[1], res[0])

In [None]:
# Recorded result of an earlier run, presumably the evraz_metric output above (overall, C, TST):
# (0.626937984496124, 0.6996124031007752, 0.5542635658914729)

In [None]:
# Histograms of the hold-out ground truth (C converted back from log space).
pd.concat(
    [
        pd.DataFrame(np.exp(target_test_c), columns=['C']),
        pd.DataFrame(target_test_tst, columns=['TST']),
    ],
    axis=1,
).hist()

In [None]:
# Histograms of the hold-out predictions (C converted back from log space).
pd.concat(
    [
        pd.DataFrame(np.exp(baseline_c.predict(features_test_c) * 1.0), columns=['C']),
        pd.DataFrame(baseline_tst.predict(features_test_tst) * 1.0, columns=['TST']),
    ],
    axis=1,
).hist()

## battle run

In [None]:
# Full-data feature matrix (O2, N2, log(AR)) and log(C) target for the final carbon model.
features_c = pd.concat(
    [dummy_df[['O2', 'N2']], np.log(dummy_df[['AR']])],
    axis=1,
).to_numpy()
target_c = np.log(dummy_df[['C']].to_numpy())

# NOTE: the scaled copy below is not what the fit cell uses; that cell fits on
# the unscaled `features_c`.
features_train_c = sc.fit_transform(features_c)

battle_c = CatBoostRegressor(
    verbose=0,
    eval_metric='RMSE',
    random_seed=1,
    n_estimators=110,
    subsample=1,
    max_depth=1,
)

In [None]:
# Full-data feature matrix (T, H2, CO2, CO) and TST target for the final temperature model.
features_tst = dummy_df[['T', 'H2', 'CO2', 'CO']].to_numpy()
target_tst = dummy_df[['TST']].to_numpy()

# NOTE: the scaled copy below is not what the fit cell uses; that cell fits on
# the unscaled `features_tst`.
features_train_tst = sc.fit_transform(features_tst)

battle_tst = CatBoostRegressor(
    verbose=0,
    eval_metric='RMSE',
    random_seed=1,
    n_estimators=75,
)

In [None]:
# Fit the final models on the full, UNSCALED feature matrices; the test-time
# matrices built in the next cell are likewise left unscaled.
battle_model_c = battle_c.fit(features_c, target_c)
battle_model_tst = battle_tst.fit(features_tst, target_tst)

In [None]:
# Test-set feature matrices, built with the same columns and transforms as the
# training matrices of the corresponding model.
features_c_test = pd.concat(
    [essenntials[['O2', 'N2']], np.log(essenntials[['AR']])],
    axis=1,
).to_numpy()

features_tst_test = essenntials[['T', 'H2', 'CO2', 'CO']].to_numpy()

In [None]:
# Preview of the test predictions next to their NPLV (columns aligned on the row index).
pd.concat(
    [
        essenntials[['NPLV']],
        pd.DataFrame(np.exp(battle_model_c.predict(features_c_test) * 1.00), columns=['C']),
        pd.DataFrame(battle_model_tst.predict(features_tst_test) * 1.00, columns=['TST']),
    ],
    axis=1,
)

In [None]:
# Submission frame with columns in the order NPLV, TST, C. This rebinds `res`,
# which previously held the hold-out (predictions, truth) tuple.
res = pd.concat(
    [
        essenntials[['NPLV']],
        pd.DataFrame(battle_model_tst.predict(features_tst_test) * 1.00, columns=['TST']),
        pd.DataFrame(np.exp(battle_model_c.predict(features_c_test) * 1.00), columns=['C']),
    ],
    axis=1,
)

In [None]:
# Spot-check the prediction row(s) for a single NPLV value.
res[res.NPLV == 513374]

In [None]:
# Display the submission frame.
res

In [None]:
# Write the submission file without the index column.
res.to_csv('subm2.csv', index=False)