In [1]:
from sklearn.metrics import auc, roc_auc_score, roc_curve, recall_score, precision_score
import warnings
from itertools import combinations
from sklearn.metrics import confusion_matrix, cohen_kappa_score, f1_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, MissingIndicator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from pandas import concat
from datetime import datetime, timedelta
%matplotlib inline


warnings.filterwarnings('ignore')

In [2]:
# Fix RANDOM_SEED so that experiments are reproducible:
RANDOM_SEED = 41

# Предобработка

In [70]:
def update_produv(produv_train):
    """Collapse the ``produv`` readings into one row per ``NPLV``.

    Parameters
    ----------
    produv_train : pandas.DataFrame
        Must contain the columns ``NPLV``, ``SEC``, ``RAS`` and ``POL``.
        ``SEC`` may hold datetimes or timestamp strings (as produced by a
        plain ``read_csv``); it is parsed on a copy, the input frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``NPLV`` (sorted ascending) with the columns

        * ``PRODUVKA_START``   - earliest ``SEC``
        * ``PRODUVKA_END``     - latest ``SEC``
        * ``RASHOD_SUM``       - sum of ``RAS``
        * ``POL_MAX``          - largest ``POL``
        * ``POL_MIN``          - smallest ``POL``
        * ``PRODUVKA_TIME``    - ``PRODUVKA_END - PRODUVKA_START`` (timedelta)
        * ``PRODUVKA_TIME,SEC``- the same duration as whole seconds (int64)

        Groups without any valid ``SEC`` value are omitted.
    """
    # Narrow copy: keeps the caller's frame untouched while SEC is parsed.
    readings = produv_train[['NPLV', 'SEC', 'RAS', 'POL']].copy()
    readings['SEC'] = pd.to_datetime(readings['SEC'])

    # Named aggregation yields the final column names directly, so nothing
    # depends on column positions or on merge suffixes.
    produv_upd = (
        readings.groupby('NPLV')
        .agg(
            PRODUVKA_START=('SEC', 'min'),
            PRODUVKA_END=('SEC', 'max'),
            RASHOD_SUM=('RAS', 'sum'),
            POL_MAX=('POL', 'max'),
            POL_MIN=('POL', 'min'),
        )
        .reset_index()
    )

    # A group whose SEC values are all missing has no start time; leave it out
    # so that the duration below is always defined.
    produv_upd = produv_upd.dropna(subset=['PRODUVKA_START']).reset_index(drop=True)

    duration = produv_upd['PRODUVKA_END'] - produv_upd['PRODUVKA_START']
    produv_upd['PRODUVKA_TIME'] = duration
    # total_seconds() counts whole days too; `.dt.seconds` would only report
    # the sub-day remainder and wrap any duration of 24 hours or more.
    produv_upd['PRODUVKA_TIME,SEC'] = duration.dt.total_seconds().astype('int64')

    return produv_upd


def update_chronom(df):
    """Sum the ``O2`` column for every ``NPLV``.

    Returns a frame with the columns ``NPLV`` and ``O2_SUM``, one row per
    distinct ``NPLV`` in ascending order.
    """
    o2_per_nplv = df.groupby('NPLV')['O2'].sum()
    return o2_per_nplv.reset_index(name='O2_SUM')


def update_lom(lom):
    """Total the ``VDL`` and ``VES`` columns for every ``NPLV``.

    Returns a frame with the columns ``NPLV``, ``VDL_SUM`` (sum of ``VDL``)
    and ``LOM_SUM`` (sum of ``VES``), one row per distinct ``NPLV`` in
    ascending order.
    """
    totals = lom.groupby('NPLV').agg(
        VDL_SUM=('VDL', 'sum'),
        LOM_SUM=('VES', 'sum'),
    )
    return totals.reset_index()

In [4]:
# Training set (used to fit the model)
gas_train = pd.read_csv('gas_train.csv')
chronom_train = pd.read_csv('chronom_train.csv', index_col = False)
chugun_train = pd.read_csv('chugun_train.csv')
lom_train = pd.read_csv('lom_train.csv')
plavki_train = pd.read_csv('plavki_train.csv')
produv_train = pd.read_csv('produv_train.csv')
sip_train = pd.read_csv('sip_train.csv')
# Test set (used to evaluate the model's accuracy)
gas_test = pd.read_csv('gas_test.csv')
chronom_test = pd.read_csv('chronom_test.csv', index_col = False)
chugun_test = pd.read_csv('chugun_test.csv')
lom_test = pd.read_csv('lom_test.csv')
plavki_test = pd.read_csv('plavki_test.csv')
produv_test = pd.read_csv('produv_test.csv')
sip_test = pd.read_csv('sip_test.csv')
# Target dataset
target_train = pd.read_csv('target_train.csv')
# Sample submission file
sample_submission = pd.read_csv('sample_submission.csv')

In [108]:
# Replace the `Time` column of gas_train with `time`: the same timestamp as a
# 'YYYY-MM-DD HH:MM:SS' string with the fractional seconds cut off.
# - `%f` accepts any fractional part, not only the literal ".437".
# - The conversion is vectorised instead of parsing row by row in Python.
# - The guard makes the cell safe to re-run once `Time` has been dropped.
if 'Time' in gas_train.columns:
    gas_train['time'] = pd.to_datetime(
        gas_train['Time'], format='%Y-%m-%d %H:%M:%S.%f'
    ).dt.strftime('%Y-%m-%d %H:%M:%S')
    gas_train = gas_train.drop(columns='Time')

In [109]:
# Disabled cell: timestamp truncation for gas_test (kept commented out).
#gas_test_time=[]
#for date in gas_test.Time: # iterate over the timestamp strings
#    dt = datetime.strptime(date,'%Y-%m-%d %H:%M:%S.437') # parse the string into a datetime
#    gas_test_time.append(dt.strftime('%Y-%m-%d %H:%M:%S'))
#gas_test['time'] = gas_test_time
#gas_test = gas_test.drop(columns='Time')

In [110]:
# Disabled cell: row-by-row parsing of chronom_train.VR_NACH into `data_nach` (kept commented out).
#chronom_train_time_nach=[]
#for date in chronom_train.VR_NACH: # iterate over the timestamp strings
#    dt = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') # parse the string into a datetime
#    chronom_train_time_nach.append(dt)
#chronom_train['data_nach'] = chronom_train_time_nach
#chronom_train = chronom_train.drop(columns='VR_NACH')
#chronom_train

In [111]:
# Disabled cell: row-by-row parsing of chronom_train.VR_KON into `data_kon` (kept commented out).
#chronom_train_time_kon=[]
#for date in chronom_train.VR_KON: # iterate over the timestamp strings
#    dt = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') # parse the string into a datetime
#    chronom_train_time_kon.append(dt)
#chronom_train['data_kon'] = chronom_train_time_kon
#chronom_train = chronom_train.drop(columns='VR_KON')
#chronom_train

In [112]:
# Disabled cell: row-by-row parsing of chugun_train.DATA_ZAMERA into `data` (kept commented out).
#chugun_train_time=[]
#for date in chugun_train.DATA_ZAMERA: # iterate over the timestamp strings
#    dt = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') # parse the string into a datetime
#    chugun_train_time.append(dt)
#chugun_train['data'] = chugun_train_time
#chugun_train = chugun_train.drop(columns='DATA_ZAMERA')
#chugun_train

In [13]:
# Preview gas_train.
# NOTE(review): the recorded output below still lists the original `Time` column,
# which the timestamp cell earlier in the notebook drops, so this output was
# produced before that cell ran. Restart the kernel and run all cells to refresh it.
gas_train

Unnamed: 0,NPLV,Time,V,T,O2,N2,H2,CO2,CO,AR,T фурмы 1,T фурмы 2,O2_pressure
0,510008,2021-01-01 03:08:11.437,218263.343750,262.847229,18.722993,80.132247,0.087755,0.163878,9.229025e-03,0.893243,0.000000,0.000000,13.085938
1,510008,2021-01-01 03:08:12.437,218263.343750,262.847229,18.732721,80.138406,0.087959,0.148980,8.390023e-03,0.892948,0.000000,0.000000,13.085938
2,510008,2021-01-01 03:08:13.437,218369.359375,262.152771,18.742449,80.144565,0.088163,0.134082,7.551021e-03,0.892653,0.000000,0.000000,13.085938
3,510008,2021-01-01 03:08:14.437,218475.359375,261.805573,18.752177,80.150724,0.088367,0.119184,6.712018e-03,0.892358,0.000000,0.000000,13.093172
4,510008,2021-01-01 03:08:15.437,218369.359375,260.763885,18.761905,80.156883,0.088571,0.104286,5.873016e-03,0.892063,0.000000,0.000000,13.093172
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6468013,512322,2021-04-26 18:48:35.437,207386.734375,118.402779,1.890000,97.220001,0.090000,0.050000,0.000000e+00,0.750000,23.900463,27.054343,14.424190
6468014,512322,2021-04-26 18:48:36.437,207051.609375,118.402779,2.710000,96.410004,0.080000,0.040000,0.000000e+00,0.770000,23.900463,27.054342,14.424190
6468015,512322,2021-04-26 18:48:37.437,207163.375000,118.402779,3.770000,95.349998,0.090000,0.030000,0.000000e+00,0.770000,23.900463,27.054341,14.438658
6468016,512322,2021-04-26 18:48:38.437,206827.906250,118.750000,3.400000,95.720001,0.090000,0.040000,0.000000e+00,0.750000,23.900463,27.054340,14.445890


In [26]:
# Parse VR_NACH, keep only the rows whose start year is after 2020, then
# replace chronom_train with the per-NPLV O2 totals from update_chronom.
chronom_train['VR_NACH'] = pd.to_datetime(
    chronom_train['VR_NACH'],
    format='%Y-%m-%d %H:%M:%S',
)
started_after_2020 = chronom_train['VR_NACH'].dt.year > 2020
chronom_train = update_chronom(chronom_train[started_after_2020])
chronom_train

In [71]:
# Replace the raw lom_train rows with the per-NPLV totals returned by update_lom.
# NOTE(review): the name is rebound to the aggregated frame, so running this cell
# a second time passes a frame without VDL/VES back into update_lom and fails.
lom_train = update_lom(lom_train)
lom_train

Unnamed: 0,NPLV,VDL_SUM,LOM_SUM
0,510008,21,19700
1,510009,25,78600
2,510010,48,76300
3,510011,25,84100
4,510012,25,76100
...,...,...,...
2058,512318,96,73600
2059,512319,48,76600
2060,512320,65,64200
2061,512321,25,66200


In [17]:
# Preview plavki_train as loaded from plavki_train.csv.
plavki_train

Unnamed: 0,NPLV,plavka_VR_NACH,plavka_VR_KON,plavka_NMZ,plavka_NAPR_ZAD,plavka_STFUT,plavka_TIPE_FUR,plavka_ST_FURM,plavka_TIPE_GOL,plavka_ST_GOL
0,510008,2021-01-01 03:08:11,2021-01-01 03:51:10,С255,МНЛЗ,971,цилиндрическая,11,5 сопловая,11
1,510009,2021-01-01 04:00:44,2021-01-01 05:07:28,С255,МНЛЗ,972,цилиндрическая,12,5 сопловая,12
2,510010,2021-01-01 05:12:29,2021-01-01 06:00:53,Ст3пс/Э,Изл,973,цилиндрическая,13,5 сопловая,13
3,510011,2021-01-01 06:13:48,2021-01-01 07:08:39,Св-08А.z02,Изл,974,цилиндрическая,14,5 сопловая,14
4,510012,2021-01-01 07:13:44,2021-01-01 08:01:59,SC2M/ЭТ,МНЛС,975,цилиндрическая,15,5 сопловая,15
...,...,...,...,...,...,...,...,...,...,...
2132,512318,2021-04-26 13:04:26,2021-04-26 13:55:50,C071TM.z01/ЭТ,МНЛС,3281,коническая,22,5 сопловая,56
2133,512319,2021-04-26 14:10:20,2021-04-26 15:14:23,C071TM.z01/ЭТ,МНЛС,3282,коническая,23,5 сопловая,57
2134,512320,2021-04-26 15:21:37,2021-04-26 16:16:42,40Х.1,МНЛЗ,3283,коническая,24,5 сопловая,58
2135,512321,2021-04-26 16:22:37,2021-04-26 17:23:37,40Х.1,МНЛЗ,3284,коническая,25,5 сопловая,59


In [19]:
# Count how many sip_train rows carry each distinct VDSYP value.
sip_train.VDSYP.value_counts()

408    19055
346     7583
442     2930
171     1624
104      226
397      164
119        2
Name: VDSYP, dtype: int64

In [5]:
# Preview produv_train; the recorded output shows several SEC/RAS/POL rows per NPLV.
produv_train

Unnamed: 0,NPLV,SEC,RAS,POL
0,510008,2021-01-01 03:18:26,382.000000,3.920000
1,510008,2021-01-01 03:18:28,382.000000,3.920000
2,510008,2021-01-01 03:18:30,553.000000,3.920000
3,510008,2021-01-01 03:18:32,701.000000,3.920000
4,510008,2021-01-01 03:18:34,813.000000,3.920000
...,...,...,...,...
4729797,512322,2021-05-05 16:30:46,363.996249,4.850045
4729798,512322,2021-05-05 16:30:48,363.996999,4.850036
4729799,512322,2021-05-05 16:30:50,363.997749,4.850027
4729800,512322,2021-05-05 16:30:52,363.998499,4.850018


In [43]:
# Preview target_train; the recorded output shows the columns NPLV, TST and C.
target_train

Unnamed: 0,NPLV,TST,C
0,510008,1690,0.060
1,510009,1683,0.097
2,510010,1662,0.091
3,510011,1609,0.410
4,510012,1682,0.120
...,...,...,...
2058,512318,1626,0.145
2059,512319,1643,0.087
2060,512320,1615,0.141
2061,512321,1654,0.270


In [44]:
# Preview sample_submission; the recorded output has the same NPLV, TST, C layout as target_train.
sample_submission

Unnamed: 0,NPLV,TST,C
0,512324,1640,0.0
1,512327,1640,0.0
2,512328,1640,0.0
3,512331,1640,0.0
4,512333,1640,0.0
...,...,...,...
775,513369,1640,0.0
776,513370,1640,0.0
777,513371,1640,0.0
778,513372,1640,0.0


In [123]:
# Build df_train by joining the prepared tables on the NPLV key.
#
# The chain starts from chronom_train so that df_train is created inside this
# cell and does not depend on a value left over in the kernel. Every call
# passes exactly one right-hand frame and joins on NPLV, the one key column
# that all the previewed tables share.
df_train = chronom_train.merge(chugun_train, how='right', on='NPLV')
df_train = df_train.merge(lom_train, how='right', on='NPLV')
df_train = df_train.merge(plavki_train, how='right', on='NPLV')
# NOTE(review): an inner join on NPLV multiplies rows when the right-hand table
# holds several rows per NPLV. The raw produv_train preview does, and sip_train
# presumably does too - confirm both are reduced to one row per NPLV
# (e.g. with update_produv) before this cell runs.
df_train = df_train.merge(produv_train, on='NPLV')
df_train = df_train.merge(sip_train, on='NPLV')
# NOTE(review): gas_train is not joined here. It is a per-timestamp table, so it
# needs a per-NPLV aggregation first - TODO decide which gas features to keep.