In [134]:
from sklearn.metrics import auc, roc_auc_score, roc_curve, recall_score, precision_score
import warnings
from itertools import combinations
from sklearn.metrics import confusion_matrix, cohen_kappa_score, f1_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, MissingIndicator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from pandas import concat
from datetime import datetime, timedelta
%matplotlib inline


warnings.filterwarnings('ignore')

In [135]:
# Зафиксируем RANDOM_SEED, чтобы эксперименты были воспроизводимы:
RANDOM_SEED = 41

# Предобработка

In [173]:
def update_produv(produv_train):
    
    produv_start=pd.pivot_table(produv_train,index='NPLV',values='SEC', aggfunc={'min'}).reset_index()
    produv_end=pd.pivot_table(produv_train,index='NPLV',values='SEC', aggfunc={'max'}).reset_index()
    ras_med=pd.pivot_table(produv_train,index='NPLV',values='RAS', aggfunc={'mean'}).reset_index()
    ras_sum=pd.pivot_table(produv_train,index='NPLV',values='RAS', aggfunc={'sum'}).reset_index()
    pol_start=pd.pivot_table(produv_train,index='NPLV',values='POL', aggfunc={'max'}).reset_index()
    pol_end=pd.pivot_table(produv_train,index='NPLV',values='POL', aggfunc={'min'}).reset_index()
    
    produv_upd=pd.merge(produv_start,produv_end, how='left',on='NPLV')
    produv_upd=pd.merge(produv_upd,ras_sum, how='left',on='NPLV')
    produv_upd=pd.merge(produv_upd,pol_start, how='left',on='NPLV')
    produv_upd=pd.merge(produv_upd,pol_end, how='left',on='NPLV')
    
    #df.rename(columns = {list(df)[1]:'new_name'}, inplace=True)
    
    produv_upd.rename(columns={list(produv_upd)[1]: 'PRODUVKA_START'}, inplace=True)
    produv_upd.rename(columns={list(produv_upd)[2]: 'PRODUVKA_END'}, inplace=True)
    produv_upd.rename(columns={list(produv_upd)[3]: 'RASHOD_SUM'}, inplace=True)
    produv_upd.rename(columns={list(produv_upd)[4]: 'POL_MAX'}, inplace=True)
    produv_upd.rename(columns={list(produv_upd)[5]: 'POL_MIN'}, inplace=True)
    
    produv_upd['PRODUVKA_TIME']=produv_upd['PRODUVKA_END']-produv_upd['PRODUVKA_START']
    produv_upd['PRODUVKA_TIME,SEC']=produv_upd['PRODUVKA_TIME'].dt.seconds

    
    return produv_upd


def update_gas(gas_train):
    V_finish=gas_train[['NPLV','Time','V']].sort_values(by=['NPLV','Time'], ascending=[True,False]).drop_duplicates('NPLV')
    V_finish=V_finish.drop('Time', axis=1)
    V_start=gas_train[['NPLV','Time','V']].sort_values(by=['NPLV','Time'], ascending=[True,True]).drop_duplicates('NPLV')
    V_start=V_start.drop('Time', axis=1)
    T_finish=gas_train[['NPLV','Time','T']].sort_values(by=['NPLV','Time'], ascending=[True,False]).drop_duplicates('NPLV')
    T_finish=T_finish.drop('Time', axis=1)
    T_start=gas_train[['NPLV','Time','T']].sort_values(by=['NPLV','Time'], ascending=[True,True]).drop_duplicates('NPLV')
    T_start=T_start.drop('Time', axis=1)
    O2_sum=pd.pivot_table(gas_train,index='NPLV',values='O2', aggfunc={'sum'}).reset_index()
    N2_sum=pd.pivot_table(gas_train,index='NPLV',values='N2', aggfunc={'sum'}).reset_index()
    H2_sum=pd.pivot_table(gas_train,index='NPLV',values='H2', aggfunc={'sum'}).reset_index()
    CO2_sum=pd.pivot_table(gas_train,index='NPLV',values='CO2', aggfunc={'sum'}).reset_index()
    CO_sum=pd.pivot_table(gas_train,index='NPLV',values='CO', aggfunc={'sum'}).reset_index()
    AR_sum=pd.pivot_table(gas_train,index='NPLV',values='AR', aggfunc={'sum'}).reset_index()
    TF1_start=gas_train[['NPLV','Time','T фурмы 1']].sort_values(by=['NPLV','Time'], ascending=[True,True]).drop_duplicates('NPLV')
    TF1_start=TF1_start.drop('Time', axis=1)
    TF1_end=gas_train[['NPLV','Time','T фурмы 1']].sort_values(by=['NPLV','Time'], ascending=[True,False]).drop_duplicates('NPLV')
    TF1_end=TF1_end.drop('Time', axis=1)
    TF2_start=gas_train[['NPLV','Time','T фурмы 2']].sort_values(by=['NPLV','Time'], ascending=[True,True]).drop_duplicates('NPLV')
    TF2_start=TF2_start.drop('Time', axis=1)
    TF2_end=gas_train[['NPLV','Time','T фурмы 2']].sort_values(by=['NPLV','Time'], ascending=[True,False]).drop_duplicates('NPLV')
    TF2_end=TF2_end.drop('Time', axis=1)
    O2_pressure_start=gas_train[['NPLV','Time','O2_pressure']].sort_values(by=['NPLV','Time'], ascending=[True,True]).drop_duplicates('NPLV')
    O2_pressure_start=O2_pressure_start.drop('Time', axis=1)
    O2_pressure_end=gas_train[['NPLV','Time','O2_pressure']].sort_values(by=['NPLV','Time'], ascending=[True,False]).drop_duplicates('NPLV')
    O2_pressure_end=O2_pressure_end.drop('Time', axis=1)
    O2_pressure_max=pd.pivot_table(gas_train,index='NPLV',values='O2_pressure', aggfunc={'max'}).reset_index()
    O2_pressure_min=pd.pivot_table(gas_train,index='NPLV',values='O2_pressure', aggfunc={'min'}).reset_index()
    time_start=pd.pivot_table(gas_train,index='NPLV',values='Time', aggfunc={'min'}).reset_index()
    time_end=pd.pivot_table(gas_train,index='NPLV',values='Time', aggfunc={'max'}).reset_index()
    tmp_1=pd.merge(time_start, time_end, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, V_start, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, V_finish, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, T_start, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, T_finish, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, O2_sum, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, N2_sum, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, H2_sum, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, CO2_sum, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, CO_sum, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, AR_sum, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, TF1_start, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, TF1_end, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, TF2_start, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, TF2_end, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, O2_pressure_start, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, O2_pressure_end, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, O2_pressure_max, how='left', on='NPLV')
    tmp_1=pd.merge(tmp_1, O2_pressure_min, how='left', on='NPLV')
    tmp_1.rename(columns={list(tmp_1)[1]:'GAS_START'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[2]: 'GAS_FINISH'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[3]: 'V_START'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[4]: 'V_FINISH'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[5]: 'T_START'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[6]: 'T_FINISH'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[7]: 'O2_SUM'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[8]: 'N2_SUM'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[9]: 'H2_SUM'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[10]: 'CO2_SUM'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[11]: 'CO_SUM'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[12]: 'AR_SUM'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[13]: 'TF1_START'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[14]: 'TF1_FINISH'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[15]: 'TF2_START'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[16]: 'TF2_FINISH'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[17]: 'O2_PRESSURE_START'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[18]: 'O2_PRESSURE_FINISH'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[19]: 'O2_PRESSURE_MAX'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[20]: 'O2_PRESSURE_MIN'}, inplace=True)
    tmp_1['GAS_TIME']=tmp_1['GAS_FINISH']-tmp_1['GAS_START']
    tmp_1['GAS_TIME,SEC']=tmp_1['GAS_TIME'].dt.seconds
    return tmp_1


def update_sip(sip_train):
    sip_time_start=sip_train[['NPLV','DAT_OTD']].sort_values(by=['NPLV','DAT_OTD'], ascending=[True,True]).drop_duplicates('NPLV')
    sip_time_finish=sip_train[['NPLV','DAT_OTD']].sort_values(by=['NPLV','DAT_OTD'], ascending=[True,False]).drop_duplicates('NPLV')
    sip_sostav=pd.pivot_table(sip_train,index='NPLV',columns='VDSYP',values='VSSYP', aggfunc={'sum'})
    tmp_1=pd.merge(sip_sostav, sip_time_start, how='left',on='NPLV')
    tmp_1=pd.merge(tmp_1, sip_time_finish, how='left',on='NPLV')
    tmp_1.rename(columns={list(tmp_1)[1]:'TOTAL_104'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[2]:'TOTAL_119'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[3]:'TOTAL_171'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[4]:'TOTAL_346'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[5]:'TOTAL_397'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[6]:'TOTAL_408'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[7]:'TOTAL_442'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[8]:'SIP_START'}, inplace=True)
    tmp_1.rename(columns={list(tmp_1)[9]:'SIP_FINISH'}, inplace=True)
    tmp_1['SIP_TIME']=tmp_1['SIP_FINISH']-tmp_1['SIP_START']
    tmp_1['SIP_TIME,SEC']=tmp_1['SIP_TIME'].dt.seconds
    return tmp_1

def update_chronom(df):
    
    O2_sum=pd.pivot_table(df,index='NPLV',values='O2', aggfunc={'sum'}).reset_index()
        
    O2_sum.rename(columns={list(O2_sum)[1]: 'O2_SUM'}, inplace=True)      
   
    return O2_sum


def update_lom(lom):
    
    vdl_sum=pd.pivot_table(lom,index='NPLV',values='VDL', aggfunc={'sum'}).reset_index()
    lom_sum=pd.pivot_table(lom,index='NPLV',values='VES', aggfunc={'sum'}).reset_index()
    
    lom_upd=pd.merge(vdl_sum, lom_sum, how='left',on='NPLV')
        
    #df.rename(columns = {list(df)[1]:'new_name'}, inplace=True)
    
    lom_upd.rename(columns={list(lom_upd)[1]: 'VDL_SUM'}, inplace=True)
    lom_upd.rename(columns={list(lom_upd)[2]: 'LOM_SUM'}, inplace=True)
        
    return lom_upd


def duplicate_data(df):
    '''
    Функция для поиска дубликатов.
    '''
    if len(df) > len(df.drop_duplicates()):
        print('Имеются дубликаты')
        display(df[df.duplicated()])
    else:
        print('Дубликатов не обнаружено')

In [188]:
path= 'C:/Users/Артемий/Python/EVRAZ_AI/data_task1/'

names=['chronom','chugun','gas','lom','plavki','produv','sip']

chrom_train=pd.read_csv(path+names[0]+'_train.csv')
chugun_train=pd.read_csv(path+names[1]+'_train.csv')
gas_train=pd.read_csv(path+names[2]+'_train.csv')
lom_train=pd.read_csv(path+names[3]+'_train.csv')
plavki_train=pd.read_csv(path+names[4]+'_train.csv')
produv_train=pd.read_csv(path+names[5]+'_train.csv')
sip_train=pd.read_csv(path+names[6]+'_train.csv')

chrom_test=pd.read_csv(path+names[0]+'_test.csv')
chugun_test=pd.read_csv(path+names[1]+'_test.csv')
gas_test=pd.read_csv(path+names[2]+'_test.csv')
lom_test=pd.read_csv(path+names[3]+'_test.csv')
plavki_test=pd.read_csv(path+names[4]+'_test.csv')
produv_test=pd.read_csv(path+names[5]+'_test.csv')
sip_test=pd.read_csv(path+names[6]+'_test.csv')

target_train=pd.read_csv(path+'target_train.csv')

In [174]:
gas_train['Time']=pd.to_datetime(gas_train['Time'], format='%Y-%m-%d %H:%M:%S')
gas_train_upd=update_gas(gas_train)
sip_train['DAT_OTD']=pd.to_datetime(sip_train['DAT_OTD'], format='%Y-%m-%d %H:%M:%S')
sip_train_upd=update_sip(sip_train)
chronom_train['VR_NACH']=pd.to_datetime(chronom_train['VR_NACH'], format='%Y-%m-%d %H:%M:%S')
chronom_train = chronom_train[chronom_train['VR_NACH'].dt.year > 2020]
chronom_train_upd = update_chronom(chronom_train)
produv_train['SEC']=pd.to_datetime(produv_train['SEC'], format='%Y-%m-%d %H:%M:%S')
produv_train_upd=update_produv(produv_train)
lom_train_upd = update_lom(lom_train)
plavki_train = plavki_train.drop_duplicates(subset = ['NPLV', 'plavka_STFUT'])

In [163]:
gas_test['Time']=pd.to_datetime(gas_test['Time'], format='%Y-%m-%d %H:%M:%S')
gas_test_upd=update_gas(gas_test)
sip_test['DAT_OTD']=pd.to_datetime(sip_test['DAT_OTD'], format='%Y-%m-%d %H:%M:%S')
sip_test_upd=update_sip(sip_test)
chronom_test['VR_NACH']=pd.to_datetime(chronom_test['VR_NACH'], format='%Y-%m-%d %H:%M:%S')
chronom_test = chronom_test[chronom_test['VR_NACH'].dt.year > 2020]
chronom_test_upd = update_chronom(chronom_test)
produv_test['SEC']=pd.to_datetime(produv_test['SEC'], format='%Y-%m-%d %H:%M:%S')
produv_test_upd=update_produv(produv_test)
lom_test_upd = update_lom(lom_test)
plavki_test = plavki_test.drop_duplicates(subset = ['NPLV', 'plavka_STFUT'])

AttributeError: Can only use .dt accessor with datetimelike values

In [175]:
df_train = pd.merge(chugun_train, gas_train_upd, on = ['NPLV'])
df_train = df_train.merge(chronom_train_upd, how='right', on = 'NPLV')
df_train = df_train.merge(produv_train_upd, how='right', on = 'NPLV')
df_train = df_train.merge(lom_train_upd, how='right', on = 'NPLV')
df_train = df_train.merge(sip_train_upd, how='right', on = 'NPLV')
df_train = df_train.merge(plavki_train, how='right', on = 'NPLV')
df_train = df_train.drop(df.columns[[33, 41, 54]], axis=1)
df_train.fillna(0, inplace=True)

In [187]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2063 entries, 0 to 2062
Data columns (total 62 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   NPLV                2063 non-null   int64         
 1   VES                 2063 non-null   float64       
 2   T                   2063 non-null   float64       
 3   SI                  2063 non-null   float64       
 4   MN                  2063 non-null   float64       
 5   S                   2063 non-null   float64       
 6   P                   2063 non-null   float64       
 7   CR                  2063 non-null   float64       
 8   NI                  2063 non-null   float64       
 9   CU                  2063 non-null   float64       
 10  V                   2063 non-null   float64       
 11  TI                  2063 non-null   float64       
 12  DATA_ZAMERA         2063 non-null   object        
 13  GAS_START           2063 non-null   datetime64[n

In [None]:
df_test = pd.merge(chugun_test, gas_upd, on = ['NPLV'])
df_test = df_test.merge(chronom_upd, how='right', on = 'NPLV')
df_test = df_test.merge(produv_upd, how='right', on = 'NPLV')
df_test = df_test.merge(lom_upd, how='right', on = 'NPLV')
df_test = df_test.merge(sip_upd, how='right', on = 'NPLV')
df_test = df_test.merge(plavki_train, how='right', on = 'NPLV')
df_test.update(df_test.select_dtypes('float').fillna(0, inplace=True))
df_test = df_test.drop(df.columns[[33, 41, 54]], axis=1)
df_test.fillna(0, inplace=True)