In [33]:
import numpy as np
import pandas as pd

import yaml
import json

import joblib

import warnings
warnings.filterwarnings('ignore')

In [34]:
config_path = '../config/params.yml'
# Open the config inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): FullLoader can construct more than plain mappings/scalars;
# if params.yml only holds plain data, yaml.safe_load would be the safer call.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Per-stage sections of the project configuration.
preproc = config['preprocessing']
training = config['train']
evaluate = config['evaluate']

# JSON file referenced by the preprocessing section
# (presumably the feature list saved at training time - confirm).
column_sequence_path = preproc['unique_values_path']
with open(column_sequence_path) as json_file:
    column_sequence = json.load(json_file)

In [35]:
# Read the dataset to be scored from the path given in the `evaluate` config
# section, then preview its first five rows.
df_test = pd.read_csv(evaluate['predict_path'])
df_test.head()

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure
0,ID_2MYNQS,1/1/19,PD03,45.289376,11.642394,3.277529,,-0.313361,0.771456,2.4e-05,7.5e-05,,14440.02819
1,ID_P4U5WU,1/1/19,TV03,45.836941,12.510362,0.0,,-0.229512,0.398208,2.3e-05,0.00012,,14434.0479
2,ID_U4KWPK,1/1/19,X5561,45.582894,8.842165,0.0,282.98,-0.470822,0.153694,2.3e-05,0.000171,0.000148,14427.42478
3,ID_QGSNTZ,1/1/19,X5953,45.131947,10.015742,1.928031,,0.132952,0.756917,2.4e-05,0.000266,,14443.09006
4,ID_GHSZ6K,1/1/19,X6701,45.186329,9.146666,0.0,,-0.198272,0.678858,2.3e-05,0.000149,,14440.8584


In [36]:
# Print column dtypes and non-null counts to see which features have gaps.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6576 entries, 0 to 6575
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID_Zindi            6576 non-null   object 
 1   Date                6576 non-null   object 
 2   ID                  6576 non-null   object 
 3   LAT                 6576 non-null   float64
 4   LON                 6576 non-null   float64
 5   Precipitation       6576 non-null   float64
 6   LST                 3595 non-null   float64
 7   AAI                 5708 non-null   float64
 8   CloudFraction       5708 non-null   float64
 9   NO2_strat           5708 non-null   float64
 10  NO2_total           5708 non-null   float64
 11  NO2_trop            3998 non-null   float64
 12  TropopausePressure  5708 non-null   float64
dtypes: float64(10), object(3)
memory usage: 668.0+ KB


# Preprocessing

In [37]:
def transform_types(data: pd.DataFrame,
                    change_type_columns: dict) -> pd.DataFrame:
    """
    Cast dataset columns to the requested data types.

    :param data: dataset
    :param change_type_columns: mapping of column name to target dtype
    :return: the result of ``DataFrame.astype``; a failed conversion raises
    """
    converted = data.astype(dtype=change_type_columns, errors='raise')
    return converted


def check_columns_evaluate(data: pd.DataFrame,
                           unique_values_path: str) -> pd.DataFrame:
    """
    Check that the dataset has exactly the features stored in the JSON file
    and reorder its columns to the stored order.

    :param data: test dataset
    :param unique_values_path: path to the JSON file whose keys are the
        expected feature names (key order defines the column order)
    :return: test dataset with columns ordered as in the JSON file
    :raises AssertionError: if the dataset columns and the stored feature
        names are not the same set
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # Materialize the names as a list: a plain list is an explicit, supported
    # column indexer, whereas a dict_keys view relies on implicit coercion.
    column_sequence = list(unique_values)

    expected = set(column_sequence)
    actual = set(data.columns)
    if expected != actual:
        # Raised explicitly (instead of a bare `assert`) so the check still
        # runs under `python -O`, and the message names the offending columns.
        raise AssertionError(
            'Разные признаки: '
            f'missing={sorted(expected - actual, key=str)}, '
            f'unexpected={sorted(actual - expected, key=str)}')
    return data[column_sequence]


def feature_engineering(data: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Feature engineering.

    Builds the derived features on a new frame, so the DataFrame passed in
    by the caller is left unmodified.

    :param data: dataset; ``Date`` must already be a datetime column because
        the ``.dt`` accessor is used on it
    :param kwargs: must contain ``drop_columns``; every entry except the
        first one is dropped after the new features are built
    :return: dataset with the new features added and the listed columns removed
    """
    engineered = data.assign(
        # split the date column into two features: month and year
        month=data['Date'].dt.month,
        year=data['Date'].dt.year,
        # ratio of the tropospheric to the stratospheric concentration
        NO2_ratio=data['NO2_trop'] / data['NO2_strat'],
        # total concentration feature
        Sum_Concentration=(
            data['NO2_strat'] + data['NO2_total'] + data['NO2_trop']),
        # convert Kelvin to degrees Celsius
        LST=data['LST'] - 273.15,
    )

    # remove the columns that are no longer needed; the first entry of
    # drop_columns is skipped here (pipeline_preprocess drops it beforehand)
    return engineered.drop(kwargs['drop_columns'][1:], axis=1)


def fillna_data(data: pd.DataFrame, list_median: list, list_mean: list) -> pd.DataFrame:
    """
    Fill missing values, using a different statistic per group of columns.

    The statistics are computed from ``data`` itself.
    NOTE(review): when scoring new data this means the fill values come from
    the scored dataset rather than from the training set - confirm intended.

    :param data: dataset
    :param list_median: columns whose gaps are filled with the column median
    :param list_mean: columns whose gaps are filled with the column mean
    :return: a copy of the dataset with the gaps filled; the input frame is
        left unmodified
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    filled = data.copy()

    # Median columns are processed first; a column listed in both groups is
    # therefore filled with its median.
    for column in list_median:
        filled[column] = filled[column].fillna(filled[column].median())

    for column in list_mean:
        filled[column] = filled[column].fillna(filled[column].mean())

    return filled

In [38]:
def pipeline_preprocess(data: pd.DataFrame, flg_evaluate: bool = True, **kwargs):
    """
    Run the whole preprocessing chain on a dataset.

    Steps: drop the first column listed in ``drop_columns``, validate/reorder
    the columns (evaluate) or store the train column info (train), cast
    dtypes, build features, fill gaps, and cast object columns to category.

    :param data: dataset
    :param flg_evaluate: flag for evaluate mode
    :param kwargs: preprocessing settings; the keys read here are
        ``drop_columns``, ``unique_values_path``, ``change_type_columns``,
        ``list_median``, ``list_mean`` and, in train mode, ``target_column``
    :return: final dataset
    """
    frame = data.drop(kwargs['drop_columns'][0], axis=1, errors='ignore')

    if not flg_evaluate:
        # NOTE(review): save_unique_for_train is not defined in the visible
        # part of this notebook - confirm it is available before using
        # flg_evaluate=False.
        save_unique_for_train(
            data=frame,
            drop_columns=kwargs['drop_columns'],
            target_column=kwargs['target_column'],
            unique_values_path=kwargs['unique_values_path'])
    else:
        frame = check_columns_evaluate(
            data=frame,
            unique_values_path=kwargs['unique_values_path'])

    frame = transform_types(
        data=frame,
        change_type_columns=kwargs['change_type_columns'])

    frame = feature_engineering(data=frame, **kwargs)

    frame = fillna_data(
        data=frame,
        list_median=kwargs['list_median'],
        list_mean=kwargs['list_mean'])

    # every remaining object column becomes a categorical column
    object_columns = frame.select_dtypes(['object']).columns
    category_types = dict.fromkeys(object_columns, 'category')
    return transform_types(data=frame, change_type_columns=category_types)

In [39]:
# Run the preprocessing pipeline in its default evaluate mode, using the
# `preprocessing` config section as keyword settings, then preview the result.
df_proc_test = pipeline_preprocess(data=df_test, **preproc)
df_proc_test.head()

Unnamed: 0,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,TropopausePressure,month,year,NO2_ratio,Sum_Concentration
0,PD03,45.289376,11.642394,3.277529,26.244019,-0.313361,0.771456,2.4e-05,14440.02819,1,2019,1.197568,0.000218
1,TV03,45.836941,12.510362,0.0,26.244019,-0.229512,0.398208,2.3e-05,14434.0479,1,2019,1.197568,0.000218
2,X5561,45.582894,8.842165,0.0,9.83,-0.470822,0.153694,2.3e-05,14427.42478,1,2019,6.462882,0.000342
3,X5953,45.131947,10.015742,1.928031,26.244019,0.132952,0.756917,2.4e-05,14443.09006,1,2019,1.197568,0.000218
4,X6701,45.186329,9.146666,0.0,26.244019,-0.198272,0.678858,2.3e-05,14440.8584,1,2019,1.197568,0.000218


# Evaluate

In [40]:
# NOTE: joblib.load unpickles the file, so only load model artifacts from a
# trusted source.
model = joblib.load(training['model_path'])

# Predict on the feature columns only. Dropping an already existing `predict`
# column keeps this cell re-runnable: without it, a second run would pass the
# previous predictions to the model as an extra feature.
df_proc_test['predict'] = model.predict(
    df_proc_test.drop(columns=['predict'], errors='ignore'))

In [41]:
# Preview the first rows, now including the added `predict` column.
df_proc_test.head()

Unnamed: 0,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,TropopausePressure,month,year,NO2_ratio,Sum_Concentration,predict
0,PD03,45.289376,11.642394,3.277529,26.244019,-0.313361,0.771456,2.4e-05,14440.02819,1,2019,1.197568,0.000218,21.063435
1,TV03,45.836941,12.510362,0.0,26.244019,-0.229512,0.398208,2.3e-05,14434.0479,1,2019,1.197568,0.000218,24.541353
2,X5561,45.582894,8.842165,0.0,9.83,-0.470822,0.153694,2.3e-05,14427.42478,1,2019,6.462882,0.000342,28.812387
3,X5953,45.131947,10.015742,1.928031,26.244019,0.132952,0.756917,2.4e-05,14443.09006,1,2019,1.197568,0.000218,18.551195
4,X6701,45.186329,9.146666,0.0,26.244019,-0.198272,0.678858,2.3e-05,14440.8584,1,2019,1.197568,0.000218,23.435742
