In [2]:
# Essential libraries and settings for the notebook
%matplotlib inline
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 40)
data_path = Path('/home/devel/ML/selection_djakonov/')

In [None]:
# As we want to predict we should exclude data representing final rinse
f = open('/home/devel/ML/selection_djakonov/train_values.csv', 'r')
o = open('/home/devel/ML/selection_djakonov/train_values_nofinal.csv','w')
for line in f:
    if 'final_rinse' not in line:
        s = line
        o.write(s) 
f.close()
o.close()

In [None]:
# As we want to use train data to tune weights of algorithm we should make it similar test one 
data = pd.read_csv('/home/devel/ML/selection_djakonov/train_values_nofinal.csv')
data['process_phase'] = data.process_id.astype(str) + '_' + data.phase.astype(str)
process_phases = data.process_phase.unique()
rng = np.random.RandomState(100)
to_keep = rng.choice(process_phases, size = np.int(len(process_phases) * 0.8), replace = False)
data_limited = data[data.process_phase.isin(to_keep)]
data_limited.to_csv('/home/devel/ML/selection_djakonov/train_values_nofinal_liketest.csv')                                                                                                                                                                                        

In [3]:
# Loading training data for our model
train_values = pd.read_csv(data_path / 'train_values_nofinal_liketest.csv',
                           index_col = 0,
                           parse_dates = ['timestamp'])

train_labels = pd.read_csv(data_path / 'train_labels.csv',
                           index_col = 0)

In [4]:
# The should be like train_values in id
train_labels = train_labels.loc[train_values.process_id.unique()]

In [5]:
# Let`s take a look at data
train_values.head()

Unnamed: 0,row_id,process_id,object_id,phase,timestamp,pipeline,supply_flow,supply_pressure,return_temperature,return_conductivity,return_turbidity,return_flow,supply_pump,supply_pre_rinse,supply_caustic,return_caustic,supply_acid,return_acid,supply_clean_water,return_recovery_water,return_drain,object_low_level,tank_level_pre_rinse,tank_level_caustic,tank_level_acid,tank_level_clean_water,tank_temperature_pre_rinse,tank_temperature_caustic,tank_temperature_acid,tank_concentration_caustic,tank_concentration_acid,tank_lsh_caustic,tank_lsh_acid,tank_lsh_clean_water,tank_lsh_pre_rinse,target_time_period,process_phase
0,0,20001,405,pre_rinse,2018-04-15 04:20:47,L4,8550.348,0.615451,18.044704,4.990765,0.177228,15776.91,True,True,False,False,False,False,False,False,True,True,55.499672,41.555992,44.026875,49.474102,32.385708,83.03675,73.03241,45.394646,44.340126,False,0.0,False,0.0,False,20001_pre_rinse
1,1,20001,405,pre_rinse,2018-04-15 04:20:49,L4,11364.294,0.654297,18.229168,3.74968,0.122975,13241.464,True,True,False,False,False,False,False,False,True,True,55.48792,41.62417,44.045685,49.457645,32.385708,83.015045,73.03241,45.394447,44.33938,False,0.0,False,0.0,False,20001_pre_rinse
2,2,20001,405,pre_rinse,2018-04-15 04:20:51,L4,12174.479,0.69987,18.395544,2.783954,0.387008,10698.785,True,True,False,False,False,False,False,False,True,True,55.476166,41.638275,44.045685,49.46235,32.385708,83.015045,73.03241,45.39628,44.336735,False,0.0,False,0.0,False,20001_pre_rinse
3,3,20001,405,pre_rinse,2018-04-15 04:20:53,L4,13436.776,0.761502,18.583622,1.769353,0.213397,8007.8125,True,True,False,False,False,False,False,False,True,True,55.471466,41.647675,44.04803,49.46235,32.385708,83.03675,73.03241,45.401875,44.33311,False,0.0,False,0.0,False,20001_pre_rinse
4,4,20001,405,pre_rinse,2018-04-15 04:20:55,L4,13776.766,0.83724,18.627026,0.90402,0.148293,6004.051,True,True,False,False,False,False,False,False,True,True,55.459705,41.65473,44.04803,49.46235,32.385708,83.015045,73.03241,45.398197,44.334373,False,0.0,False,0.0,False,20001_pre_rinse


In [6]:
# Different statistics can help us realize data
train_values.describe()

Unnamed: 0,row_id,process_id,object_id,supply_flow,supply_pressure,return_temperature,return_conductivity,return_turbidity,return_flow,tank_level_pre_rinse,tank_level_caustic,tank_level_acid,tank_level_clean_water,tank_temperature_pre_rinse,tank_temperature_caustic,tank_temperature_acid,tank_concentration_caustic,tank_concentration_acid,tank_lsh_acid,tank_lsh_pre_rinse
count,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0,3571941.0
mean,2985055.0,24014.63,574.3699,23269.28,1.045687,59.13386,30.87263,3.70404,22198.25,52.80756,42.20547,44.13573,43.00557,28.5902,82.38858,72.57215,45.24451,44.42097,0.0,0.0
std,1730014.0,2317.472,345.8863,17206.63,1.354687,23.45633,19.96129,7.683539,17927.51,3.958271,3.046017,2.432799,6.552683,1.876694,2.331478,0.40917,0.8697421,1.087087,0.0,0.0
min,0.0,20001.0,102.0,-94814.82,-0.1145833,0.0,0.0,-0.3616898,-1251.447,0.0,0.0,27.78157,0.0,0.0,0.0,67.29962,25.69796,0.0,0.0,0.0
25%,1482635.0,22018.0,300.0,6622.54,0.08854166,32.50868,2.453276,0.4195602,4108.796,50.32046,40.70258,42.73148,40.89301,27.17375,82.25188,72.39583,45.12999,44.30475,0.0,0.0
50%,2994167.0,24064.0,420.0,23249.42,0.3760851,70.00868,42.81397,1.124855,21932.87,53.70822,42.47522,44.2032,44.14207,28.49754,82.53761,72.60561,45.29035,44.51433,0.0,0.0
75%,4477179.0,25994.0,934.0,33087.38,1.841363,78.88093,44.60889,2.466725,34848.81,55.95106,44.01982,45.67256,47.44756,29.88281,82.80165,72.82262,45.4526,44.74604,0.0,0.0
max,5987585.0,27989.0,977.0,103161.2,6.19401,96.97627,73.56589,100.9657,103139.5,58.34907,51.35489,52.44104,50.63549,37.78935,83.52864,73.9945,61.46095,65.24091,0.0,0.0


In [7]:
# We want to use all the data that`s why id and pipeline at work
def prep_metadata(df):
    
    # select process_id and pipeline
    metadata = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id') 
    
    # convert categorical pipeline data to dummy variables
    metadata = pd.get_dummies(metadata)
    
    # pipeline L12 not in test data!!!
    if 'L12' not in metadata.columns:
        metadata['pipeline_L12'] = 0
    
    # calculate number of phases for each process_object
    metadata['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())
    
    return metadata



In [8]:
# As we saw later we want to solve regression problem hence our data should be in appropriate conditions
def prep_time_series_features_for_regression(df, columns = None):
   
    need_cols = [
    'process_id',
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    ]
    
    if columns is None:
        columns = df.columns
    
    need_data = df[need_cols].set_index('process_id')
    
    # create features: min, max, mean, standard deviation, and mean of the last five observations
    need_features = need_data.groupby('process_id').agg(['min', 'max', 'mean', 'std', lambda x: x.tail(5).mean()])
    
    return need_features



In [9]:
# Generally speaking, I thought these data to be valuable but they are not
def perform_bool_information(df):
    
    need_cols = [
    'process_id',
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain'
    ]
    need_features = df[need_cols].set_index('process_id')
    
    need_features = need_features.groupby('process_id').agg(lambda x: x[lambda y : y == True].sum())
    
    return need_features

In [10]:
# This function is a union of three previuos ones
def create_feature_matrix(df):
    
    metadata = prep_metadata(df)
    
    time_series = prep_time_series_features_for_regression(df)
    
    bool_inf = perform_bool_information(df)
    
    # join metadata and time series features into a single dataframe
    feature_matrix = pd.concat([metadata, time_series], axis=1)# bool_inf does not help
    
    return feature_matrix

In [11]:
# Transform raw data into features for our future model
train_features = create_feature_matrix(train_values)

In [12]:
# Our metric here is not Least Squares but special  
def metrics(X, Y):
    Z = abs(X - Y)
    K = Y.applymap(lambda x: max(abs(x), 290000))
    Z = Z / K
    return ((Z.sum()) / Z.size).values

In [13]:
# Making train set and validation set to evaluate our results fairly
X_train, X_valid, Y_train, Y_valid = train_test_split(train_features, train_labels, random_state = 0)

In [14]:
# This model is one of the easiest that`s why we should try it, but our results are poor
from sklearn.neighbors import KNeighborsRegressor

KNR_model = KNeighborsRegressor(n_neighbors = 1)
KNR_model.fit(X_train, Y_train)
pred_KNR = KNR_model.predict(X_valid)
cr = metrics(pred_KNR, Y_valid)
print(cr)

[1.70690684]


In [15]:
# As we expected Tree methods usually work more regular and robust in different tasks, but it is not well then
from sklearn.tree import DecisionTreeRegressor

DT_model = DecisionTreeRegressor(max_leaf_nodes = 1500, max_depth = 20, random_state = 0)
DT_model.fit(X_train, Y_train)
pred_DT = DT_model.predict(X_valid)
pred_DT.reshape(-1, 1)

cr = metrics(pred_DT.reshape(-1, 1),Y_valid)
print(cr)

[1.60811307]


In [16]:
# Putting it mildly, it is one of the best models to tune and use, our results are increadible!
from sklearn.ensemble import GradientBoostingRegressor

GB_model = GradientBoostingRegressor(loss = 'lad', learning_rate = 0.1, n_estimators = 90, random_state = 0, max_depth = 6)
GB_model.fit(X_train, Y_train)
pred_GB = GB_model.predict(X_valid)
cr = metrics(pred_GB.reshape(-1, 1), Y_valid)
print(cr)

[0.40587567]


In [17]:
# Adjusting parametres is always very necessary 
GB_model = GradientBoostingRegressor(loss = 'lad', learning_rate = 0.1, n_estimators = 50, random_state = 0, max_depth = 5)
GB_model.fit(X_train, Y_train)
pred_GB = GB_model.predict(X_valid)
cr = metrics(pred_GB.reshape(-1, 1), Y_valid)
print(cr)

[0.39328043]


In [18]:
# However we cannot change parameters by hand hence loops can help us
minim = 1
i = 0.02
j = 20
while i <= 0.12:
    while j <= 120:
        GB_model = GradientBoostingRegressor(loss = 'lad', learning_rate = i, n_estimators = j, random_state = 0, max_depth = 5)
        GB_model.fit(X_train, Y_train)
        pred_GB = GB_model.predict(X_valid)
        cr = metrics(pred_GB.reshape(-1, 1), Y_valid)
        if cr < minim:
            minim = cr
            print(cr, ';i:', i, ';j:', j)
        j += 20
    j = 20   
    i += 0.02 
    
print('Stopped Process')     

[0.71146876] ;i: 0.02 ;j: 20
[0.59606305] ;i: 0.02 ;j: 40
[0.52426157] ;i: 0.02 ;j: 60
[0.47846577] ;i: 0.02 ;j: 80
[0.44863653] ;i: 0.02 ;j: 100
[0.42960834] ;i: 0.02 ;j: 120
[0.42535864] ;i: 0.04 ;j: 60
[0.40582901] ;i: 0.04 ;j: 80
[0.40226227] ;i: 0.04 ;j: 100
[0.39753303] ;i: 0.04 ;j: 120
[0.39576022] ;i: 0.06 ;j: 120
[0.39370191] ;i: 0.1 ;j: 60
[0.39088247] ;i: 0.1 ;j: 80
Stopped Process


In [19]:
# Let`s try a bit more models, however the need scaling to work well
from sklearn.preprocessing import Imputer, PolynomialFeatures, MinMaxScaler, RobustScaler, StandardScaler, Normalizer

scaler = Normalizer()
scaler.fit(train_features)
X = scaler.transform(train_features) 
X_train,X_valid,Y_train,Y_valid = train_test_split(X, train_labels, random_state = 0)

In [20]:
# This model is quite well, but not excellent in our notebook
from sklearn.neural_network import MLPRegressor

MLP_model = MLPRegressor(hidden_layer_sizes = (200, ), learning_rate_init = 0.001, alpha = 0.00003)
MLP_model.fit(X_train, Y_train)
pred_MLP = MLP_model.predict(X_valid)
cr = metrics(pred_MLP.reshape(-1, 1), Y_valid)
print(cr)

[0.85252112]


In [21]:
# May be the matter is our parameters, the answer is later no
minim = 1
m = 0.0001
for i in range(1, 300, 25):
    MLP_model = MLPRegressor(hidden_layer_sizes = (i, ), max_iter = 500, random_state = 0, alpha = 0.00003, learning_rate_init = 0.005)
    MLP_model.fit(X_train, Y_train)
    pred_MLP = MLP_model.predict(X_valid)
    cr = metrics(pred_MLP.reshape(-1, 1), Y_valid)
    if cr < minim:
        minim = cr
        print(cr, '; hl=', i)
               
print('Stopped Process')     

[0.84815456] ; hl= 1
[0.66224853] ; hl= 26
[0.62961491] ; hl= 51
Stopped Process


In [22]:
# Sometimes this method works much better than Boosting, but it is hardly to be tuned properly 
from sklearn.svm import NuSVR

SVR_model = NuSVR(nu = 0.999999, C = 0.5)
SVR_model.fit(X_train, Y_train)
pred_SVR = SVR_model.predict(X_valid)
cr = metrics(pred_SVR.reshape(-1, 1), Y_valid)
print(cr)

[0.8728878]


In [23]:
# Load the test data
test_values = pd.read_csv(data_path / 'test_values.csv',
                         index_col = 0,
                         parse_dates = ['timestamp'])

In [24]:
# Create metadata and time series features
test_features = create_feature_matrix(test_values)

In [25]:
# We use Boosting Trees to suggest our last solution
preds = GB_model.predict(test_features)

In [26]:
# We load rules of making submissions
submission_format = pd.read_csv(data_path / 'submission_format.csv', index_col = 0)

In [27]:
# Confirm everything is in the right order
assert np.all(test_features.index == submission_format.index)

In [28]:
# Our submission
my_submission = pd.DataFrame(data = preds,
                             columns = submission_format.columns,
                             index = submission_format.index)

In [29]:
# Our task is done!
my_submission.to_csv('submission20.csv')