In [39]:
def get_smoothed_log_mape_column_value(responses_column, answers_column, epsilon):
    return np.abs(np.log(
        (responses_column + epsilon)
        / (answers_column + epsilon)
    )).mean()


def get_smoothed_mean_log_accuracy_ratio(answers, responses, epsilon=0.005):  # answers - правильные ответы
    log_accuracy_ratio_mean = np.array(
        [
            get_smoothed_log_mape_column_value(responses.at_least_one, answers.at_least_one, epsilon),
            get_smoothed_log_mape_column_value(responses.at_least_two, answers.at_least_two, epsilon),
            get_smoothed_log_mape_column_value(responses.at_least_three, answers.at_least_three, epsilon),
        ]
    ).mean()

    percentage_error = 100 * (np.exp(log_accuracy_ratio_mean) - 1)

    return percentage_error.round(
        decimals=2
    )

In [1]:
import pandas as pd
import numpy as np

In [2]:
DF = "Data/top_validate.tsv"
VALIDATE_ANSWERS = "Data/validate_answers.tsv"

In [3]:
Y = pd.read_csv(VALIDATE_ANSWERS, sep='\t')

In [4]:
Y.head()

Unnamed: 0,at_least_one,at_least_two,at_least_three
0,0.043,0.0152,0.0073
1,0.013,0.0,0.0
2,0.0878,0.0135,0.0
3,0.2295,0.1295,0.0727
4,0.3963,0.2785,0.227


In [15]:
Y_1 = Y['at_least_one'].copy()
Y_2 = Y['at_least_two'].copy()
Y_3 = Y['at_least_three'].copy()

In [16]:
Y_1

0       0.0430
1       0.0130
2       0.0878
3       0.2295
4       0.3963
         ...  
1003    0.0679
1004    0.0165
1005    0.0071
1006    0.3669
1007    0.0710
Name: at_least_one, Length: 1008, dtype: float64

In [7]:
X = pd.read_csv(DF).astype('float32')
X.head()

Unnamed: 0,cpm,hour_start,hour_end,audience_size,count_unique_cities,time_interval,number_of_sex1,number_of_sex2,number_of_age1,number_of_age2,...,publisher_12,publisher_13,publisher_14,publisher_15,publisher_16,publisher_17,publisher_18,publisher_19,publisher_20,publisher_21
0,220.0,1058.0,1153.0,1906.0,7.0,95.0,1025.0,881.0,470.0,776.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,312.0,1295.0,1301.0,1380.0,372.0,6.0,735.0,645.0,428.0,508.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,70.0,1229.0,1249.0,888.0,280.0,20.0,473.0,415.0,234.0,330.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,240.0,1295.0,1377.0,440.0,7.0,82.0,240.0,200.0,119.0,187.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,262.0,752.0,990.0,1476.0,416.0,238.0,903.0,572.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
X.columns

Index(['cpm', 'hour_start', 'hour_end', 'audience_size', 'count_unique_cities',
       'time_interval', 'number_of_sex1', 'number_of_sex2', 'number_of_age1',
       'number_of_age2', 'number_of_age3', 'number_of_age4', 'number_of_age5',
       'number_of_age7', 'percentage_sex1', 'percentage_sex2',
       'number_of_publishers', 'publisher_1', 'publisher_2', 'publisher_3',
       'publisher_4', 'publisher_5', 'publisher_6', 'publisher_7',
       'publisher_8', 'publisher_9', 'publisher_10', 'publisher_11',
       'publisher_12', 'publisher_13', 'publisher_14', 'publisher_15',
       'publisher_16', 'publisher_17', 'publisher_18', 'publisher_19',
       'publisher_20', 'publisher_21'],
      dtype='object')

In [50]:
X = X.drop(['hour_start', 'hour_end'], axis=1)

In [51]:
x_lim=int(len(X)*0.80)

X_train=X[0:x_lim].copy()
Y_train_1=Y_1[0:x_lim]
Y_train_2=Y_2[0:x_lim]
Y_train_3=Y_3[0:x_lim]

X_test=X[x_lim:].copy()
Y_test_1=Y_1[x_lim:]
Y_test_2=Y_2[x_lim:]
Y_test_3=Y_3[x_lim:]

In [52]:
Y_train_1.shape, Y_test_1.shape

((806,), (202,))

In [53]:
Y_test = pd.concat([Y_test_1, Y_test_2, Y_test_3], axis=1)
Y_test = pd.DataFrame(np.array(Y_test), columns=['at_least_one', 'at_least_two', 'at_least_three']).astype('float32')

In [54]:
Y_test.head()

Unnamed: 0,at_least_one,at_least_two,at_least_three
0,0.0163,0.0041,0.0014
1,0.0201,0.0,0.0
2,0.1761,0.0954,0.0578
3,0.0211,0.0105,0.0039
4,0.1165,0.0059,0.0


In [55]:
from sklearn.preprocessing import MinMaxScaler

mms=MinMaxScaler()
X_train_mms=mms.fit_transform(X_train)
X_test_mms=mms.fit_transform(X_test)

print('Минимум и максимум обучающего набора: ', X_train_mms.min(), X_train_mms.max())

Минимум и максимум обучающего набора:  0.0 1.0000000000000002


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [56]:
from sklearn.ensemble import RandomForestRegressor
rf1 = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf2 = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf3 = RandomForestRegressor(n_estimators = 1000, random_state = 42)

rf1.fit(X_train_mms, Y_train_1)
predictions_1 = rf1.predict(X_test_mms)
rf2.fit(X_train_mms, Y_train_2)
predictions_2 = rf2.predict(X_test_mms)
rf3.fit(X_train_mms, Y_train_3)
predictions_3 = rf3.predict(X_test_mms)

In [57]:
Y_pesponse = pd.DataFrame([predictions_1, predictions_2, predictions_3]).astype('float32').T
Y_pesponse.columns=['at_least_one', 'at_least_two', 'at_least_three']

In [58]:
get_smoothed_mean_log_accuracy_ratio(Y_test, Y_pesponse)

62.43