In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

## Загрузка данных и их разделение на обучающую и тестовые выборки

In [2]:
load = pd.read_csv('main_train.csv')
x = load.iloc[:, :-1]
y = load.iloc[:, -1]
x = x.iloc[:, 1:]
x = x.fillna(0)

In [3]:
test_size = round(y.size * 0.3)
test_data = x.sample(test_size)
train_data = x.drop(x.sample(test_size).index, axis=0)
test_y = y[test_data.index].to_numpy()
train_y = y[train_data.index].to_numpy()

# Linear Regression

In [98]:
lin_reg = LinearRegression().fit(train_data, train_y)

In [99]:
sum(abs(np.round(lin_reg.predict(test_data)) - test_y))/ test_y.size

6.242801890846583

In [100]:
pkl_filename = './models/linear_regression.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(lin_reg, file)

# Catboost  
### Сначала построим простую модель и оценим важность признаков

In [109]:
catboost_reg = CatBoostRegressor(iterations=200, loss_function='MAE', depth=10, learning_rate=0.03, l2_leaf_reg=3)
catboost_reg = catboost_reg.fit(train_data, train_y, verbose=False)

In [110]:
sum(abs(catboost_reg.predict(test_data) - test_y)) / test_y.size

10.490639818532506

In [111]:
feature_importance = catboost_reg.get_feature_importance()
features = catboost_reg.feature_names_
deleted = []

for feature_id in feature_importance.argsort()[::-1]:
    name = features[feature_id]
    importance = feature_importance[feature_id]
    if importance < 1.1:
        deleted.append(name)
    print(name, importance)
deleted

Water_(g) 19.293182773846915
FA_Sat_(g) 7.187742664160179
Carbohydrt_(g) 6.681301155310559
Lipid_Tot_(g) 6.514616022475402
Iron_(g) 6.408420050314266
FA_Poly_(g) 5.758381829261636
FA_Mono_(g) 5.094183759140599
Sugar_Tot_(g) 4.421599430752451
Sodium_(g) 4.361119788563146
Ash_(g) 3.2019912643230977
Phosphorus_(g) 2.8335159196605626
Protein_(g) 2.287209637254527
Vit_E_(g) 1.986535322731249
Panto_Acid_g) 1.8066954860500115
Calcium_(g) 1.7321744834939543
Niacin_(g) 1.729228057534088
Zinc_(g) 1.505581852528653
Vit_B6_(g) 1.4893884587193664
Copper_g) 1.3179765987894478
Manganese_(g) 1.305309829946572
Thiamin_(g) 1.1943247042570133
Riboflavin_(g) 1.1329610725822232
Vit_C_(g) 1.1291127173706155
Potassium_(g) 1.0650327596985667
Magnesium_(g) 0.9533679994478897
Selenium_(g) 0.9317544604769931
Vit_B12_(g) 0.8937169468787429
Vit_K_(g) 0.7720899460037751
Cholestrl_(g) 0.5946589344492802
Vit_A(g) 0.5751802482263161
Beta_Carot_(g) 0.5683246749661328
Fiber_TD_(g) 0.5239110536982945
Folic_Acid_(g) 0.428

['Potassium_(g)',
 'Magnesium_(g)',
 'Selenium_(g)',
 'Vit_B12_(g)',
 'Vit_K_(g)',
 'Cholestrl_(g)',
 'Vit_A(g)',
 'Beta_Carot_(g)',
 'Fiber_TD_(g)',
 'Folic_Acid_(g)',
 'Retinol_(g)',
 'Choline_Tot_ (g)',
 'Folate_Tot_(g)',
 'Food_Folate_(g)',
 'Lut+Zea_ (g)',
 'Beta_Crypt_(g)',
 'Vit_D_g',
 'Alpha_Carot_(g)',
 'Lycopene_(g)']

In [88]:
test_data_del = test_data.drop(deleted, axis=1)
train_data_del = train_data.drop(deleted, axis=1)

### Воспользуемся несколькими моделями и подберем к ним гиперпараметры

## Catboost

In [119]:
grid_search = pd.DataFrame(columns=['test_mae', 'train_mae', 'learning_rate', 'depth', 'regularization'])
learning_rate = [0.01, 0.03, 0.1, 0.3]
depth = [4, 6, 8, 10, 12]
l2_leaf_reg = [1, 3, 5, 7, 9]
n = 0
for lr in learning_rate:
    for d in depth:
        for l in l2_leaf_reg:
            print(n)
            m = CatBoostRegressor(iterations=1000, loss_function='MAE', depth=d, learning_rate=lr, l2_leaf_reg=l)
            m = m.fit(train_data, train_y, verbose=False)
            grid_search.loc[n] = [sum(abs(m.predict(test_data) - test_y)) / test_y.size, sum(abs(m.predict(train_data) - train_y)) / train_y.size, lr, d, l]
            n += 1
grid_search.head()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


Unnamed: 0,test_mae,train_mae,learning_rate,depth,regularization
0,9.854686,9.397392,0.01,4.0,1.0
1,10.048356,9.630391,0.01,4.0,3.0
2,9.905512,9.540087,0.01,4.0,5.0
3,10.091227,9.648979,0.01,4.0,7.0
4,10.03479,9.571564,0.01,4.0,9.0


In [127]:
grid_search.sort_values(by='test_mae').head()

Unnamed: 0,test_mae,train_mae,learning_rate,depth,regularization
49,4.319072,1.200481,0.03,12.0,9.0
48,4.404898,1.278511,0.03,12.0,7.0
47,4.492026,1.209247,0.03,12.0,5.0
42,4.711015,2.341839,0.03,10.0,5.0
46,4.720463,1.320972,0.03,12.0,3.0


## RandomForest

In [120]:
grid_search_randfor = pd.DataFrame(columns=['test_mae', 'train_mae', 'estimators', 'depth'])
estimators = [100, 200, 50]
depth = [5, 8, 10, None]
n = 0
for est in estimators:
    for d in depth:
        print(n)
        m = RandomForestRegressor(n_estimators=est, max_depth=d, criterion='mae').fit(train_data, train_y)
        grid_search_randfor.loc[n] = [sum(abs(m.predict(test_data) - test_y)) / test_y.size, sum(abs(m.predict(train_data) - train_y)) / train_y.size, est, d]
        n += 1
grid_search_randfor.head()

0
1
2
3
4
5
6
7
8
9
10
11


Unnamed: 0,test_mae,train_mae,estimators,depth
0,12.759525,12.085105,100.0,5.0
1,6.307465,5.628604,100.0,8.0
2,4.830413,3.954255,100.0,10.0
3,3.54954,2.294258,100.0,
4,12.648161,12.002992,200.0,5.0


In [126]:
grid_search_randfor.sort_values(by='test_mae').head()

Unnamed: 0,test_mae,train_mae,estimators,depth
7,3.487013,2.265265,200.0,
3,3.54954,2.294258,100.0,
11,3.59933,2.350448,50.0,
6,4.785035,3.907743,200.0,10.0
2,4.830413,3.954255,100.0,10.0


## Neural Net

In [13]:
grid_search_nn = pd.DataFrame(columns=['test_mae', 'train_mae', 'neurons', 'epochs', 'lr_start'])
neurons = [20, 40, 60]
epochs = [50, 100, 150, 200]
learning_rate = [0.3, 0.1, 0.03, 0.01]
n = 0
for neur in neurons:
    for ep in epochs:
        for ler in learning_rate:
            print(n)
            model = Sequential()
            model.add(Dense(neur, input_shape=(train_data.shape[1],)))
            model.add(Activation('relu'))
            model.add(Dense(neur))
            model.add(Activation('relu'))
            model.add(Dense(neur))
            model.add(Activation('relu'))
            model.add(Dense(1, activation='relu'))
            model.compile(optimizer=Adam(lr=ler), loss="mae", metrics=['accuracy'])
            annealer = LearningRateScheduler(lambda x: ler * 0.95 ** x)
            model.fit(train_data, train_y, batch_size=32, epochs = ep, callbacks=[annealer], validation_data=(test_data, test_y), verbose=0)
            model.save('nn-' + '-'.join([str(neur), str(ep), str(ler)]))
            grid_search_nn.loc[n] = [sum(abs(model.predict(test_data)[:, 0] - test_y)) / test_y.size, sum(abs(model.predict(train_data)[:, 0] - train_y)) / train_y.size, neur, ep, ler]
            n += 1
grid_search_nn.head()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47


Unnamed: 0,test_mae,train_mae,neurons,epochs,lr_start
0,228.441341,223.049733,20.0,50.0,0.3
1,228.441341,223.049733,20.0,50.0,0.1
2,6.125248,5.796064,20.0,50.0,0.03
3,5.387049,5.004012,20.0,50.0,0.01
4,228.441341,223.049733,20.0,100.0,0.3


In [15]:
grid_search_nn.sort_values(by='test_mae').head()

Unnamed: 0,test_mae,train_mae,neurons,epochs,lr_start
43,4.808565,4.433323,60.0,150.0,0.01
47,4.888547,4.501442,60.0,200.0,0.01
46,4.91694,4.573749,60.0,200.0,0.03
42,4.964117,4.613084,60.0,150.0,0.03
30,4.974232,4.590109,40.0,200.0,0.03


Аналогично посмотрим для даты без нескольких столбцов (нейронную сеть и catboost можно не смотреть, так как у них результаты значительно хуже случайного леса)

## Random Forest

In [31]:
grid_search_randfor = pd.DataFrame(columns=['test_mae', 'train_mae', 'estimators', 'depth'])
estimators = [100, 200, 50]
depth = [5, 8, 10, None]
n = 0
for est in estimators:
    for d in depth:
        print(n)
        m = RandomForestRegressor(n_estimators=est, max_depth=d, criterion='mae').fit(train_data_del, train_y)
        grid_search_randfor.loc[n] = [sum(abs(m.predict(test_data_del) - test_y)) / test_y.size, sum(abs(m.predict(train_data_del) - train_y)) / train_y.size, est, d]
        n += 1
grid_search_randfor.head()

0
1
2
3
4
5
6
7
8
9
10
11


Unnamed: 0,test_mae,train_mae,estimators,depth
0,12.440531,12.21434,100.0,5.0
1,6.36737,5.810796,100.0,8.0
2,4.898227,4.070296,100.0,10.0
3,3.594615,2.3945,100.0,
4,12.435909,12.193958,200.0,5.0


In [32]:
grid_search_randfor.sort_values(by='test_mae')

Unnamed: 0,test_mae,train_mae,estimators,depth
7,3.557549,2.394976,200.0,
3,3.594615,2.3945,100.0,
11,3.631332,2.48576,50.0,
6,4.868574,4.064823,200.0,10.0
2,4.898227,4.070296,100.0,10.0
10,4.99807,4.12592,50.0,10.0
5,6.315689,5.780162,200.0,8.0
9,6.35911,5.865867,50.0,8.0
1,6.36737,5.810796,100.0,8.0
4,12.435909,12.193958,200.0,5.0


Так как качество модели ухудшилось совсем на незначительное значение, то мы можем ее спокойно использовать

# Сохранение предсказаний

In [None]:
test = pd.read_csv('main_test.csv')
test = test.iloc[:, 1:]
test = test.fillna(0).drop(deleted, axis=1)
final_model = RandomForestRegressor(n_estimators=200, criterion='mae').fit(train_data_del, train_y)
pkl_filename = './models/random_forest.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(final_model, file)
final_res = np.round(final_model.predict(test)).astype(np.int64)
pd.DataFrame(final_res, columns=['Pred_kcal']).to_csv('Pred_main.csv', index=False)