# Импорты

In [8]:
import warnings
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier, BaggingRegressor, \
    GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
warnings.filterwarnings('ignore')

In [9]:
parameters_ensemble = {'n_estimators': np.arange(20,101,20),
                       'max_features': np.arange(3,24,10)}

# Классификация

In [10]:
def print_classification_model_metrics(estimator, y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(estimator.score(X_test, y_test))

In [11]:
movie_data = pd.read_csv("../data/CSM_snake_case.csv")
movie_data.drop('Movie', 1, inplace=True)

In [12]:
x_axis = movie_data.drop('Year', 1).values
y_axis = movie_data['Year'].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(x_axis, y_axis, test_size=0.2, stratify=y_axis)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Базовая модель DTC

Анализ базовой модели

In [15]:
%%time
parameters_dtc = {'max_depth': np.arange(5,16,1)}
dtc = DecisionTreeClassifier()
dtc_base = GridSearchCV(dtc, parameters_dtc).fit(X_train, y_train)
dtc_base.best_params_

Wall time: 221 ms


{'max_depth': 5}

In [16]:
print_classification_model_metrics(dtc_base, y_test, dtc_base.predict(X_test))

[[25  8]
 [ 9  5]]
              precision    recall  f1-score   support

      2014.0       0.74      0.76      0.75        33
      2015.0       0.38      0.36      0.37        14

    accuracy                           0.64        47
   macro avg       0.56      0.56      0.56        47
weighted avg       0.63      0.64      0.63        47

0.6382978723404256


## Изучение модели BaggingClassifier

In [17]:
%%time
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=dtc_base.best_params_['max_depth']))
model = GridSearchCV(bag, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 4.68 s


{'max_features': 3, 'n_estimators': 60}

In [18]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[31  2]
 [12  2]]
              precision    recall  f1-score   support

      2014.0       0.72      0.94      0.82        33
      2015.0       0.50      0.14      0.22        14

    accuracy                           0.70        47
   macro avg       0.61      0.54      0.52        47
weighted avg       0.66      0.70      0.64        47

0.7021276595744681


## Изучение модели GradientBoostingClassifier

In [19]:
%%time
gbc = GradientBoostingClassifier()
model = GridSearchCV(gbc, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 1.73 s


{'max_features': 3, 'n_estimators': 100}

In [20]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[32  1]
 [11  3]]
              precision    recall  f1-score   support

      2014.0       0.74      0.97      0.84        33
      2015.0       0.75      0.21      0.33        14

    accuracy                           0.74        47
   macro avg       0.75      0.59      0.59        47
weighted avg       0.75      0.74      0.69        47

0.7446808510638298


## Изучение модели StackingClassifier

In [21]:
%%time
model = StackingClassifier(estimators=[('bag',bag), ('gbc',gbc)],
                           final_estimator=dtc_base).fit(X_train, y_train)

Wall time: 1.06 s


In [22]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[27  6]
 [ 9  5]]
              precision    recall  f1-score   support

      2014.0       0.75      0.82      0.78        33
      2015.0       0.45      0.36      0.40        14

    accuracy                           0.68        47
   macro avg       0.60      0.59      0.59        47
weighted avg       0.66      0.68      0.67        47

0.6808510638297872


## Изучение модели CatBoostClassifier

In [23]:
%%time
cbc = CatBoostClassifier(learning_rate=0.15).fit(X_train, y_train)
print_classification_model_metrics(cbc, y_test, cbc.predict(X_test))

0:	learn: 0.6151437	total: 87.1ms	remaining: 1m 27s
1:	learn: 0.5661767	total: 89.9ms	remaining: 44.9s
2:	learn: 0.5092731	total: 97.3ms	remaining: 32.3s
3:	learn: 0.4728651	total: 100ms	remaining: 25s
4:	learn: 0.4433829	total: 103ms	remaining: 20.6s
5:	learn: 0.4135539	total: 109ms	remaining: 18.1s
6:	learn: 0.3949644	total: 117ms	remaining: 16.6s
7:	learn: 0.3759419	total: 121ms	remaining: 15s
8:	learn: 0.3592193	total: 129ms	remaining: 14.2s
9:	learn: 0.3481501	total: 131ms	remaining: 13s
10:	learn: 0.3294188	total: 134ms	remaining: 12.1s
11:	learn: 0.3159970	total: 138ms	remaining: 11.4s
12:	learn: 0.3008791	total: 142ms	remaining: 10.8s
13:	learn: 0.2886700	total: 145ms	remaining: 10.2s
14:	learn: 0.2760146	total: 149ms	remaining: 9.76s
15:	learn: 0.2700120	total: 154ms	remaining: 9.46s
16:	learn: 0.2569976	total: 157ms	remaining: 9.08s
17:	learn: 0.2464691	total: 160ms	remaining: 8.7s
18:	learn: 0.2336323	total: 162ms	remaining: 8.37s
19:	learn: 0.2269256	total: 165ms	remaining:

# Регрессия

In [24]:
def print_regression_model_metrics(estimator, y_test, y_pred):
    print(f"Коэффициент детерминации: {estimator.score(x_axis,y_axis)}")
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

In [25]:
data = pd.read_csv("../data/CSM_snake_case.csv")
data.drop('Movie', axis=1, inplace=True)
data

Unnamed: 0,Year,Ratings,Genre,Gross,Budget,Screens,Sequel,Sentiment,Views,Likes,Dislikes,Comments,Aggregate Followers
0,2014.0,6.3,8.0,9130.0,4000000.0,45.000000,1.0,0.0,3280543.0,4632.0,425.0,636.0,1.120000e+06
1,2014.0,7.1,1.0,192000000.0,50000000.0,3306.000000,2.0,2.0,583289.0,3465.0,61.0,186.0,1.235000e+07
2,2014.0,6.2,1.0,30700000.0,28000000.0,2872.000000,1.0,0.0,304861.0,328.0,34.0,47.0,4.830000e+05
3,2014.0,6.3,1.0,106000000.0,110000000.0,3470.000000,2.0,0.0,452917.0,2429.0,132.0,590.0,5.680000e+05
4,2014.0,4.7,8.0,17300000.0,3500000.0,2310.000000,2.0,0.0,3145573.0,12163.0,610.0,1082.0,1.923800e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,2015.0,6.4,4.0,1210000.0,50000000.0,66.000000,1.0,4.0,3701061.0,9325.0,641.0,1859.0,3.038193e+06
227,2015.0,5.5,15.0,21000000.0,37000000.0,2815.000000,1.0,13.0,7119456.0,18803.0,1128.0,2290.0,3.038193e+06
228,2015.0,5.4,8.0,10200000.0,35000000.0,2777.000000,1.0,7.0,3450614.0,6823.0,325.0,409.0,3.038193e+06
229,2015.0,5.4,1.0,12300000.0,3000000.0,2209.244344,1.0,10.0,66872.0,400.0,67.0,201.0,3.038193e+06


In [26]:
x_axis = data.drop('Year', 1).values
y_axis = data['Year'].values

In [27]:
X_train, X_test, y_train, y_test = train_test_split(x_axis, y_axis, test_size=0.2, stratify=y_axis)

## Базовая модель DTR

Анализ базовой модели

In [28]:
%%time
parameters_dtr = {'max_depth': np.arange(5,16,1)}
dtr = DecisionTreeRegressor().fit(X_train, y_train)
dtr_base = GridSearchCV(dtr, parameters_dtr).fit(X_train, y_train)
print(dtr_base.best_params_)
print_regression_model_metrics(dtr_base, y_test, dtr_base.predict(X_test))

{'max_depth': 5}
Коэффициент детерминации: 0.48645537755798984
MSE: 0.2898335593738235
RMSE: 0.5383619222918942
MAE: 0.3439149975320106
Wall time: 196 ms


## Изучение модели BaggingRegressor

Анализ композиции (обучение / поиск гиперпараметров)

In [29]:
%%time
br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=dtr_base.best_params_['max_depth']))
model = GridSearchCV(br, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 3.79 s


{'max_features': 3, 'n_estimators': 40}

In [30]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.5772027201633549
MSE: 0.16882461546860453
RMSE: 0.4108827271480325
MAE: 0.35040607388806705


## Изучение модели GradientBoostingRegressor

Анализ композиции (обучение / поиск гиперпараметров)

In [31]:
%%time
gbr = GradientBoostingRegressor()
model = GridSearchCV(gbr, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 840 ms


{'max_features': 3, 'n_estimators': 80}

In [32]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.7509422941263466
MSE: 0.17841340676390077
RMSE: 0.4223901120574448
MAE: 0.3278066986802398


## Изучение модели StackingRegressor

Сделаем стекинг из двух композиций и применим DecisionTreeRegressor с найденными гиперпараметрами

In [33]:
%%time
model = StackingRegressor(estimators=[('br',br), ('gbr',gbr)],
                           final_estimator=dtr_base).fit(X_train, y_train)

Wall time: 858 ms


In [34]:
print_regression_model_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.4423240043922
MSE: 0.1918223941885489
RMSE: 0.43797533513720716
MAE: 0.2981451172940398


## Изучение модели CatBoostRegressor

In [35]:
%%time
cbr = CatBoostRegressor(learning_rate=0.15).fit(X_train, y_train)
print_regression_model_metrics(cbr, y_test, cbr.predict(X_test))

0:	learn: 0.4393409	total: 3.08ms	remaining: 3.08s
1:	learn: 0.4258042	total: 9.85ms	remaining: 4.91s
2:	learn: 0.4148160	total: 12ms	remaining: 3.98s
3:	learn: 0.4059531	total: 14.7ms	remaining: 3.65s
4:	learn: 0.3953991	total: 17.5ms	remaining: 3.49s
5:	learn: 0.3854100	total: 23.4ms	remaining: 3.87s
6:	learn: 0.3796192	total: 26.1ms	remaining: 3.7s
7:	learn: 0.3666763	total: 30ms	remaining: 3.72s
8:	learn: 0.3557381	total: 33.6ms	remaining: 3.7s
9:	learn: 0.3491353	total: 41.5ms	remaining: 4.11s
10:	learn: 0.3423307	total: 44.7ms	remaining: 4.02s
11:	learn: 0.3355062	total: 48ms	remaining: 3.96s
12:	learn: 0.3292525	total: 50.4ms	remaining: 3.83s
13:	learn: 0.3197943	total: 55.5ms	remaining: 3.91s
14:	learn: 0.3130574	total: 57.7ms	remaining: 3.79s
15:	learn: 0.3081221	total: 59.8ms	remaining: 3.68s
16:	learn: 0.3048894	total: 62ms	remaining: 3.58s
17:	learn: 0.2991747	total: 64.7ms	remaining: 3.53s
18:	learn: 0.2939326	total: 68.5ms	remaining: 3.54s
19:	learn: 0.2879011	total: 70.7

# Выводы по композиционным моделям

Как и следовало ожидать, использование композиций алгоритмов дает повышает качество моделей.