In [1]:
import pandas as pd
import plotly.graph_objects as go
import statistics
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ARDRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LassoLars
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LassoLarsIC
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import TheilSenRegressor

Словарь моделей для обучения

In [2]:
from sklearn.tree import DecisionTreeClassifier

# Candidate regressors to compare. Each one is instantiated with its default
# hyper-parameters and registered under its own class name.
_model_classes = (
    HistGradientBoostingRegressor,
    DecisionTreeRegressor,
    ARDRegression,
    AdaBoostRegressor,
    BaggingRegressor,
    BayesianRidge,
    ElasticNet,
    ExtraTreeRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    KernelRidge,
    Lasso,
    LassoCV,
    LassoLars,
    LassoLarsCV,
    LassoLarsIC,
    LinearRegression,
    RandomForestRegressor,
    Ridge,
    RidgeCV,
    TheilSenRegressor,
)

# Mapping "class name" -> estimator instance, in the order listed above.
models = {model_cls.__name__: model_cls() for model_cls in _model_classes}

Загрузка обучающих данных для поиска оптимальной модели

In [5]:
path = "../input/task2/"

# Load both spreadsheets; the 'Item' column is removed from the test sheet
# right after reading it.
train_data = pd.read_excel(f"{path}TRAIN.xlsx")
test_data = pd.read_excel(f"{path}TEST.xlsx").drop(columns='Item')

# Every column except the last one is used as a feature; the last column is
# the regression target.
x_data = train_data.iloc[:, :-1]
y_data = train_data.iloc[:, -1]


Визуализация коэффициента детерминации (R²) для каждой модели

In [6]:
# Score every candidate model on `num_iterations` random train/test splits and
# plot the R^2 obtained on each split, one trace per model.
fig = go.Figure()
result = {}
num_iterations = 20
iterations = list(range(num_iterations))
for model_name, model in models.items():
    result[model_name] = []
    for i in iterations:
        # Seed the split with the iteration index: the run becomes reproducible
        # and, more importantly, every model is scored on exactly the same
        # sequence of splits, so their R^2 curves are directly comparable.
        X_train, X_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=0.15, random_state=i
        )
        # fit() re-estimates the model from scratch on the current training part
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        # R^2 (coefficient of determination) on the held-out part
        score = r2_score(y_test, predictions)
        result[model_name].append(score)
    fig.add_trace(go.Scatter(name=model_name,
                             x=iterations,
                             y=result[model_name],
                             opacity=0.6
                             ))
# r2_score returns the coefficient of determination, not a correlation
# coefficient, so the figure is labelled accordingly.
fig.update_layout(title="Динамика коэффициента детерминации (R^2) для разных моделей обучения", title_x=0.45)
fig.update_layout(xaxis_title="Итерация",
                  yaxis_title="Коэффициент детерминации (R^2)")
fig.show()

Выбор наиболее эффективной модели

In [7]:
# Pick the model with the highest mean R^2 among those whose *worst* split
# still scored above `min_needed_result`.
min_needed_result = 0.5

# Mean score of every model that clears the threshold on all of its splits.
# Dict order follows `result`, so on a tie the earlier model wins (max() keeps
# the first maximum it meets).
eligible_means = {
    model_name: statistics.mean(scores)
    for model_name, scores in result.items()
    if scores and min(scores) > min_needed_result
}

if not eligible_means:
    # Fail with a clear message instead of a TypeError on `None * 100` below
    # (and an AttributeError later when the missing model is fitted).
    raise ValueError(
        f"No model scored above {min_needed_result} on every split; "
        "lower min_needed_result or revisit the candidate models."
    )

best_model_name = max(eligible_means, key=eligible_means.get)
best_model_coef = eligible_means[best_model_name]
print(f"Best model is {best_model_name} with correlation coefficient of {round(best_model_coef * 100, 2)}%")

Best model is LinearRegression with correlation coefficient of 74.22%


Прогнозирование OF на основе выбранной модели с наилучшим коэффициентом детерминации (R²)

In [8]:
# Refit the selected model on the complete training set, predict the test set
# and export the predictions to CSV.
X_train = train_data[train_data.columns[:-1]]
y_train = train_data[train_data.columns[-1]]
# This cell stores its output in test_data['predict']. Exclude that column from
# the features so re-running the cell does not feed old predictions back into
# the model (which would fail with a feature-count mismatch).
X_test = test_data.drop(columns=['predict'], errors='ignore')

model = models.get(best_model_name)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Keep the predictions on test_data, rounded to 5 decimal places.
test_data['predict'] = predictions
test_data['predict'] = test_data['predict'].round(5)
# Single-column CSV with a 'predict' header and no index column.
test_data['predict'].to_csv("Petroleum_team_120_2.csv", index=False)