Загрузка библиотек

In [1]:
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
import re

import lightgbm as lgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

Подготовка данных

In [2]:
# Load the training table; every row carries a '.geo' string and the target column 'crop'.
df = pd.read_csv('/content/train_dataset_train.csv')

# Matches one signed decimal number, optionally with an exponent.
# Pulling the numbers out directly (instead of deleting every character outside
# "0-9.,") keeps the minus sign of negative coordinates and the exponent of values
# such as 1e-05, and does not depend on how many separators sit between two numbers.
number_pattern = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

lat_list = []
long_list = []
len_geo_list = []

for geo in df['.geo']:
  coords = np.asarray(number_pattern.findall(geo), dtype=np.float64)
  if coords.size == 0:
    # Fail loudly rather than writing NaN features for an unparsable geometry.
    raise ValueError(f"no numeric coordinates found in '.geo' value: {geo!r}")
  # NOTE(review): GeoJSON stores positions as [longitude, latitude]; if '.geo' follows
  # that convention, the even-indexed values are longitudes and the 'lat'/'long' names
  # are swapped. The names are left untouched so the feature columns stay the same
  # wherever these features are rebuilt - confirm against the data.
  lat_list.append(round(coords[::2].mean(), 8))    # mean of the even-indexed numbers ("lat")
  long_list.append(round(coords[1::2].mean(), 8))  # mean of the odd-indexed numbers ("long")
  len_geo_list.append(len(coords))                 # how many numbers the geometry string holds

# Insert the three derived columns just before the last column
# (the recorded df.head(1) output shows 'crop' in that position).
insert_index = df.shape[1]-1
df.insert(insert_index, "lat", np.asarray(lat_list))
df.insert(insert_index+1, "long", np.asarray(long_list))
df.insert(insert_index+2, "len_geo", np.asarray(len_geo_list))

df.head(1)

Unnamed: 0,id,area,nd_mean_2021-04-16,nd_mean_2021-04-19,nd_mean_2021-04-22,nd_mean_2021-04-26,nd_mean_2021-04-28,nd_mean_2021-05-02,nd_mean_2021-05-04,nd_mean_2021-05-07,...,nd_mean_2021-06-10,nd_mean_2021-07-05,nd_mean_2021-08-13,nd_mean_2021-08-27,nd_mean_2021-05-08,nd_mean_2021-05-24,lat,long,len_geo,crop
0,3536,20,0.072846,0.261778,0.062981,0.104442,0.021096,0.052202,0.158723,0.0,...,0.0,0.026784,0.126832,0.61477,0.008857,0.081498,42.500303,51.412815,72,3


Отбор признаков для обучения

In [4]:
# https://proglib.io/p/feature-selector
!pip install git+https://github.com/WillKoehrsen/feature-selector.git
from feature_selector import FeatureSelector

x = df.drop(columns = ['id','.geo', 'crop'], axis = 1)
y = df['crop']

fs = FeatureSelector(data = x, labels = y)
fs.identify_zero_importance(task = 'classification', eval_metric = 'multiclass', n_iterations = 10, early_stopping = True)

zero_importance_features = fs.ops['zero_importance']
zero_importance_features

# Вывод: все признаки существенны

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[178]	valid_0's multi_logloss: 0.0656091
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[203]	valid_0's multi_logloss: 0.0843014
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[194]	valid_0's multi_logloss: 0.0787147
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[143]	valid_0's multi_logloss: 0.112217
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[161]	valid_0's multi_logloss: 0.0881954
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[205]	valid_0's multi_logloss: 0.0756783
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[159]	valid_0's multi_logloss: 0.11

[]

Подбор лучшей модели

In [8]:
# Features: every column except the identifier, the raw geometry string and the target.
x = df.drop(columns=['id', '.geo', 'crop'])
y = df['crop']

# Seed NumPy's global generator so the comparison is repeatable: train_test_split and the
# scikit-learn estimators below are created without random_state and therefore draw
# their randomness from that global generator.
SEED = 42
np.random.seed(SEED)

# Candidate models, all with default hyper-parameters (MLP only gets a larger iteration budget).
classifiers = {
  "LDA": LinearDiscriminantAnalysis(),
  "QDA": QuadraticDiscriminantAnalysis(),
  "AdaBoost": AdaBoostClassifier(),
  "Bagging": BaggingClassifier(),
  "Extra Trees Ensemble": ExtraTreesClassifier(),
  "Gradient Boosting": GradientBoostingClassifier(),
  "Random Forest": RandomForestClassifier(),
  "Ridge": RidgeClassifier(),
  "SGD": SGDClassifier(),
  "BNB": BernoulliNB(),
  "GNB": GaussianNB(),
  "KNN": KNeighborsClassifier(),
  "MLP": MLPClassifier(max_iter=1000),
  "DTC": DecisionTreeClassifier(),
  "ETC": ExtraTreeClassifier(),
  "LGBMClassifier": lgb.LGBMClassifier(),
}

# One row per repetition, one column per classifier (same order as `classifiers`).
score_list = []

for i in range(10):

  # Fresh stratified 75/25 split on every repetition.
  x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.25)

  # Standardise the data: statistics come from the training part only and are
  # then applied to both parts, so nothing leaks from the hold-out set.
  scaler = StandardScaler()
  x_train = scaler.fit_transform(x_train)
  x_test = scaler.transform(x_test)

  score_list_row = []

  for key, model in classifiers.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    score_list_row.append(recall_score(y_test, y_pred, average='macro', zero_division=0))

  score_list.append(score_list_row)

# Mean and standard deviation of the macro recall over the repetitions, per classifier.
score = np.asarray(score_list).mean(axis = 0)
std_value = np.asarray(score_list).std(axis = 0)

for key, mean_recall, recall_std in zip(classifiers, score, std_value):
  print(key, mean_recall, recall_std)

# Conclusion from the recorded (unseeded) run: the best model is LGBMClassifier with
# recall_score = 0.97068 and std = 0.002792; a seeded re-run may differ slightly.

LDA 0.9408784499883985 0.007512955172610972
QDA 0.9468676760629251 0.005917332254438117
AdaBoost 0.5947153111293523 0.07018002277035973
Bagging 0.9285130935945984 0.007089494116007768
Extra Trees Ensemble 0.9610368470262347 0.004285733287419532
Gradient Boosting 0.9635486967878147 0.002871586886149703
Random Forest 0.9657580296669683 0.004843040661410231
Ridge 0.9389419341191101 0.006173333447506652
SGD 0.92251062845617 0.0074698925069881595
BNB 0.8526356871415903 0.008515030246537164
GNB 0.9398856419362922 0.007140188795747817
KNN 0.9270075714106139 0.004358501849411202
MLP 0.9496760014906084 0.005208776309326803
DTC 0.8957765266478956 0.006979069218693303
ETC 0.7441452389687628 0.014155302027780193
LGBMClassifier 0.970683398007583 0.002791734212649729


Оптимизация гиперпараметров лучшей модели

In [35]:
# Search grid for the LightGBM classifier. Single-element lists pin a parameter;
# lists with two values are the ones actually searched.
parameters = {'objective': ['multiclass'],
              'metric': ['multiclassova'],
              'boosting_type': ['dart'],
              'num_leaves': [32, 64], # a tree of depth d has at most 2^d leaves (32 = 2^5, 64 = 2^6)
              'num_class': [7],
              # NOTE(review): bagging_fraction is not part of the grid; if it stays at 1.0
              # the two bagging_freq values may train identical models - confirm.
              'bagging_freq': [0, 1],
              'feature_fraction': [0.3, 0.4],
              'max_depth': [5, 6],
              'learning_rate': [0.15, 0.2],
              'n_estimators': [300],
              'max_bin': [255]
             }

# Score candidates with macro-averaged recall, the same metric the model comparison
# uses (recall_score with average='macro'); GridSearchCV's default for a classifier is
# accuracy, which would tune the model for a different objective.
# NOTE: x and y are the training features/target left by an earlier cell; re-create
# them first if the prediction cell (which reassigns x) has been run in between.
model = GridSearchCV(lgb.LGBMClassifier(), parameters, scoring='recall_macro', cv=5)
model.fit(x, y)

model.best_params_

{'bagging_freq': 0,
 'boosting_type': 'dart',
 'feature_fraction': 0.3,
 'learning_rate': 0.15,
 'max_bin': 255,
 'max_depth': 6,
 'metric': 'multiclassova',
 'n_estimators': 300,
 'num_class': 7,
 'num_leaves': 32,
 'objective': 'multiclass'}

Обучение и сохранение лучшей модели

In [10]:
# Hyper-parameters of the final model, written out explicitly so this cell
# does not need the grid search to be re-run.
final_params = {
  'objective': 'multiclass',
  'metric': 'multiclassova',
  'boosting_type': 'dart',
  'num_leaves': 32,
  'num_class': 7,
  'bagging_freq': 0,
  'feature_fraction': 0.3,
  'max_depth': 6,
  'learning_rate': 0.15,
  'n_estimators': 300,
  'max_bin': 255,
}

# Fit on the whole training set (x, y come from an earlier cell).
model = lgb.LGBMClassifier(**final_params)
model.fit(x, y)

# Save the underlying booster to a text file
model.booster_.save_model('model.txt')

<lightgbm.basic.Booster at 0x7f04fa6b9490>

Прогнозирование

In [15]:
# Libraries needed when this cell is run on its own for prediction
# import numpy as np
# from numpy import mean
# import pandas as pd
# import lightgbm as lgb
# import re

# NOTE: this cell reassigns df, x and model; re-run the data preparation cells
# before re-running any training cell afterwards.
df = pd.read_csv('/content/test_dataset_test.csv')

# Matches one signed decimal number, optionally with an exponent.
# Pulling the numbers out directly (instead of deleting every character outside
# "0-9.,") keeps the minus sign of negative coordinates and the exponent of values
# such as 1e-05, and does not depend on how many separators sit between two numbers.
number_pattern = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

lat_list = []
long_list = []
len_geo_list = []

for geo in df['.geo']:
  coords = np.asarray(number_pattern.findall(geo), dtype=np.float64)
  if coords.size == 0:
    # Fail loudly rather than predicting from NaN features for an unparsable geometry.
    raise ValueError(f"no numeric coordinates found in '.geo' value: {geo!r}")
  lat_list.append(round(coords[::2].mean(), 8))    # mean of the even-indexed numbers ("lat")
  long_list.append(round(coords[1::2].mean(), 8))  # mean of the odd-indexed numbers ("long")
  len_geo_list.append(len(coords))                 # how many numbers the geometry string holds

# The test table has no target column, so the derived columns go at the very end.
insert_index = df.shape[1]
df.insert(insert_index, "lat", np.asarray(lat_list))
df.insert(insert_index+1, "long", np.asarray(long_list))
df.insert(insert_index+2, "len_geo", np.asarray(len_geo_list))

x = df.drop(columns=['id', '.geo'])

# Make the prediction
model = lgb.Booster(model_file='model.txt')

# The booster consumes features by position. When the test table holds exactly the
# features the model was trained on but in another order, put them in training order;
# in every other case the frame is passed through unchanged, as before.
trained_features = model.feature_name()
if list(x.columns) != trained_features and sorted(x.columns) == sorted(trained_features):
  x = x[trained_features]

y_proba = model.predict(x)
# NOTE(review): argmax returns the position of the most probable class; it equals the
# crop label only if the training labels were exactly 0..num_class-1 - confirm.
y_pred = y_proba.argmax(axis=1)

# Save the submission
df_pred = pd.DataFrame({'id': df['id'].to_numpy(), 'crop': y_pred})
df_pred.to_csv('sample_solution.csv', index=False, sep=',')