In [244]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn import metrics

from sklearn.model_selection import GridSearchCV

In [245]:
soybean = pd.read_csv('soybean-large_data.csv')

In [246]:
soybean.head(10)

Unnamed: 0,name,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0
5,diaporthe-stem-canker,5,0,2,1,0,3,0,1,0,...,0,0,0,4,0,0,0,0,0,0
6,diaporthe-stem-canker,5,0,2,1,0,2,0,1,1,...,0,0,0,4,0,0,0,0,0,0
7,diaporthe-stem-canker,4,0,2,1,1,1,0,1,0,...,0,0,0,4,0,0,0,0,0,0
8,diaporthe-stem-canker,6,0,2,1,0,3,0,1,1,...,0,0,0,4,0,0,0,0,0,0
9,diaporthe-stem-canker,4,0,2,1,0,2,0,2,0,...,0,0,0,4,0,0,0,0,0,0


На первый взгляд большинство переменных категориальные, но проверим это с помощью функции info().

In [247]:
print(soybean.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             307 non-null    object
 1   date             307 non-null    object
 2   plant-stand      307 non-null    object
 3   precip           307 non-null    object
 4   temp             307 non-null    object
 5   hail             307 non-null    object
 6   crop-hist        307 non-null    object
 7   area-damaged     307 non-null    object
 8   severity         307 non-null    object
 9   seed-tmt         307 non-null    object
 10  germination      307 non-null    object
 11  plant-growth     307 non-null    object
 12  leaves           307 non-null    int64 
 13  leafspots-halo   307 non-null    object
 14  leafspots-marg   307 non-null    object
 15  leafspot-size    307 non-null    object
 16  leaf-shread      307 non-null    object
 17  leaf-malf        307 non-null    ob

Как мы видим, действительно, в датасете только одна переменная numeric - leaves. Посмотрим на более детальное описание данных колонок, включая количество уникальных значений.

In [248]:
soybean.describe(include=['object'])

Unnamed: 0,name,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
count,307,307,307,307,307,307,307,307,307,307,...,307,307,307,307,307,307,307,307,307,307
unique,19,8,3,4,4,3,5,5,4,4,...,4,3,5,5,3,3,3,3,3,4
top,frog-eye-leaf-spot,5,0,2,1,0,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0
freq,40,65,160,210,179,211,99,101,151,140,...,266,286,193,164,227,247,244,251,256,260


Как мы видели из результатов функции info() NA обозначенных непосредственно так у нас в датасете нет, однако отсутствующие значения могут быть обозначены иначе. Чтобы проверить это, посмотрим какие уникальные элементы есть в колонках:

In [249]:
for col in soybean.columns:
    print(soybean[col].unique())

['diaporthe-stem-canker' 'charcoal-rot' 'rhizoctonia-root-rot'
 'phytophthora-rot' 'brown-stem-rot' 'powdery-mildew' 'downy-mildew'
 'brown-spot' 'bacterial-blight' 'bacterial-pustule' 'purple-seed-stain'
 'anthracnose' 'phyllosticta-leaf-spot' 'alternarialeaf-spot'
 'frog-eye-leaf-spot' 'diaporthe-pod-&-stem-blight' 'cyst-nematode'
 '2-4-d-injury' 'herbicide-injury']
['6' '4' '3' '5' '1' '0' '2' '?']
['0' '1' '?']
['2' '0' '1' '?']
['1' '2' '0' '?']
['0' '1' '?']
['1' '2' '3' '0' '?']
['1' '0' '3' '2' '?']
['1' '2' '?' '0']
['0' '1' '?' '2']
['0' '1' '2' '?']
['1' '0' '?']
[1 0]
['0' '?' '2' '1']
['2' '?' '0' '1']
['2' '?' '1' '0']
['0' '?' '1']
['0' '?' '1']
['0' '?' '1' '2']
['1' '0' '?']
['1' '0' '?']
['3' '0' '1' '2' '?']
['1' '0' '3' '2' '?']
['1' '0' '?']
['1' '0' '?']
['0' '1' '?']
['0' '2' '1' '?']
['0' '1' '?']
['0' '3' '?' '1' '2']
['4' '?' '0' '1' '2']
['0' '?' '1']
['0' '?' '1']
['0' '?' '1']
['0' '?' '1']
['0' '?' '1']
['0' '1' '2' '?']


Видим, что практически во всех колонках как одно из значений присутствует знак вопроса, что соответствует пропущенным значениям. Избавимся от них. 

In [251]:
soybean_new = soybean[~(soybean == '?').any(axis=1)]

Еще раз проверим уникальные значения в колонках и удостоверимся, что "вопросов" больше не осталось.

In [252]:
for col in soybean_new:
    print(soybean_new[col].unique())

['diaporthe-stem-canker' 'charcoal-rot' 'rhizoctonia-root-rot'
 'phytophthora-rot' 'brown-stem-rot' 'powdery-mildew' 'downy-mildew'
 'brown-spot' 'bacterial-blight' 'bacterial-pustule' 'purple-seed-stain'
 'anthracnose' 'phyllosticta-leaf-spot' 'alternarialeaf-spot'
 'frog-eye-leaf-spot']
['6' '4' '3' '5' '1' '0' '2']
['0' '1']
['2' '0' '1']
['1' '2' '0']
['0' '1']
['1' '2' '3' '0']
['1' '0' '3' '2']
['1' '2' '0']
['0' '1' '2']
['0' '1' '2']
['1' '0']
[1 0]
['0' '2' '1']
['2' '0' '1']
['2' '1' '0']
['0' '1']
['0' '1']
['0' '1' '2']
['1' '0']
['1' '0']
['3' '0' '1' '2']
['1' '0' '3' '2']
['1' '0']
['1' '0']
['0' '1']
['0' '2' '1']
['0' '1']
['0' '3' '1']
['4' '0' '1' '2']
['0' '1']
['0' '1']
['0' '1']
['0' '1']
['0' '1']
['0' '2' '1']


Теперь проверим датасет на корреляцию между переменными. Так как большинство переменных категориальные - воспользуемся функцией factorize  для числовой репрезентации категориальных данных. С помощью функции corr создадим корреляционную матрицу и визуализируем ее с помощью style.background_gradient

In [206]:
corr = soybean_new.apply(lambda x: x.factorize()[0]).corr()

In [207]:
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,name,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,plant-growth,leaves,leafspots-halo,leafspots-marg,leafspot-size,leaf-shread,leaf-malf,leaf-mild,stem,lodging,stem-cankers,canker-lesion,fruiting-bodies,external decay,mycelium,int-discolor,sclerotia,fruit-pods,fruit spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
name,1.0,-0.19,-0.1,-0.11,-0.16,-0.15,0.12,0.12,0.35,0.07,-0.06,0.62,-0.13,0.49,0.57,0.5,0.05,0.03,-0.17,0.35,0.22,-0.34,0.11,0.13,0.01,-0.13,-0.39,-0.33,0.26,0.66,0.11,0.02,0.14,0.12,0.13,0.02
date,-0.19,1.0,0.26,-0.04,0.01,0.01,0.08,-0.15,-0.17,-0.04,-0.04,-0.15,0.02,0.07,0.04,-0.0,0.14,0.02,0.05,0.1,0.11,0.33,-0.03,0.19,-0.0,0.02,-0.17,-0.14,-0.06,-0.19,-0.13,0.01,-0.19,-0.11,-0.11,-0.01
plant-stand,-0.1,0.26,1.0,-0.0,0.25,-0.07,0.09,-0.16,0.02,0.02,0.39,-0.16,0.1,-0.06,-0.07,-0.12,-0.08,0.01,0.12,0.02,0.01,0.26,0.05,0.1,0.01,0.02,-0.15,-0.17,0.07,-0.08,-0.1,0.02,-0.2,-0.01,0.03,0.12
precip,-0.11,-0.04,-0.0,1.0,0.03,0.13,0.01,0.18,0.23,0.12,-0.07,-0.02,-0.12,0.06,0.03,0.06,-0.02,0.11,0.01,0.11,-0.08,0.08,-0.04,0.14,0.27,-0.05,0.29,0.15,-0.23,-0.13,-0.09,-0.13,0.0,-0.02,-0.07,0.09
temp,-0.16,0.01,0.25,0.03,1.0,0.06,0.05,-0.13,0.17,0.01,0.1,-0.28,0.24,-0.15,-0.13,-0.13,-0.22,0.01,0.16,-0.1,-0.12,0.3,0.07,0.2,-0.09,0.18,0.04,0.02,0.15,-0.17,0.06,0.0,0.05,-0.15,-0.09,-0.01
hail,-0.15,0.01,-0.07,0.13,0.06,1.0,0.14,0.0,0.0,0.06,0.06,-0.2,0.04,-0.08,-0.07,-0.0,0.03,0.11,0.11,0.1,-0.34,0.09,-0.02,0.03,0.03,0.06,0.01,0.14,-0.09,-0.06,0.17,0.12,0.22,-0.0,-0.05,-0.01
crop-hist,0.12,0.08,0.09,0.01,0.05,0.14,1.0,0.11,0.1,-0.08,0.01,0.08,-0.02,0.13,0.1,0.12,-0.04,0.05,-0.02,0.16,-0.08,0.07,-0.05,0.1,0.05,0.02,-0.06,-0.06,-0.08,0.02,-0.04,-0.09,-0.02,-0.05,-0.08,0.01
area-damaged,0.12,-0.15,-0.16,0.18,-0.13,0.0,0.11,1.0,-0.02,-0.01,-0.06,0.24,-0.15,0.13,0.13,0.11,0.03,0.1,-0.01,0.05,-0.01,-0.28,-0.05,-0.07,0.19,-0.12,0.23,0.18,-0.08,0.22,0.05,0.02,0.05,0.05,0.07,-0.02
severity,0.35,-0.17,0.02,0.23,0.17,0.0,0.1,-0.02,1.0,0.08,0.09,0.19,-0.05,0.23,0.24,0.25,-0.04,0.03,-0.07,0.17,0.01,-0.01,0.11,0.1,-0.07,0.02,-0.27,-0.16,0.1,0.14,0.1,-0.13,0.26,-0.06,-0.03,0.02
seed-tmt,0.07,-0.04,0.02,0.12,0.01,0.06,-0.08,-0.01,0.08,1.0,-0.02,0.03,-0.06,-0.07,-0.02,-0.04,0.09,0.0,-0.02,0.09,-0.0,0.03,0.12,0.03,0.03,-0.08,0.02,-0.02,-0.02,0.06,-0.02,-0.06,0.04,-0.08,-0.02,-0.09


Видим достаточно сильную корреляцию между переменными leafspots-halo,leafspots-marg,leafspot-size, поэтому удалим 2 из них из анализа, так как в целом они описывают различные параметры пятен на листе, то есть вполне могут заменять друг друга. 

In [208]:
soybean_new = soybean_new.drop(['leafspots-halo','leafspots-marg'],axis = 1)

Чтобы разбить датасет на стратифицированные выборки надо удостовериться, что во всех классах больше двух значений. 

In [209]:
soybean_new['name'].value_counts()

brown-spot                40
frog-eye-leaf-spot        40
alternarialeaf-spot       40
brown-stem-rot            20
anthracnose               20
phytophthora-rot          16
downy-mildew              10
purple-seed-stain         10
phyllosticta-leaf-spot    10
bacterial-blight          10
powdery-mildew            10
bacterial-pustule         10
diaporthe-stem-canker     10
rhizoctonia-root-rot      10
charcoal-rot              10
Name: name, dtype: int64

In [253]:
y2 = soybean_new.name

In [214]:
 X2_train, X2_test, y2_train, y2_test = train_test_split(soybean_new, y2, test_size=0.33, random_state=42, stratify=soybean_2['name'])

Удалим из тренировочного датасета колонку с классами

In [215]:
X2_train = X2_train.drop(['name'],axis =1)

Создадим классификатор с дефолтными значениями и обучим его на тренировочном датасете

In [217]:
clf = RandomForestClassifier()

In [218]:
clf.fit(X2_train,y2_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Также удалим колонку с классами из тестового датасета

In [219]:
X2_test = X2_test.drop(['name'],axis = 1)

Создадим функцию для нахождения топ-3 признаков для классификации

In [224]:
def top_impact(clf,X):
    f_i = pd.DataFrame({'feature':X.columns,\
                              'importance':pd.array(clf.feature_importances_)})
    f_i = f_i.sort_values('importance',ascending=False)
    f_i.importance = np.around(f_i.importance,decimals=2)
    f_i
    return f_i

In [225]:
top_impact(clf,X2_test)

Unnamed: 0,feature,importance
0,date,0.1
12,leafspot-size,0.09
25,fruit-pods,0.06
26,fruit spots,0.05
2,precip,0.05
19,canker-lesion,0.05
23,int-discolor,0.05
6,area-damaged,0.04
18,stem-cankers,0.04
7,severity,0.04


Как мы можем видеть, наиболее важными для предсказания оказались такие признаки как дата, размер пятен на листе и форма плодов.

Теперь оценим качество классификатора по различным метрикам.

In [227]:
predictions = clf.predict(X2_test)

In [228]:
print(metrics.classification_report(y2_test, predictions))

                        precision    recall  f1-score   support

   alternarialeaf-spot       0.69      0.85      0.76        13
           anthracnose       1.00      1.00      1.00         7
      bacterial-blight       0.75      1.00      0.86         3
     bacterial-pustule       1.00      0.67      0.80         3
            brown-spot       0.93      1.00      0.96        13
        brown-stem-rot       1.00      1.00      1.00         7
          charcoal-rot       1.00      1.00      1.00         3
 diaporthe-stem-canker       1.00      1.00      1.00         3
          downy-mildew       1.00      1.00      1.00         4
    frog-eye-leaf-spot       0.80      0.62      0.70        13
phyllosticta-leaf-spot       1.00      0.67      0.80         3
      phytophthora-rot       1.00      1.00      1.00         5
        powdery-mildew       1.00      1.00      1.00         3
     purple-seed-stain       1.00      1.00      1.00         4
  rhizoctonia-root-rot       1.00      

In [229]:
print(metrics.confusion_matrix(y2_test, predictions))

[[11  0  0  0  0  0  0  0  0  2  0  0  0  0  0]
 [ 0  7  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  3  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 13  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  7  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  3  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  4  0  0  0  0  0  0]
 [ 5  0  0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  5  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  4  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  4]]


Как мы видим, в целом точность предсказаний уже неплохая. Наибольшую сложность представляет классификация alternarialeaf-spot и  frog-eye-leaf-spot, так как 5 frog-eye-leaf-spot были классифицированы как alternarialeaf-spot и наоборот 2 наблюдения относящиеся к alternarialeaf-spot были классифицированы какfrog-eye-leaf-spot . Но тем не менее, для большинства классов наблюдаются значения F1-score близкие к единице, так что точность предсказания уже хорошая. Однако попробуем сделать ее еще лучше.

Попробуем разные значения параметров для количества деревьев и максимальной глубины с помощью GridSearchCV.

In [232]:
parameters = {'n_estimators':[10,20,30,40,50],'max_depth':range(1,10)}

In [233]:
gsc_clf = GridSearchCV(clf,parameters,cv = 5)

In [234]:
gsc_clf.fit(X2_train,y2_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [235]:
best_clf = gsc_clf.best_estimator_

In [236]:
gsc_clf.best_params_

{'max_depth': 7, 'n_estimators': 50}

In [237]:
best_clf.score(X2_test,y2_test)

0.9090909090909091

Оптимальными оказались параметры 50 деревьев и глубина 7 и при их применении значение точности действительно улучшилось. Посмотрим метрики более детально.

In [238]:
predictions_best = best_clf.predict(X2_test)

In [241]:
print(metrics.classification_report(y2_test, predictions_best))

                        precision    recall  f1-score   support

   alternarialeaf-spot       0.69      0.85      0.76        13
           anthracnose       1.00      1.00      1.00         7
      bacterial-blight       1.00      1.00      1.00         3
     bacterial-pustule       1.00      0.67      0.80         3
            brown-spot       0.81      1.00      0.90        13
        brown-stem-rot       1.00      1.00      1.00         7
          charcoal-rot       1.00      1.00      1.00         3
 diaporthe-stem-canker       1.00      1.00      1.00         3
          downy-mildew       1.00      1.00      1.00         4
    frog-eye-leaf-spot       1.00      0.62      0.76        13
phyllosticta-leaf-spot       1.00      1.00      1.00         3
      phytophthora-rot       1.00      1.00      1.00         5
        powdery-mildew       1.00      1.00      1.00         3
     purple-seed-stain       1.00      1.00      1.00         4
  rhizoctonia-root-rot       1.00      

In [243]:
print(metrics.confusion_matrix(y2_test, predictions_best))

[[11  0  0  0  2  0  0  0  0  0  0  0  0  0  0]
 [ 0  7  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  3  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 13  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  7  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  3  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  4  0  0  0  0  0  0]
 [ 5  0  0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  5  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  4  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  4]]


Как мы видим, у нас улучшилось значение Precision для frog-eye-leaf-spot, то есть alternarialeaf-spot больше не классифицируются ошибочно как frog-eye-leaf-spot (теперь они ошибочно классифицируются как   brown-spot, у которого значение Precision как раз упало) , однако для  alternarialeaf-spot метрики не улучшились. Но в целом классификатор стал немного лучше. 

Посмотрим также топ-3 признаков 

In [254]:
top_impact(best_clf,X2_test)

Unnamed: 0,feature,importance
0,date,0.1
12,leafspot-size,0.1
26,fruit spots,0.06
25,fruit-pods,0.06
19,canker-lesion,0.06
23,int-discolor,0.05
18,stem-cankers,0.05
2,precip,0.05
15,leaf-mild,0.04
20,fruiting-bodies,0.03


Как мы видим в целом leafspot-size по значению значимости поровнялся с date, а пятна на плоде теперь одинаково значимы с формой плода.