In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Создаем игрушечные данные

In [2]:
def create_df(dic, feature_list):
    """Build a DataFrame from ``dic`` with ``feature_list`` columns dummy-encoded.

    Parameters
    ----------
    dic : mapping of column name -> sequence of values
        Passed directly to ``pd.DataFrame``.
    feature_list : list of column names
        Columns that are run through ``pd.get_dummies`` and then removed
        from the result under their original names.

    Returns
    -------
    pd.DataFrame
        The columns not listed in ``feature_list`` first, followed by the
        indicator columns produced by ``pd.get_dummies``.
    """
    frame = pd.DataFrame(dic)
    indicators = pd.get_dummies(frame[feature_list])
    # Append the indicator columns first, then remove the raw columns by name.
    return pd.concat([frame, indicators], axis=1).drop(feature_list, axis=1)

def intersect_features(train, test):
    """Restrict ``train`` and ``test`` to the columns they have in common.

    The shared columns are returned in the order in which they appear in
    ``train``. Building the column list from a ``set`` intersection would
    make the order depend on string hashing, which can differ between
    interpreter runs and therefore change the feature order seen by a model.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames whose column sets may differ.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``train`` and ``test`` indexed by the same ordered list of columns.
    """
    test_columns = set(test.keys())
    common_feat = [col for col in train.keys() if col in test_columns]
    return train[common_feat], test[common_feat]

# Categorical columns that create_df turns into indicator columns.
features = ['Looks', 'Alcoholic_beverage', 'Eloquence', 'Money_spent']

# Seven labelled training objects; the '+'/'-' target is integer-encoded
# with LabelEncoder before the frame is built.
train_raw = {
    'Looks': ['handsome', 'handsome', 'handsome', 'repulsive',
              'repulsive', 'repulsive', 'handsome'],
    'Alcoholic_beverage': ['yes', 'yes', 'no', 'no', 'yes', 'yes', 'yes'],
    'Eloquence': ['high', 'low', 'average', 'average', 'low',
                  'high', 'average'],
    'Money_spent': ['lots', 'little', 'lots', 'little', 'lots',
                    'lots', 'lots'],
    'Will_go': LabelEncoder().fit_transform(['+', '-', '+', '-', '-', '+', '+']),
}
df_train = create_df(train_raw, features)

# Three unlabelled test objects described by the same four features.
test_raw = {
    'Looks': ['handsome', 'handsome', 'repulsive'],
    'Alcoholic_beverage': ['no', 'yes', 'yes'],
    'Eloquence': ['average', 'high', 'average'],
    'Money_spent': ['lots', 'little', 'lots'],
}
df_test = create_df(test_raw, features)

df_train

Unnamed: 0,Will_go,Looks_handsome,Looks_repulsive,Alcoholic_beverage_no,Alcoholic_beverage_yes,Eloquence_average,Eloquence_high,Eloquence_low,Money_spent_little,Money_spent_lots
0,0,1,0,0,1,0,1,0,0,1
1,1,1,0,0,1,0,0,1,1,0
2,0,1,0,1,0,1,0,0,0,1
3,1,0,1,1,0,1,0,0,1,0
4,1,0,1,0,1,0,0,1,0,1
5,0,0,1,0,1,0,1,0,0,1
6,0,1,0,0,1,1,0,0,0,1


## Вычисление энтропии исходной системы

**S${_0}$= −$\frac{3}{7}$log${_2}$($\frac{3}{7}$) − $\frac{4}{7}$log${_2}$($\frac{4}{7}$) = 0.985**

### Делим данные по признаку "Looks_handsome", вычисляем энтропию в левой и правой части после деления и вычисляем прирост информации

**$S_1 = -\frac{2}{3}\log_2\frac{2}{3} - \frac{1}{3}\log_2\frac{1}{3} = 0.918$** (Looks_handsome = 0, 3 объекта)

**$S_2 = -\frac{3}{4}\log_2\frac{3}{4} - \frac{1}{4}\log_2\frac{1}{4} = 0.811$** (Looks_handsome = 1, 4 объекта)

**IG(Q) = S${_0}$ - $\frac{3}{7}$S${_1}$ - $\frac{4}{7}$S${_2}$ = 0.985 - $\frac{3}{7}$*0.918 - $\frac{4}{7}$*0.811 = 0.128**

## Обучите дерево решений с помощью sklearn на обучающих данных. Глубина дерева может быть любой

In [3]:
# Unfitted classifier with default settings.
# NOTE(review): `tree` is rebound to an entropy-based classifier in In [5]
# before this instance is fitted or used, so this cell looks redundant.
tree = DecisionTreeClassifier()

In [4]:
# Separate the encoded target column from the indicator features.
target_column = 'Will_go'
y_train = df_train[target_column]
X_train = df_train.drop(columns=[target_column])

In [5]:
tree = DecisionTreeClassifier(criterion='entropy', random_state=17)
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=17, splitter='best')

**Рассмотрим следующий пример: у нас есть 9 синих шаров и 11 желтых шаров. Пусть шар имеет метку 1, если он синий, 0 в противном случае.**

In [6]:
# 9 blue balls (label 1) followed by 11 yellow balls (label 0).
balls = [1] * 9 + [0] * 11

![](Screenshot_1.png)

In [7]:
# The same 20 balls split into two groups.
balls_left = [1] * 8 + [0] * 5   # 8 blue and 5 yellow
balls_right = [1] * 1 + [0] * 6  # 1 blue and 6 yellow

## Реализовать функцию для вычисления энтропии Шеннона

In [8]:
from math import log

def entropy(a_list):
    """Return the Shannon entropy (base 2) of the labels in ``a_list``.

    Parameters
    ----------
    a_list : iterable of hashable labels
        May be a list, a pandas Series, or a one-shot iterator/generator.

    Returns
    -------
    int or float
        ``0`` when there are fewer than two distinct labels (including an
        empty input); otherwise ``-sum(p * log2(p))`` over the relative
        frequency ``p`` of each distinct label.
    """
    # Materialize exactly once: iterating the argument a second time would
    # see an already exhausted iterator and silently yield an entropy of 0.
    values = list(a_list)
    # A single counting pass instead of one list.count() scan per label.
    counts = Counter(values)
    if len(counts) < 2:
        return 0
    total = len(values)
    entropy_shenon = 0
    for count in counts.values():
        share = count / total
        entropy_shenon -= share * log(share, 2)
    return entropy_shenon

In [9]:
print(entropy(balls))
print(entropy(balls_left))
print(entropy(balls_right))
print(entropy([1,2,3,4,5,6]))

0.9927744539878084
0.961236604722876
0.5916727785823275
2.584962500721156


## Прирост информации

In [10]:
def information_gain(root, left, right):
    """Return the information gain of splitting ``root`` into ``left`` and ``right``.

    The gain is the entropy of ``root`` minus the entropies of the two
    children, each weighted by its share of the root's length.

    Parameters
    ----------
    root, left, right : sized collections of labels
        ``root`` is the parent group; ``left`` and ``right`` are its parts.

    Returns
    -------
    float
        ``entropy(root) - len(left)/len(root) * entropy(left)
        - len(right)/len(root) * entropy(right)``.
    """
    root_entropy = entropy(root)
    child_entropies = [(child, entropy(child)) for child in (left, right)]

    gain = root_entropy
    # Subtract the children one at a time, left first.
    for child, child_entropy in child_entropies:
        gain -= (len(child) / len(root)) * child_entropy
    return gain

information_gain(balls, balls_left, balls_right)

0.16088518841412436

# Dataset Adult

In [11]:
# Location of the Adult CSV file. The default is the original
# machine-specific path; set the ADULT_CSV_PATH environment variable to run
# the notebook elsewhere without editing this cell.
ADULT_CSV_PATH = os.environ.get("ADULT_CSV_PATH", "G:/python/pandas/adult.data.csv")
data = pd.read_csv(ADULT_CSV_PATH)
data[100:105]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
100,76,Private,124191,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
101,44,Private,198282,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,60,United-States,>50K
102,47,Self-emp-not-inc,149116,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,<=50K
103,20,Private,188300,Some-college,10,Never-married,Tech-support,Own-child,White,Female,0,0,40,United-States,<=50K
104,29,Private,103432,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [12]:
data.tail(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [13]:
data.shape

(32561, 15)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [15]:
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [16]:
for col in data.columns:
    print(col,"\n", data[col].value_counts())
    print()

age 
 36    898
31    888
34    886
23    877
35    876
     ... 
83      6
85      3
88      3
86      1
87      1
Name: age, Length: 73, dtype: int64

workclass 
 Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

fnlwgt 
 203488    13
123011    13
164190    13
148995    12
113364    12
          ..
218551     1
201204     1
362999     1
162297     1
145522     1
Name: fnlwgt, Length: 21648, dtype: int64

education 
 HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: educa

In [17]:
data[data['workclass'] == '?']['workclass'].count()

1836

In [18]:
data[data['native-country'] == '?']['native-country'].count()

583

In [19]:
data[data['occupation'] == '?']['occupation'].count()

1843

In [20]:
data[data['workclass'] == '?'].head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,?,293936,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
69,25,?,200681,Some-college,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
106,17,?,304873,10th,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K


In [21]:
# Remove every row in which one of these columns holds the '?' placeholder,
# processing the columns in the same order: workclass, occupation, native-country.
for column in ('workclass', 'occupation', 'native-country'):
    placeholder_rows = data.loc[data[column] == '?'].index
    data = data.drop(index=placeholder_rows)
data.shape

(30162, 15)

In [22]:
for col in data.columns:
    print(col,"\n", data[col].value_counts())
    print()

age 
 36    852
31    851
33    837
34    836
37    828
     ... 
82      7
83      5
85      3
88      3
86      1
Name: age, Length: 72, dtype: int64

workclass 
 Private             22286
Self-emp-not-inc     2499
Local-gov            2067
State-gov            1279
Self-emp-inc         1074
Federal-gov           943
Without-pay            14
Name: workclass, dtype: int64

fnlwgt 
 203488    13
123011    12
113364    12
121124    12
148995    12
          ..
344414     1
280927     1
106850     1
224277     1
145522     1
Name: fnlwgt, Length: 20263, dtype: int64

education 
 HS-grad         9840
Some-college    6678
Bachelors       5044
Masters         1627
Assoc-voc       1307
11th            1048
Assoc-acdm      1008
10th             820
7th-8th          557
Prof-school      542
9th              455
12th             377
Doctorate        375
5th-6th          288
1st-4th          151
Preschool         45
Name: education, dtype: int64

education-num 
 9     9840
10    6678
13    5044

In [23]:
y = data['salary']
data = data.drop('salary', axis=1)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [24]:
y.head()

0    <=50K
1    <=50K
2    <=50K
3    <=50K
4    <=50K
Name: salary, dtype: object

In [25]:
dt = {'<=50K': 0, '>50K': 1}
y = y.map(dt)
y.unique()

array([0, 1], dtype=int64)

In [26]:
dt1 ={'Male': 1, 'Female': 0}
data['sex'] = data['sex'].map(dt1)
data['sex'].unique()

array([1, 0], dtype=int64)

In [27]:
# Hand-written integer encoding for the 'education' labels.
# NOTE(review): some labels share a code ('12th' and 'HS-grad' -> 7,
# 'Assoc-acdm' and 'Assoc-voc' -> 11, 'Prof-school' and 'Doctorate' -> 12),
# and both 'Assoc-*' labels are ranked above 'Masters'. Confirm that this
# merging and ordering is intentional; the frame also carries an
# 'education-num' column, which presumably already encodes the level.
dt2 = {'Preschool': 0, '1st-4th': 1, '5th-6th': 2, '7th-8th': 3, '9th':4,
      '10th': 5, '11th': 6, '12th': 7, 'HS-grad': 7,
      'Some-college':8, 'Bachelors': 9, 'Masters': 10,
     'Assoc-acdm': 11, 'Prof-school': 12, 'Doctorate': 12,
      'Assoc-voc': 11}

# Replace each label with its integer code; a label missing from dt2 would become NaN.
data['education'] = data['education'].map(dt2)

In [28]:
data['education'].unique()

array([ 9,  7,  6, 10,  4,  8, 11,  3, 12,  2,  5,  0,  1], dtype=int64)

In [29]:
categorical_columns = [col for col in data.columns if data[col].dtype.name == 'object']
categorical_columns

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'native-country']

In [30]:
# One-hot encode exactly the columns listed in categorical_columns.
# The list must be passed as `columns=`: the second positional parameter of
# pd.get_dummies is `prefix`, so a positional call only renames the
# indicator columns and leaves the choice of encoded columns to dtype
# auto-detection.
new_data = pd.get_dummies(data, columns=categorical_columns)
new_data

Unnamed: 0,age,fnlwgt,education,education-num,sex,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,9,13,1,2174,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,9,13,1,0,0,13,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,7,9,1,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,6,7,1,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,9,13,0,0,0,40,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,11,12,0,0,0,38,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,7,9,1,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,7,9,0,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,7,9,1,0,0,20,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:
X_train, X_test, y_train, y_test = train_test_split(new_data, y, test_size = 0.25, random_state=17)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(22621, 88) (22621,) (7541, 88) (7541,)


In [32]:
dec_tree = DecisionTreeClassifier(max_depth = 3, random_state=17)
dec_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=17, splitter='best')

In [33]:
accuracy_score(y_test, dec_tree.predict(X_test))

0.8326481898952394

In [34]:
# Candidate tree depths 2..10, evaluated with 5-fold cross-validation.
dct = {'max_depth': np.arange(2, 11)}
search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=17),
    param_grid=dct,
    cv=5,
)

In [35]:
search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=17,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])},
             pre_dispatch='2*n_jobs', refit=True, retu

In [36]:
search.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=17, splitter='best')

In [37]:
accuracy_score(y_test, search.predict(X_test))

0.8408699111523671

In [38]:
# Re-fit a tree with the depth selected by the grid search. Reading the value
# from `search.best_params_` keeps this cell consistent with the search
# result instead of relying on a number copied by hand from its output.
tuned_tree = DecisionTreeClassifier(max_depth=search.best_params_['max_depth'],
                                    random_state=17)
tuned_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=17, splitter='best')

In [39]:
tuned_tree.score(X_test, y_test)

0.8408699111523671