In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import normalize
%matplotlib inline

In [2]:
# считывание данных из файла
data = pd.read_excel('/home/es/PycharmProjects/final/data/oil_names.xlsx', skiprows=1, index_col=0, na_values=['n.a.'])

# Расчет показателей (по Шеремету)
# вспомогательные показатели
net_assets = data['ass_all'] - data['obl_long'] - data['current_obl'] + data['reven_futur']
F_prime = data['fix_ass'] + data['ass_cur_deb']
E_c = net_assets - F_prime
C_d = data['obl_long']
E_d = E_c + C_d
C_kk = data['current_obl_loans'] + data['current_obl_contractor'] + data['obl_short']
E_o = E_d + C_kk
E_z = data['ass_cur_stock']
dE_c = E_c - E_z
dE_d = E_d - E_z
dE_o = E_o - E_z

# основные показатели
data['dE_c'] = dE_c
data['dE_d'] = dE_d
data['dE_o'] = dE_o

In [3]:
# отнесение компании к одной из категорий по расчитанным показателям
data['group'] = pd.Series(0, index=data.index)
data['group'].loc[data[(dE_c<0) & (dE_d<0) & (dE_o<0)].index] = 2
data['group'].loc[data[(dE_c<0) & (dE_d<0) & (dE_o>=0)].index] = 2
data['group'].loc[data[(dE_c<0) & (dE_d>=0) & (dE_o>=0)].index] = 1
data['group'].loc[data[(dE_c>=0) & (dE_d>=0) & (dE_o>=0)].index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [4]:
# отсеивание ошибочных данных
bad = (data['ass_all'] != data['passive']) | (data['obl_long'] < 0) | (data['current_obl'] < 0) | (data['reven_futur'] < 0) | (data['fix_ass'] < 0) | (data['ass_cur_deb'] < 0) | (data['current_obl_loans'] < 0) | (data['current_obl_contractor'] < 0) | (data['obl_short'] < 0) | (data['ass_cur_stock'] < 0) | (data['revenue'] <= 0)
data = data.drop(data[bad].index)
# удаление неинформативных признаков
data = data.drop('nc_ass_rev', 1)
data = data.drop('oth_oper', 1)

In [5]:
# подготовка данных
new_features = []
new_features.append(data['gross_profit']/data['revenue'])
new_features.append(data['oper_profit']/data['revenue'])
new_features.append(data['net_cost']/data['oper_profit'])
new_features.append(data['int_pay']/data['int_rec'])
new_f = pd.concat(new_features, axis=1)
new_f.columns = ['feature_'+str(i) for i in range(1,len(new_features)+1)]
# new_features = pd.concat(new_features, axis=1)
# названия столбцов ОПУ
pl_statement = ['revenue', 'net_cost', 'gross_profit', 'expen_realis', 'expen_adm', 'oper_profit', 'finoper_oth_inc', 'int_rec', 'int_pay', 'fin_oth_inc', 'exp_finoper_oth', 'prof_b_tax', 'tax_on_prof', 'unch_tax', 'def_tax_liab', 'def_tax_ass', 'other', 'prof_a_tax', 'prof_y']
X = data[pl_statement]
y = data['group']
X = pd.concat([X,new_f], axis=1)
classes = y.unique()
classes.sort(axis=0)
# разделение выборки на обучающую и проверочную
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1580)

In [6]:
# X = X.replace(np.nan, 1).replace([np.inf], 10*max(X['feature_3'])).replace([-np.inf], -10*max(X['feature_3']))
for col in X.columns:
    mv=X[col][abs(X[col])!=np.inf].max()
    X[col] = X[col].replace(np.nan, 1).replace([np.inf], 10*mv).replace([-np.inf], -10*mv)

X = pd.DataFrame(normalize(X, norm='l1', axis=0), index=X.index, columns=X.columns)

In [None]:
# rebalancing
# mm = min([len(y_train[y_train==i]) for i in classes])
# X_train_b = pd.concat([X_train[y_train==i].head(mm) for i in classes])
# y_train_b = pd.concat([y_train[y_train==i].head(mm) for i in classes])

In [None]:
# from sklearn.linear_model import LogisticRegression
# создание и обучение дерева
# X_train, y_train = X_train_b, y_train_b
clf = tree.DecisionTreeClassifier(random_state=1580, criterion='entropy', max_features=11, max_depth=3, min_samples_leaf=10)
clf.fit(X_train, y_train)
# вычисление качества предсказания на обучающей выборке
print('accuracy (train):', accuracy_score(y_train, clf.predict(X_train)))
# предсказание с помощью обученного дерева
y_pred = clf.predict(X_test)
# вычисление качества предсказания на проверочной выборке
print('accuracy (test): ', accuracy_score(y_test, y_pred))


In [None]:
from sklearn.externals.six import StringIO
import pydotplus
dot_data = StringIO()
from IPython.display import Image
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=list(X_train.columns), class_names=['#'+str(cl) for cl in classes], filled=True, rounded=True, special_characters=True)  
graphs = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graphs.create_png())

In [7]:
for col in X.columns:
    print(max(X[col]))

0.130854646372
0.149090803145
0.0985608598001
0.150217424744
0.0904741894323
0.141344916996
0.246512052341
0.558514602653
0.102952098979
0.737672225612
0.609559070988
0.422953501882
0.481593835524
0.150243527736
0.202211513211
0.382119086441
0.162005288039
0.422802101065
0.410955306584
0.00334770433925
0.000119823885234
0.119692508265
0.0155967833069
