In [2]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
# from sklearn.multiclass import OneVsRestClassifier
# from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn import grid_search
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [3]:
# считывание данных из файла
data = pd.read_excel('/home/es/PycharmProjects/final/data/oil_names.xlsx', skiprows=1, index_col=0, na_values=['n.a.'])

# Расчет показателей (по Шеремету)
# вспомогательные показатели
net_assets = data['ass_all'] - data['obl_long'] - data['current_obl'] + data['reven_futur']
F_prime = data['fix_ass'] + data['ass_cur_deb']
E_c = net_assets - F_prime
C_d = data['obl_long']
E_d = E_c + C_d
C_kk = data['current_obl_loans'] + data['current_obl_contractor'] + data['obl_short']
E_o = E_d + C_kk
E_z = data['ass_cur_stock']
dE_c = E_c - E_z
dE_d = E_d - E_z
dE_o = E_o - E_z

# основные показатели
data['dE_c'] = dE_c
data['dE_d'] = dE_d
data['dE_o'] = dE_o

In [4]:
# отнесение компании к одной из категорий по расчитанным показателям
data['group'] = pd.Series(0, index=data.index)
data['group'].loc[data[(dE_c<0) & (dE_d<0) & (dE_o<0)].index] = 2
data['group'].loc[data[(dE_c<0) & (dE_d<0) & (dE_o>=0)].index] = 2
data['group'].loc[data[(dE_c<0) & (dE_d>=0) & (dE_o>=0)].index] = 1
data['group'].loc[data[(dE_c>=0) & (dE_d>=0) & (dE_o>=0)].index] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
# отсеивание ошибочных данных
bad = (data['ass_all'] != data['passive']) | (data['obl_long'] < 0) | (data['current_obl'] < 0) | (data['reven_futur'] < 0) | (data['fix_ass'] < 0) | (data['ass_cur_deb'] < 0) | (data['current_obl_loans'] < 0) | (data['current_obl_contractor'] < 0) | (data['obl_short'] < 0) | (data['ass_cur_stock'] < 0) | (data['revenue'] <= 0)
data = data.drop(data[bad].index)
# удаление неинформативных признаков
data = data.drop('nc_ass_rev', 1)
data = data.drop('oth_oper', 1)

In [6]:
# подготовка данных
# названия столбцов ОПУ
pl_statement = ['revenue', 'net_cost', 'gross_profit', 'expen_realis', 'expen_adm', 'oper_profit', 'finoper_oth_inc', 'int_rec', 'int_pay', 'fin_oth_inc', 'exp_finoper_oth', 'prof_b_tax', 'tax_on_prof', 'unch_tax', 'def_tax_liab', 'def_tax_ass', 'other', 'prof_a_tax', 'prof_y']
X = data[pl_statement]
y = data['group']

classes = y.unique()
classes.sort(axis=0)
# разделение выборки на обучающую и проверочную
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1580)

In [None]:
# X = X.replace(np.nan, 1).replace([np.inf], 10*max(X['feature_3'])).replace([-np.inf], -10*max(X['feature_3']))
# for col in X.columns:
#     mv=X[col][abs(X[col])!=np.inf].max()
#     X[col] = X[col].replace(np.nan, 1).replace([np.inf], 10*mv).replace([-np.inf], -10*mv)

# X = pd.DataFrame(normalize(X, norm='l1', axis=0), index=X.index, columns=X.columns)

In [111]:
new_features = []
new_features.append(data['gross_profit']/data['revenue'])#+
new_features.append(data['oper_profit']/data['revenue'])#+
new_features.append(data['net_cost']/data['oper_profit'])#+
# new_features.append(data['exp_finoper_oth']/data['net_cost'])
# new_features.append(data['fin_oth_inc']/data['oper_profit'])
# new_features.append(data['int_pay']/data['int_rec'])
new_features = pd.concat(new_features, axis=1)
new_features.columns = ['feature_'+str(i) for i in range(1,len(new_features.columns)+1)]

In [112]:
# X = new_features
# # разделение выборки на обучающую и проверочную
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1580)
# for col in X.columns:
#     mv=X[col][abs(X[col])!=np.inf].max()
#     X[col] = X[col].replace(np.nan, 1).replace([np.inf], 10*mv).replace([-np.inf], -10*mv)

In [113]:
grid = {'C': np.arange(2, 16.0, 0.25)}
cv = KFold(y_train.size, n_folds=5, shuffle=True, random_state=1580)
clf = LogisticRegression(random_state=1580)
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=2)
gs.fit(X_train, y_train)
lr = gs.best_estimator_
# вычисление качества предсказания на обучающей выборке
print('accuracy (train):', accuracy_score(y_train, lr.predict(X_train)))
# предсказание
y_pred = lr.predict(X_test)
# вычисление качества предсказания на проверочной выборке
print('accuracy (test): ', accuracy_score(y_test, y_pred))
# вычисление качества отдельно для каждого класса
y_train_pred1 = lr.predict(X_train[data['group']==1])
y_train_pred2 = lr.predict(X_train[data['group']==2])
y_test_pred1 = lr.predict(X_test[data['group']==1])
y_test_pred2 = lr.predict(X_test[data['group']==2])

print('accuracy (train, class 1): ', accuracy_score(y_train[y_train==1], y_train_pred1))
print('accuracy (train, class 2): ', accuracy_score(y_train[y_train==2], y_train_pred2))
print('accuracy (test, class 1): ', accuracy_score(y_test[y_test==1], y_test_pred1))
print('accuracy (test, class 2): ', accuracy_score(y_test[y_test==2], y_test_pred2))

accuracy (train): 0.814024390244
accuracy (test):  0.682926829268
accuracy (train, class 1):  0.0
accuracy (train, class 2):  1.0
accuracy (test, class 1):  0.0
accuracy (test, class 2):  1.0




In [114]:
def wac(est, X, y):
    w1 = 1
    w2 = 1
    ac1 = accuracy_score(y[y==1], est.predict(X[y==1]))
    ac2 = accuracy_score(y[y==2], est.predict(X[y==2]))
    return (ac1**w1) * (ac2**w2)

In [122]:
grid = {'max_features': np.arange(1, len(X.columns), 1), 'max_depth':np.arange(1, 4, 1), 'min_samples_leaf':np.arange(1, 24, 3)}
cv = KFold(y_train.size, n_folds=5, shuffle=True, random_state=1580)
# создание дерева
clf = tree.DecisionTreeClassifier(random_state=1580, criterion='entropy', class_weight='balanced')
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=2)
gs.fit(X_train, y_train)

gs.best_params_

{'max_depth': 2, 'max_features': 1, 'min_samples_leaf': 10}

In [123]:
tr = gs.best_estimator_
# вычисление качества предсказания на обучающей выборке
print('accuracy (train):', accuracy_score(y_train, tr.predict(X_train)))
# предсказание с помощью обученного дерева
y_pred = tr.predict(X_test)
# вычисление качества предсказания на проверочной выборке
print('accuracy (test): ', accuracy_score(y_test, y_pred))
y_train_pred1 = tr.predict(X_train[data['group']==1])
y_train_pred2 = tr.predict(X_train[data['group']==2])
y_test_pred1 = tr.predict(X_test[data['group']==1])
y_test_pred2 = tr.predict(X_test[data['group']==2])

print('accuracy (train, class 1): ', accuracy_score(y_train[y_train==1], y_train_pred1))
print('accuracy (train, class 2): ', accuracy_score(y_train[y_train==2], y_train_pred2))
print('accuracy (test, class 1): ', accuracy_score(y_test[y_test==1], y_test_pred1))
print('accuracy (test, class 2): ', accuracy_score(y_test[y_test==2], y_test_pred2))

accuracy (train): 0.603658536585
accuracy (test):  0.585365853659
accuracy (train, class 1):  0.737704918033
accuracy (train, class 2):  0.573033707865
accuracy (test, class 1):  0.653846153846
accuracy (test, class 2):  0.553571428571




In [None]:
grid = {'n_estimators': np.hstack([np.arange(2, 20, 1), np.arange(20, 200, 10), np.arange(200, 2000, 100)]), 'max_features': np.arange(1, len(X.columns), 1), 'max_depth':np.arange(1, 10, 1), 'min_samples_leaf':np.arange(1, 24, 3)}
cv = KFold(y_train.size, n_folds=5, shuffle=True, random_state=1580)
# создание леса
clf = RandomForestClassifier(random_state=1580, criterion='entropy', class_weight='balanced')
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=2)
gs.fit(X_train, y_train)

gs.best_params_

In [16]:
grid = {'n_estimators': np.hstack([np.arange(2, 20, 2), np.arange(20, 200, 20)]), 'max_features': np.arange(1, len(X.columns), 5), 'max_depth':np.arange(2, 10, 2), 'min_samples_leaf':np.arange(3, 26, 3)}
cv = KFold(y_train.size, n_folds=5, shuffle=True, random_state=1580)
# создание леса
clf = RandomForestClassifier(random_state=1580, criterion='entropy', class_weight='balanced')
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=2)
gs.fit(X_train, y_train)

gs.best_params_

{'max_depth': 6,
 'max_features': 11,
 'min_samples_leaf': 3,
 'n_estimators': 160}

In [17]:
grid = {'n_estimators': np.hstack([np.arange(120, 200, 20), np.arange(200, 1400, 150)]), 'max_features': np.arange(1, len(X.columns), 5), 'max_depth':np.arange(2, 10, 2), 'min_samples_leaf':np.arange(3, 26, 3)}
cv = KFold(y_train.size, n_folds=5, shuffle=True, random_state=1580)
# создание леса
clf = RandomForestClassifier(random_state=1580, criterion='entropy', class_weight='balanced')
gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=2)
gs.fit(X_train, y_train)

gs.best_params_

{'max_depth': 6,
 'max_features': 11,
 'min_samples_leaf': 3,
 'n_estimators': 160}