# Обучение на полученном наборе данных

Используем sklearn -> RandomForestRegressor

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

In [2]:
dataset_name = r"dataframe.csv"
data = pd.read_csv(dataset_name, sep=';', index_col=0)

In [3]:
data.head()

Unnamed: 0,len1,len2,add,delete,change,simCode
0,15,6,9,0,3,-1
1,15,6,9,0,4,-1
2,15,7,8,0,5,-1
3,18,8,11,1,2,-1
4,18,4,15,1,2,-1


In [4]:
data.shape

(326739, 6)

Смотрим на данные подробнее

In [5]:
data.describe()

Unnamed: 0,len1,len2,add,delete,change,simCode
count,326739.0,326739.0,326739.0,326739.0,326739.0,326739.0
mean,6.854994,7.990173,3.325253,0.272496,2.823648,0.020637
std,2.671177,2.848265,2.424509,0.576673,2.356393,0.999789
min,1.0,1.0,0.0,0.0,0.0,-1.0
25%,5.0,6.0,2.0,0.0,1.0,-1.0
50%,6.0,8.0,3.0,0.0,3.0,1.0
75%,8.0,10.0,5.0,0.0,5.0,1.0
max,27.0,27.0,24.0,5.0,16.0,1.0


Хотим предсказывать simCode

In [6]:
Y = data.simCode
X = data.drop('simCode', axis=1)

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=123, stratify=Y)

In [8]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [9]:
X_train_scaled = scaler.transform(X_train)
print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))

[  5.54535994e-17   1.10472269e-16  -7.69690524e-17   7.57322196e-17
  -6.89092522e-17]
[ 1.  1.  1.  1.  1.]


In [10]:
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled.mean(axis=0))
print(X_test_scaled.std(axis=0))

[ 0.0005142  -0.0026221  -0.00946488  0.00391167  0.00397837]
[ 0.99832678  0.99536432  0.99579238  0.9992142   0.99849507]


In [11]:
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

In [12]:
pipeline.get_params()

{'memory': None,
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'randomforestregressor__bootstrap': True,
 'randomforestregressor__criterion': 'mse',
 'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'auto',
 'randomforestregressor__max_leaf_nodes': None,
 'randomforestregressor__min_impurity_decrease': 0.0,
 'randomforestregressor__min_impurity_split': None,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__min_weight_fraction_leaf': 0.0,
 'randomforestregressor__n_estimators': 100,
 'randomforestregressor__n_jobs': 1,
 

In [13]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                    'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [14]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...mators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'sqrt'}

In [16]:
clf.refit

True

Считаем точность

In [17]:
Y_pred = clf.predict(X_test)

In [18]:
print(r2_score(Y_test, Y_pred))
print(mean_squared_error(Y_test, Y_pred))

0.72239168589
0.277490187524


Сохраняем модель

In [19]:
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

Смотрим на предсказание модели

In [20]:
clf.predict(X_test)

array([-0.83445661, -0.5184743 ,  1.        , ..., -0.75532215,
        0.37287813,  0.996502  ])