In [1]:
import pandas as pd
import numpy as np
from random import shuffle
from sklearn import tree
from copy import deepcopy
from sklearn.ensemble import BaggingRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import cross_validation
from sklearn.dummy import DummyRegressor
from scipy.optimize import minimize_scalar
from sklearn import preprocessing
import math
import xgboost as xgb
%matplotlib inline

### Вспомогательные функции

In [2]:
def smape_loss_func(x, y):
    """Return the symmetric mean absolute percentage error (SMAPE) of x vs y.

    Every element contributes ``2 * |x - y| / (|x| + |y|)`` and the result is
    the mean of those terms.

    Parameters
    ----------
    x, y : array-like of numbers
        Values to compare. Both are converted to float arrays, so plain lists
        and integer arrays are accepted and compared position by position.

    Returns
    -------
    numpy.float64
        Mean of the per-element SMAPE terms.
    """
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    abs_error = 2.0 * np.abs(x - y)
    scale = np.abs(x) + np.abs(y)
    # scale is 0 only when both values are exactly 0, i.e. a perfect match.
    # Divide by 1 there so the term is 0 instead of 0/0 = NaN, which would
    # otherwise turn the whole mean into NaN.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return np.mean(abs_error / safe_scale)

def get_numerated_data(df, categorical_columns):
    """Return a copy of ``df`` with the given columns replaced by integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    categorical_columns : iterable of column labels
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each listed column holds the integer code of
        its category. Missing values get the code ``-1``.
    """
    simple_df = df.copy()
    for column in categorical_columns:
        # Categorical.from_array(...).labels is a removed pandas API; the
        # constructor plus .codes yields the same integer encoding.
        simple_df[column] = pd.Categorical(simple_df[column]).codes

    return simple_df

def get_binarized_data(df, categorical_columns):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Each listed column is removed and its indicator columns, named
    ``"<column>: <value>"``, are appended on the right. The input frame is
    left untouched.
    """
    result = deepcopy(df)
    for name in categorical_columns:
        indicators = pd.get_dummies(result[name], prefix=name, prefix_sep=': ')
        # Remove the source column and append its indicators in one step.
        result = pd.concat([result.drop(name, axis=1), indicators], axis=1)

    return result

def split_to_numpy(df, y_column_name):
    """Separate a frame into float32 feature and target arrays.

    Returns a 3-tuple: the feature matrix (every column except
    ``y_column_name``) as float32, the target column as float32, and the
    labels of the feature columns.
    """
    features = df.drop(y_column_name, axis=1)
    target = df[y_column_name]

    return (
        features.values.astype(np.float32),
        target.values.astype(np.float32),
        features.columns,
    )

def split_train_test(df, test_size=0.5, random_state=42, y_column_name='y'):
    """Randomly split a frame into train/test feature and target arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the target column.
    test_size : float or int
        Passed through to ``train_test_split``.
    random_state : int
        Seed for the shuffle, passed through to ``train_test_split``.
    y_column_name : column label
        Name of the target column.

    Returns
    -------
    tuple
        ``(trainX, testX, trainY, testY)`` as NumPy arrays.

    NOTE(review): ``sklearn.cross_validation`` is the legacy home of
    ``train_test_split``; newer scikit-learn exposes it from
    ``sklearn.model_selection``.
    """
    X = np.asarray(df.drop(y_column_name, axis=1))
    y = np.asarray(df[y_column_name]).ravel()
    # Forward the caller's seed. It used to be hard-coded to 42 here, which
    # silently ignored the random_state argument.
    trainX, testX, trainY, testY = cross_validation.train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return trainX, testX, trainY, testY

## Другие модели и генерация дополнительных признаков (2 балла)

In [3]:
def get_results(reg, acc=smape_loss_func, min_max_filter=True, max_val=1, min_val=0.0001, data=None):
    """Fit ``reg`` and report its loss on the train and test parts.

    Parameters
    ----------
    reg : object with ``fit(X, y)`` and ``predict(X)``
        Regressor to evaluate; it is fitted in place.
    acc : callable
        Loss called as ``acc(y_true, y_pred)``.
    min_max_filter : bool
        When true, predictions are clamped to ``[min_val, max_val]`` before
        scoring.
    max_val, min_val : number
        Bounds used by the clamp.
    data : tuple, optional
        ``(trainX, testX, trainY, testY)``, in the order returned by
        ``split_train_test``. When omitted, the notebook-level variables
        ``trainX``, ``testX``, ``trainY`` and ``testY`` are used.

    Returns
    -------
    tuple
        ``(train_loss, test_loss)``. The test loss is also printed.
    """
    if data is None:
        # Backward-compatible default: use the split stored in notebook globals.
        data = (trainX, testX, trainY, testY)
    train_x, test_x, train_y, test_y = data

    reg.fit(train_x, train_y)
    train_pred = reg.predict(train_x)
    test_pred = reg.predict(test_x)
    if min_max_filter:
        # Clamp both prediction sets so the train and test losses describe the
        # same post-processed model and stay comparable.
        train_pred = np.minimum(np.maximum(train_pred, min_val), max_val)
        test_pred = np.minimum(np.maximum(test_pred, min_val), max_val)
    results = (acc(train_y, train_pred), acc(test_y, test_pred))
    print("Loss: " + str(results[1]))
    return results

In [4]:
def prepare_data(data, scalers=None):
    """Turn a raw train/test frame into a frame suitable for model fitting.

    Steps, in order:

    1. Drop the columns listed as redundant and the ``MODEL_ID`` column.
    2. Encode the ``STORE_*_ABC`` grades as integers: a missing grade is
       treated as ``'C'``, then each grade becomes ``ord('C') - ord(grade)``
       (``'C'`` -> 0, ``'B'`` -> 1, ``'A'`` -> 2).
    3. One-hot encode every remaining column that holds text.
    4. Fill missing ``MODEL_MEAN_SALES`` with -100 and every other missing
       value with its column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; it is not modified.
    scalers : optional
        Currently unused; kept so the signature stays compatible.

    Returns
    -------
    pandas.DataFrame
        New, preprocessed frame.
    """
    redundant_columns = ['MODEL_NAME', 'MODEL_COUNT_SALES_DAYS', 'MODEL_DIAG_TYPE', 'MODEL_HEIGHT',
                         'MODEL_WIDTH', 'STORE_OPEN_DTTM', 'MODEL_BRAND', 'MODEL_OS', 'MODEL_TYPE']
    bad_cat_columns = ['MODEL_ID']
    # drop() without inplace returns a new frame, so the caller's frame is
    # never mutated by the steps below.
    data = data.drop(redundant_columns + bad_cat_columns, axis=1)

    for col in ['STORE_BRAND_ABC', 'STORE_DIAG_ABC', 'STORE_RES_ABC']:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection, which is not guaranteed to write through.
        data[col] = data[col].fillna('C').apply(lambda grade: ord('C') - ord(grade))

    # A column is categorical when it holds at least one text value. Checking
    # every row (not only the first) keeps detection independent of row order,
    # of the index labels, and of a NaN in the first row.
    categorical_columns = [
        col for col in data.columns
        if data[col].map(lambda value: isinstance(value, str)).any()
    ]
    data = get_binarized_data(data, categorical_columns)

    data['MODEL_MEAN_SALES'] = data['MODEL_MEAN_SALES'].fillna(-100)
    return data.fillna(data.mean())

In [5]:
# Read the semicolon-separated training file (rows keyed by ID) and preprocess it.
train_data = prepare_data(
    pd.read_csv('train.csv', index_col='ID', sep=';')
)

In [7]:
# Shape check: (rows, columns) of the prepared training frame.
train_data.shape

(40000, 116)

In [8]:
# Read the semicolon-separated test file (rows keyed by ID) and preprocess it
# with the same pipeline as the training data.
test_data = prepare_data(
    pd.read_csv('test.csv', index_col='ID', sep=';')
)

In [8]:
# Shape check for the prepared test frame. The call form of print matches the
# rest of the notebook and is valid on both Python 2 and Python 3.
print(test_data.shape)

(11296, 115)


In [9]:
# Hold out 15% of the prepared training rows for local evaluation.
trainX, testX, trainY, testY = split_train_test(
    df=train_data,
    y_column_name='DEMAND',
    test_size=0.15,
)

In [None]:
# `test` was never defined in this notebook (NameError on Run All); the
# prepared test frame is `test_data`. DEMAND is dropped only if it is actually
# present, because the unlabeled test set may not carry the target column.
# NOTE(review): this rebinds testX from the hold-out split to the submission
# features - confirm that is intended.
testX = test_data.drop(test_data.columns.intersection(['DEMAND']), axis=1)

In [None]:
import xgboost as xgb

In [9]:
# Final model: refit on the entire prepared training set (no hold-out).
# This rebinds trainX/trainY, replacing any earlier hold-out split.
trainY = train_data['DEMAND']
trainX = train_data.drop(['DEMAND'], axis=1)
# Fix the seed: without it the tree may pick different splits between runs
# when candidates tie, so the submission would not be reproducible.
model = tree.DecisionTreeRegressor(random_state=42).fit(trainX, trainY)

In [10]:
# Train and test were one-hot encoded independently, so their indicator
# columns can differ in set or order. Present the test features in exactly the
# layout the model was fitted on: indicators missing from the test set become
# 0, and columns the model has never seen are dropped.
feature_columns = train_data.drop(['DEMAND'], axis=1).columns
pred = pd.DataFrame(
    model.predict(test_data.reindex(columns=feature_columns, fill_value=0)),
    index=test_data.index,
    columns=['DEMAND'],
)

In [11]:
# Write the predictions, index included, as a comma-separated submission file.
pred.to_csv(
    'submission3.csv',
    index=True,
    sep=',',
)