In [9]:
import pandas as pd
import numpy as np
from random import shuffle
from sklearn import tree
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn import cross_validation
from scipy.optimize import minimize_scalar
from sklearn.ensemble import BaggingRegressor
%matplotlib inline

In [2]:
def smape_loss_func(x, y):
    SymAPE = [ 2*np.abs(x-y)/(np.abs(x)+np.abs(y))] 
    return np.mean(SymAPE)

def get_binarized_data(df, categorical_columns):
    binary_df = deepcopy(df)
    for column in categorical_columns:
        binary_df = pd.concat([binary_df, pd.get_dummies(binary_df[column], prefix=column, prefix_sep=': ')], axis=1)
        del binary_df[column]
    
    return binary_df

def split_train_test(df, test_size=0.5, random_state=42, y_column_name='y'):
    X = np.asarray(df.drop(y_column_name, axis=1))
    y = np.asarray(df[y_column_name]).ravel()
    trainX, testX, trainY, testY = cross_validation.train_test_split(X, y, test_size=test_size, random_state=42)
    return trainX, testX, trainY, testY

### Обучение и Контроль

In [3]:
train = pd.read_csv('train.csv', sep=';',decimal= '.', index_col='ID')
test = pd.read_csv('test.csv', sep=';',decimal= '.', index_col='ID')

In [4]:
def apply_f_to_col_inline(df, col, f):
    df[col] = df[col].apply(f)
    
def get_mean_demand_for_value(df, column, value):
    interested_in = df[['DEMAND', column]]
    interested_in = interested_in.loc[df[column] == value]
    mean = interested_in.mean()[0]
    return 0 if np.isnan(mean) else mean

def process_bad_categorical_columns(df, columns, mean_demands):
    for column in columns:
        values = set(df[column].values)
        apply_f_to_col_inline(df, column,lambda x :
                              0 if x not in mean_demands[column].keys() else mean_demands[column][x])

In [5]:
def collect_mean_demands(data):
    data = train
    Categorical_columns = data.columns[([isinstance(z, str) for z in data.ix[0]])].append([['LOCATION_ID', 'MODEL_ID']])
    dist_val = pd.DataFrame({'NoUniqueValues': [data[z].nunique() for z in Categorical_columns]}, index = Categorical_columns)
    threshold = 7
    bad_cat_columns = dist_val.ix[dist_val['NoUniqueValues']>threshold].axes[0]
    mean_demands = dict()
    for column in bad_cat_columns:
        values = set(data[column].values)
        mean_demands[column] = dict()
        for value in values:
            mean_demands[column][value] = get_mean_demand_for_value(data, column, value)
    return mean_demands

In [11]:
def get_predict(train_data, test_data):
    cat_columns = train_data.columns[([isinstance(z, str) for z in train_data.ix[0]])]
    bad_cat_columns = ['MODEL_OS', 'STORE_PRICE_TYPE', 'STORE_CLUSTER', 'STORE_REGION',
                       'MODEL_NAME', 'MODEL_DIAG_TYPE', 'MODEL_HEIGHT', 'MODEL_WIDTH', 'MODEL_TYPE', 'STORE_OPEN_DTTM',
                       'STORE_BUILDING_TYPE', 'STORE_CITY', 'MODEL_RES_TYPE', 
                       'STORE_BRAND_ABC', 'STORE_FORMAT']
    bad_cat_columns = []
    bad_num_columns = ['STORE_ASSORTMENT', 'STORE_RES_SPECTR', 'STORE_BRAND_SPECTR', 'STORE_DIAG_SPECTR',
                       'MODEL_COUNT_SALES_DAYS']
    bad_num_columns = []
    #for col in ['STORE_BRAND_ABC', 'STORE_DIAG_ABC', 'STORE_RES_ABC']:
    #    train_data[col].fillna('C', inplace=True)
    #    train_data[col] = train_data[col].apply(lambda x: ord('C') - ord(x))
    #    test_data[col].fillna('C', inplace=True)
    #    test_data[col] = test_data[col].apply(lambda x: ord('C') - ord(x))
    
    good_cat_columns = [cat_columns[i] for i in range(len(cat_columns)) if cat_columns[i] not in bad_cat_columns]
    train_data.drop(bad_cat_columns, axis = 1, inplace=True)
    train_data.drop(bad_num_columns, axis = 1, inplace=True)
    
    #mean_demands = collect_mean_demands(train_data)
    #to_process = ['MODEL_ID']
    #process_bad_categorical_columns(train_data, to_process, mean_demands)
    #process_bad_categorical_columns(test_data, to_process, mean_demands)
    
    train_data.fillna(-100, inplace=True)
    bin_train_data = get_binarized_data(train_data, good_cat_columns)
    print ('bin_train:', bin_train_data.shape)
    test_data.drop(bad_cat_columns, axis = 1, inplace=True)
    test_data.drop(bad_num_columns, axis = 1, inplace=True)
    test_data.fillna(-100, inplace=True)
    bin_test_data = get_binarized_data(test_data, good_cat_columns)
    print ('bin_test:', bin_test_data.shape)
    # Все отсутствующие в тестовой выборке столбцы полагаем 0
    add_columns = bin_train_data.columns[1:].difference(bin_test_data.columns)
    bin_test_data = pd.merge(bin_test_data, bin_train_data.ix[bin_test_data.index][add_columns],
                             how='inner', left_index=True, right_index=True)
    bin_test_data[add_columns] = 0
    # Оставляем только столбцы обучающей выборки (целевая переменная в обучающей выборке отсутствует)
    bin_test_data = bin_test_data[bin_train_data.columns[1:]]
    bin_test_data.fillna(-10, inplace=True)
    print ('bin_test:', bin_test_data.shape)
    # Обучающая выборка
    trainX, _, trainY, _ = split_train_test(df=bin_train_data, test_size=0,  y_column_name='DEMAND')
    #reg = tree.DecisionTreeRegressor().fit(trainX, trainY)
    reg = BaggingRegressor(n_estimators=1,
                          max_samples=0.9, max_features=0.9, random_state=1234, verbose=0).fit(trainX, trainY)
    prediction = reg.predict(np.asarray(bin_test_data))
    prediction = np.minimum(np.maximum(prediction, 0.0001), 1)
    return prediction

##### Исключаем лишние произнаки, заменяем NaN

In [None]:
for i in range(10):
    train = pd.read_csv('train.csv', sep=';',decimal= '.', index_col='ID')
    test = pd.read_csv('test.csv', sep=';',decimal= '.', index_col='ID')
    msk = np.random.rand(len(train)) < 0.8
    msk[0] = True
    tr = train.iloc[msk]
    te = train.iloc[~msk]
    y = te['DEMAND'].values.ravel()
    te.drop(['DEMAND'], axis=1, inplace=True)
    test_predict = get_predict(tr, te)
    test_predict = test_predict
    print (smape_loss_func(y, test_predict))

# Сохраняем в csv-файл
#test_predict = get_predict(train, test)
#test['DEMAND'] = test_predict
#test[['DEMAND']].to_csv('.submission0.csv', sep = ',', index = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
test_predict

In [9]:
print smape_loss_func(y, test_predict)

0.734641272794


In [10]:
train = pd.read_csv('train.csv', sep=';',decimal= '.', index_col='ID')
test = pd.read_csv('test.csv', sep=';',decimal= '.', index_col='ID')
test_predict = get_predict(train, test)

bin_train: (40000, 867)
bin_test: (11296, 782)
bin_test: (11296, 866)


In [11]:
# Сохраняем в csv-файл
test['DEMAND'] = test_predict
test[['DEMAND']].to_csv('submission0.csv', sep = ',', index = True)