In [1]:
from itertools import product

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import sparse
from sklearn import linear_model
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LinearRegression

In [2]:
# Raw inputs: the customer list from customers_gender_train.csv and the full
# transaction log. Paths are relative to the notebook's working directory.
customers_gender = pd.read_csv('data/customers_gender_train.csv')
transactions = pd.read_csv('data/transactions.csv')

In [3]:
# Every customer and every MCC code that occurs anywhere in the transaction log
# (computed before any filtering of the log).
all_cuses = transactions.customer_id.unique()
all_mcc = transactions.mcc_code.unique()

# Customers that have transactions but are absent from customers_gender;
# these are the customers predictions are written for at the end.
cuses_test = list(
    set(all_cuses.tolist()).difference(customers_gender.customer_id.unique())
)

In [4]:

# Keep only rows with a negative amount, then pull the leading day counter out
# of tr_datetime (values look like "<day> HH:MM:SS") as an integer column.
transactions = transactions.loc[transactions.amount < 0].copy()
transactions['day'] = transactions['tr_datetime'].str.split().str[0].astype(int)

In [5]:
# Shift the day counter so that the latest day satisfies day % 30 == 29, i.e.
# the last 30-day bucket ends exactly on the last day of the data.
# Re-running is harmless: once aligned, the offset evaluates to 0.
day_offset = 29 - transactions['day'].max() % 30
transactions['day'] = transactions['day'] + day_offset
transactions.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id,day
0,39026145,0 10:23:26,4814,1030,-2245.92,,23
2,39026145,1 10:20:56,4829,2330,-56147.89,,24
3,39026145,1 10:39:54,5499,1010,-1392.47,,24
4,39026145,2 15:33:42,5499,1010,-920.83,,25
5,39026145,2 15:53:49,5541,1010,-14643.37,,25


In [6]:
# Bucket the aligned day counter into 30-day "months" and 365-day "years".
# Both are plain integer divisions of `day`, not calendar months/years.
transactions = transactions.assign(
    month_num=transactions['day'] // 30,
    year_num=transactions['day'] // 365,
)

In [7]:
# Bucket 15 is the prediction month; all earlier buckets are used for training.
is_prediction_month = transactions['month_num'] == 15
test_transactions = transactions[is_prediction_month]
train_transactions = transactions[transactions['month_num'] < 15]

In [8]:
# Sanity check: (rows, columns) of the month-15 slice before it is restricted
# to the customers in cuses_test.
test_transactions.shape

(422661, 9)

In [9]:
# Restrict the month-15 transactions to the customers in cuses_test.
#
# A boolean isin() mask is used instead of set_index(...).loc[cuses_test]:
# label lookup with a list raises KeyError on pandas >= 1.0 as soon as one of
# the listed customers has no row in this slice (and on older pandas it
# silently inserted all-NaN rows for them). The mask also makes the cell safe
# to re-run. Row order differs from the .loc version, but the only consumer
# aggregates with groupby, so order is irrelevant.
in_test_customers = test_transactions['customer_id'].isin(cuses_test)
test_transactions = test_transactions[in_test_customers].reset_index(drop=True)

In [10]:
# Training grid: one row per (customer, mcc, month) combination for month
# buckets 10..14, so combinations without any transaction still get a row.
grid = list(product(all_cuses, all_mcc, range(10, 15)))
train_grid = pd.DataFrame(
    grid,
    columns=['customer_id', 'mcc_code', 'month_num'],
)

In [11]:
# Prediction grid: every (test customer, mcc) pair, all in month bucket 15.
test_pairs = list(product(cuses_test, all_mcc))
test_grid = pd.DataFrame(test_pairs, columns=['customer_id', 'mcc_code']).assign(month_num=15)

In [12]:
# Sum month-15 amounts per (year, month, customer, mcc) and left-join them onto
# the prediction grid. The join uses the columns both frames share
# (month_num, customer_id, mcc_code); grid rows without transactions get 0.
# NOTE(review): fillna(0) also sets year_num to 0 on those unmatched rows.
test_monthly = (
    test_transactions
    .groupby(['year_num', 'month_num', 'customer_id', 'mcc_code'])[['amount']]
    .sum()
    .reset_index()
)
test = test_grid.merge(test_monthly, how='left').fillna(0)

In [13]:
# Monthly amount per (customer, mcc, month) for the training months, left-joined
# onto the full grid; grid rows with no transactions get amount 0.
#
# The totals are grouped by the grid keys only. Grouping by year_num as well
# split month bucket 12 (days 360-389) at day 365 into two partial rows, and
# the left merge then duplicated the matching grid rows, each carrying only
# part of that month's total.
#
# year_num is still carried along because later cells select it; where a
# month spans two year buckets the later one is kept. As before, fillna(0)
# leaves it at 0 for grid rows without transactions.
grid_keys = ['month_num', 'customer_id', 'mcc_code']
train_monthly = (
    train_transactions
    .groupby(grid_keys)
    .agg({'year_num': 'max', 'amount': 'sum'})
    .reset_index()
)
# Fix the column order so the merged frame keeps the layout
# (customer_id, mcc_code, month_num, year_num, amount).
train_monthly = train_monthly[grid_keys + ['year_num', 'amount']]
train = pd.merge(train_grid, train_monthly, on=grid_keys, how='left').fillna(0)

In [14]:
# Lag features: amount_k is the total amount of the same (customer, mcc) pair
# k month buckets earlier, for k = 1..5; 0 when there was none.
#
# Changes from the previous version of this cell:
# * The join no longer includes year_num. On train/test that column is 0 for
#   every grid row without transactions (left merge + fillna(0) upstream),
#   while a lagged row keeps the year of its *source* month, so real lags were
#   dropped whenever the two disagreed (e.g. across the day-365 boundary).
# * Totals come from train_transactions rather than from `train`, whose grid
#   starts at month 10; the lags of the first training months were therefore
#   always 0 even when earlier transactions existed.
# * Existing amount_k columns are dropped before merging, so re-running the
#   cell does not produce amount_k_x / amount_k_y duplicates.
lag_keys = ['month_num', 'customer_id', 'mcc_code']
monthly_totals = (
    train_transactions
    .groupby(lag_keys)[['amount']]
    .sum()
    .reset_index()
)

for month_shift in range(1, 6):
    lag_col = 'amount_{0}'.format(month_shift)

    # Relabel month m as m + k: joining on month_num then attaches the total
    # of month (target month - k) to each target row.
    train_shift = monthly_totals.rename(columns={'amount': lag_col})
    train_shift['month_num'] = train_shift['month_num'] + month_shift

    train = pd.merge(train.drop(lag_col, axis=1, errors='ignore'), train_shift,
                     on=lag_keys, how='left').fillna(0)
    test = pd.merge(test.drop(lag_col, axis=1, errors='ignore'), train_shift,
                    on=lag_keys, how='left').fillna(0)

In [15]:
# Hash the key columns of every training row into a 10000-column sparse
# matrix; each row is passed to the hasher as a sequence of string tokens.
#
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
# (.values works on old and new pandas alike).
#
# NOTE(review): tokens are bare values with no column prefix, so equal strings
# coming from different columns map to the same hashed feature.
# NOTE(review): year_num is 0 on grid rows that had no transactions (set by
# fillna(0) after the left merge), so it partly encodes the target.
hasher = FeatureHasher(n_features=10000, input_type='string')
train_sparse = hasher.fit_transform(
    train[['year_num', 'month_num', 'customer_id', 'mcc_code']].astype(str).values
)

In [16]:
# Hash the same key columns for the prediction rows with the same hasher, so
# train and test share one feature space.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
test_sparse = hasher.transform(
    test[['year_num', 'month_num', 'customer_id', 'mcc_code']].astype(str).values
)

In [17]:
# Append the five lag columns, transformed as log(|amount| + 1), to the right
# of the hashed features.
#
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# format='csr' makes hstack return CSR instead of its default COO, which the
# estimators would otherwise have to convert themselves.
#
# NOTE: this cell rebinds train_sparse/test_sparse from their own previous
# value; running it twice appends the lag columns twice. Re-run the two hashing
# cells first if this cell needs to be repeated.
lag_cols = ['amount_1', 'amount_2', 'amount_3', 'amount_4', 'amount_5']

train_sparse = sparse.hstack(
    [train_sparse, np.log(np.abs(train[lag_cols]) + 1).values],
    format='csr',
)

test_sparse = sparse.hstack(
    [test_sparse, np.log(np.abs(test[lag_cols]) + 1).values],
    format='csr',
)

In [None]:
# Alternative estimator: L1-regularised linear regression (Lasso, alpha=1).
# `linear_model` is imported in the notebook's import cell rather than here.
# NOTE: the following cell rebinds clf, so this estimator is only used if that
# cell is skipped.
shift = 1  # offset added to the volume before taking the log of the target
clf = linear_model.Lasso(alpha=1)

In [18]:
# Fit ordinary least squares on the log-transformed monthly volume.
# amount is <= 0 in `train` (sums of negative amounts, or 0 for empty grid
# rows), so -amount + shift >= shift and the log is defined.
shift = 1
log_volume = np.log(-train['amount'] + shift)
clf = LinearRegression(n_jobs=4)
clf.fit(train_sparse, log_volume)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=4, normalize=False)

In [None]:
# Sanity check: (rows, feature columns) of the training design matrix.
train_sparse.shape

In [None]:
# Gradient-boosted alternative to the linear model. This rebinds clf; the
# estimator is not fitted until the next cell runs.
clf = xgb.XGBRegressor(
    max_depth=4,
    learning_rate=0.025,
    n_estimators=1550,
    objective='reg:linear',
    nthread=-1,
    gamma=0,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.7,
    reg_alpha=0,
    reg_lambda=1,
    seed=7,
    missing=None,
)

In [None]:
# Fit whichever estimator clf currently refers to on the log-transformed
# monthly volume (same target as for the linear model).
log_volume = np.log(-train['amount'] + shift)
clf.fit(train_sparse, log_volume)

In [19]:
# Invert the log(volume + shift) transform that was applied to the training
# target to get predicted volumes on the original scale.
predicted_volume = np.exp(clf.predict(test_sparse)) - shift

# A regressor can predict below log(shift), which maps back to a negative
# volume. The training target (-amount) is never negative, so clip at zero.
test['volume'] = np.clip(predicted_volume, 0, None)

test[['customer_id', 'mcc_code', 'volume']].to_csv('baseline_с(6).csv', index=False)

In [None]:
# Preview the first rows of the prediction frame, including the new `volume`
# column.
test.head()

In [None]:
# Same preview as the previous cell (duplicate cell; safe to remove).
test.head()