In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.cluster import KMeans
import lightgbm as lgb
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score
from scipy import sparse


from math import sqrt
import statistics 

from IPython.core import display as ICD
import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

In [None]:
# Load transactions_result.csv (expected in the working directory) into a DataFrame.
transactions_result = pd.read_csv('transactions_result.csv')

In [None]:
def downcast_dtypes(df):
    '''
        Downcast the 64-bit numeric columns of `df` to 32-bit to save memory.

        Columns whose dtype is `float64` become `float32` and columns whose
        dtype is `int64` become `int32`; every other column is left as is.
        The frame passed in is modified, not copied.

        Returns the first five rows of the modified frame (`df.head()`).
    '''
    dtype_by_column = df.dtypes

    # Decide on both groups before touching the frame.
    wide_floats = [name for name, kind in dtype_by_column.items() if kind == "float64"]
    wide_ints = [name for name, kind in dtype_by_column.items() if kind == "int64"]

    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    preview = df.head()
    return preview

In [None]:
# Downcast transactions_result itself; the head() preview the function returns
# is what this cell displays.
downcast_dtypes(transactions_result)

In [None]:
# Set item_price_minmax to 0 on every row where flag_miss_price equals 1.
transactions_result.loc[transactions_result['flag_miss_price']==1, 'item_price_minmax']=0

In [None]:
# Snapshot the column names in their current order and show them.
cols = list(transactions_result.columns)
print(cols)

In [None]:
# Build the modelling column order from a fresh copy of the frame's column
# list.  The previous form (`cols = cols[...] + ...`) rewrote `cols` from
# itself, so running the cell a second time reshuffled an already reordered
# list; starting from the frame each time makes the cell idempotent.
# NOTE(review): the slices [10:13] and [11:-1] overlap, so positions 11 and 12
# (when the frame has that many columns) are selected twice and appear
# duplicated in `cols` — confirm this is intended.
source_cols = transactions_result.columns.tolist()
cols = (
    source_cols[4:5] + source_cols[3:4] + source_cols[2:3] + source_cols[1:2]
    + source_cols[10:13]
    + source_cols[0:1] + source_cols[7:8] + source_cols[9:10]
    + source_cols[11:-1]
    + source_cols[6:7]
)

In [None]:
# Select the columns listed in `cols`, in that order, as the modelling frame.
df_train = transactions_result[cols]

In [None]:
# Validation run: a single AdaBoost regressor object is re-fitted for every
# (shop, item-cluster) pair on months 13..32 and scored on month 33.
rng = np.random.RandomState(1)
regr_model = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=6),
    n_estimators=200,
    learning_rate=0.1,
    random_state=rng,
)

y_pred_list_val = []

shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

for shop_id in shop_list:
    for clus_id in clus_list:
        # Rows of this shop/cluster pair, with and without the target column.
        pair_mask = (df_train['shop_id'] == shop_id) & (df_train['cluster_item'] == clus_id)
        df_train_pred = df_train.loc[pair_mask]
        df_train_X = df_train_pred.drop(columns=['target'])

        month = df_train_pred['date_block_num']
        fit_mask = (month > 12) & (month < 33)
        val_mask = month == 33

        X_train = df_train_X.loc[fit_mask].values
        X_val = df_train_X.loc[val_mask].values
        y_train = df_train_pred.loc[fit_mask, 'target'].values
        y_val = df_train_pred.loc[val_mask, 'target'].values

        regr_model.fit(X_train, y_train)
        y_pred_shop_id_val = regr_model.predict(X_val)

        rmse = sqrt(mean_squared_error(y_val, y_pred_shop_id_val))
        print(f'Test rmse for {shop_id} AdaBoost is {rmse}')
        # Predictions are stored in (shop, cluster) iteration order.
        y_pred_list_val.append(y_pred_shop_id_val)

In [None]:
# Validation run with hyper-parameter search: for every (shop, item-cluster)
# pair a grid search over n_estimators / learning_rate is fitted on months
# 13..32, and the best estimator found is scored on month 33.
rng = np.random.RandomState(1)
regr_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                               random_state=rng)
grid = {'n_estimators':[20,30,40],
        'learning_rate':[0.5,0.2,0.1]}

y_pred_list_val = []

shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

for shop_id in shop_list:
    for clus_id in clus_list:
        pair_mask = (df_train['shop_id'] == shop_id) & (df_train['cluster_item'] == clus_id)
        df_train_pred = df_train.loc[pair_mask]
        df_train_X = df_train_pred.drop(['target'], axis=1)

        month = df_train_pred['date_block_num']
        fit_mask = (month > 12) & (month < 33)
        val_mask = month == 33

        X_train = df_train_X.loc[fit_mask].values
        X_val = df_train_X.loc[val_mask].values
        y_train = df_train_pred.loc[fit_mask, 'target'].values
        y_val = df_train_pred.loc[val_mask, 'target'].values

        gs = GridSearchCV(regr_model, grid, scoring='neg_mean_squared_error')
        gs.fit(X_train, y_train)

        # The refitted best model is an attribute of the fitted search object;
        # the bare name `best_estimator_` used before raised NameError.
        regr = gs.best_estimator_

        y_pred_shop_id_val = regr.predict(X_val)
        print(f'Test rmse for {shop_id} AdaBoost is {sqrt(mean_squared_error(y_val, y_pred_shop_id_val))}')
        # Predictions are stored in (shop, cluster) iteration order.
        y_pred_list_val.append(y_pred_shop_id_val)

In [None]:
# Attach the per-pair AdaBoost validation predictions to the rows of test.csv
# and rebuild one frame ordered by ID.
test = pd.read_csv('test.csv')
month34_clusters = transactions_result.loc[transactions_result['date_block_num']==34, 'cluster_item']
test['cluster_item'] = month34_clusters.values

val_clus_list = []

# Same (shop, cluster) order in which the predictions were appended.
shop_cluster_pairs = [(shop, clus) for shop in shop_list for clus in clus_list]
for i, (shop_id, clus_id) in enumerate(shop_cluster_pairs):
    in_pair = (test['shop_id'] == shop_id) & (test['cluster_item'] == clus_id)
    val_clus = test.loc[in_pair].copy()
    val_clus['y_pred'] = y_pred_list_val[i]
    val_clus_list.append(val_clus)

# Values below 0.5 become 0; each frame turns into a plain array here.
val_clus_list = [np.where(chunk < 0.5, 0, chunk) for chunk in val_clus_list]

stacked_val = np.vstack(val_clus_list)
y_pred_val = pd.DataFrame(stacked_val, columns = ['ID','shop_id','item_id','cluster_item','y_pred'])
y_pred_val = y_pred_val.sort_values(by=['ID'])

In [None]:
# Overall month-33 RMSE of the assembled predictions.
# NOTE(review): y_val keeps df_train's row order while y_pred_val is sorted by
# ID — this assumes the month-33 rows of df_train are already in ID order.
is_month_33 = df_train['date_block_num'] == 33
y_val = df_train.loc[is_month_33, 'target'].values
overall_rmse = sqrt(mean_squared_error(y_val, y_pred_val['y_pred']))
print('Test rmse for DT is %f' % overall_rmse)

In [None]:
# Final run: the AdaBoost regressor is re-fitted for every (shop, item-cluster)
# pair on months 13..33 and used to predict month 34.
rng = np.random.RandomState(1)
regr_model = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=20,
    random_state=rng,
)

y_pred_list = []
shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

for shop_id in shop_list:
    for clus_id in clus_list:
        pair_mask = (df_train['shop_id'] == shop_id) & (df_train['cluster_item'] == clus_id)
        df_train_pred = df_train.loc[pair_mask]
        df_train_X = df_train_pred.drop(columns=['target'])

        month = df_train_pred['date_block_num']
        fit_mask = (month > 12) & (month < 34)

        X_train = df_train_X.loc[fit_mask].values
        X_test = df_train_X.loc[month == 34].values
        y_train = df_train_pred.loc[fit_mask, 'target'].values

        regr_model.fit(X_train, y_train)
        y_pred_shop_id = regr_model.predict(X_test)
        # Predictions are stored in (shop, cluster) iteration order.
        y_pred_list.append(y_pred_shop_id)
    # Progress marker: one line per finished shop.
    print(shop_id)

In [None]:
# Attach the per-pair AdaBoost month-34 predictions to the rows of test.csv
# and rebuild one frame ordered by ID.
test = pd.read_csv('test.csv')
month34_clusters = transactions_result.loc[transactions_result['date_block_num']==34, 'cluster_item']
test['cluster_item'] = month34_clusters.values

test_clus_list = []

# Same (shop, cluster) order in which the predictions were appended.
shop_cluster_pairs = [(shop, clus) for shop in shop_list for clus in clus_list]
for i, (shop_id, clus_id) in enumerate(shop_cluster_pairs):
    in_pair = (test['shop_id'] == shop_id) & (test['cluster_item'] == clus_id)
    test_clus = test.loc[in_pair].copy()
    test_clus['y_pred'] = y_pred_list[i]
    test_clus_list.append(test_clus)

# Values below 0.5 become 0; each frame turns into a plain array here.
test_clus_list = [np.where(chunk < 0.5, 0, chunk) for chunk in test_clus_list]

stacked_test = np.vstack(test_clus_list)
y_pred = pd.DataFrame(stacked_test, columns = ['ID','shop_id','item_id','cluster_item','y_pred'])
y_pred = y_pred.sort_values(by=['ID'])

In [None]:
# One-hot encode cluster_item; every dummy column is labelled 'cluster<value>'.
# add_prefix builds the labels in one pass instead of renaming the frame in
# place once per column.
one_hot_item = pd.get_dummies(df_train['cluster_item']).add_prefix('cluster')

In [None]:
# One-hot encode item_categories; every dummy column is labelled 'cat<value>'.
# add_prefix builds the labels in one pass instead of renaming the frame in
# place once per column.
one_hot_cat = pd.get_dummies(df_train['item_categories']).add_prefix('cat')

In [None]:
# One-hot encode sub_categories; every dummy column is labelled 'sub<value>'.
# add_prefix builds the labels in one pass instead of renaming the frame in
# place once per column.
one_hot_sub = pd.get_dummies(df_train['sub_categories']).add_prefix('sub')

In [None]:
# One-hot encode city_code; every dummy column is labelled 'city<value>'.
# add_prefix builds the labels in one pass instead of renaming the frame in
# place once per column.
one_hot_city = pd.get_dummies(df_train['city_code']).add_prefix('city')

# Подготовка данных для анализа текста 
## С использованием разреженных матриц

In [None]:
# Raw item_info texts: months 13..32 for fitting, month 33 as hold-out.
text_month = transactions_result['date_block_num']
X_train_v = transactions_result.loc[(text_month > 12) & (text_month < 33), 'item_info'].values
X_test_v = transactions_result.loc[text_month == 33, 'item_info'].values

In [None]:
# TF-IDF features of the item_info text.  The vectorizer is fitted on the
# training texts only and then applied unchanged to the hold-out texts.
vectorizer = TfidfVectorizer()
X_tr_vectorizer = vectorizer.fit_transform(X_train_v)
X_tt_vectorizer = vectorizer.transform(X_test_v)

In [None]:
# Linear-model frame: df_train without the three raw categorical code columns.
df_train_lr = df_train.drop(columns = ['sub_categories','item_categories','city_code'])

In [None]:
# Tabular feature matrices for the global linear model: months 13..32 for
# fitting, month 33 as hold-out, with target and identifier columns removed.
non_feature_cols = ['target','ID','item_id','shop_id','date_block_num','cluster_item']
lr_month = df_train_lr['date_block_num']

X_train = df_train_lr.loc[(lr_month > 12) & (lr_month < 33)].drop(columns=non_feature_cols).values
X_test = df_train_lr.loc[lr_month == 33].drop(columns=non_feature_cols).values

In [None]:
# Display the shapes of the TF-IDF matrix and the tabular training matrix.
X_tr_vectorizer.shape, X_train.shape

In [None]:
# Convert the dense tabular matrices to CSR sparse matrices.
X_train = sparse.csr_matrix(X_train)
X_test = sparse.csr_matrix(X_test)

In [None]:
# Read the regression target straight from the 'target' column, using the same
# month windows as the feature matrices (13..32 for fitting, 33 for hold-out).
# The feature matrices are built with 'target' already dropped, so slicing off
# their last column (as this cell used to do) produced a feature rather than
# the target and also removed that feature from X.  X_train / X_test are
# therefore left untouched here, which also makes the cell safe to re-run.
lr_month = df_train_lr['date_block_num']
y_train = df_train_lr.loc[(lr_month > 12) & (lr_month < 33), 'target'].values
y_test = df_train_lr.loc[lr_month == 33, 'target'].values

In [None]:
# Append the TF-IDF columns to the right of the tabular feature columns.
X_train = sparse.hstack((X_train,X_tr_vectorizer))
X_test = sparse.hstack((X_test,X_tt_vectorizer))

In [None]:
def _to_dense_vector(values):
    """Return `values` as a flat 1-D NumPy array, densifying scipy sparse input first."""
    if sparse.issparse(values):
        values = values.toarray()
    return np.asarray(values).ravel()


# Flatten the targets to 1-D without hard-coding the expected row counts.
# np.resize is not sparse-aware, so a sparse column has to be densified
# explicitly; the length now simply follows the data.
y_train = _to_dense_vector(y_train)
y_test = _to_dense_vector(y_test)

In [None]:
# Ridge regression on tabular + TF-IDF features, fitted once over all shops.
# Original author's note: this takes very long to compute — the run was never
# waited out to completion.
lr = linear_model.Ridge(alpha=1)
lr.fit(X_train, y_train)
# Stored under its own name: `y_pred` is the AdaBoost month-34 prediction
# frame that the stacking cells read later and must not be overwritten here.
y_pred_lr_text = lr.predict(X_test)
# The model is not per-shop, so the message no longer interpolates a stale
# `shop_id` left over from an earlier loop.
print(f'Test rmse for Linear Regression is {sqrt(mean_squared_error(y_test, y_pred_lr_text))}')

In [None]:
# Rebuild the linear-model frame from df_train (same statement as in the
# earlier text-model section) so the one-hot columns are added to a clean frame.
df_train_lr = df_train.drop(columns = ['sub_categories','item_categories','city_code'])

In [None]:
# Append the four one-hot blocks (cluster, category, sub-category, city) as
# extra columns, aligned on the row index.
df_train_lr = pd.concat([df_train_lr,one_hot_item, one_hot_cat, one_hot_sub, one_hot_city], axis = 1)

In [None]:
# Validation run: one Ridge object is re-fitted for every (shop, item-cluster)
# pair on months 13..32 and scored on month 33.
lr = linear_model.Ridge(alpha=350)
pred_lr_list_val = []

shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

# Target and identifier columns that must not be fed to the model.
lr_drop_cols = ['target','ID','item_id','shop_id','date_block_num','cluster_item']

for shop_id in shop_list:
    for clus_id in clus_list:
        pair_mask = (df_train_lr['shop_id'] == shop_id) & (df_train_lr['cluster_item'] == clus_id)
        df_train_pred = df_train_lr.loc[pair_mask]

        month = df_train_pred['date_block_num']
        fit_rows = df_train_pred.loc[(month > 12) & (month < 33)]
        val_rows = df_train_pred.loc[month == 33]

        X_train = fit_rows.drop(columns=lr_drop_cols).values
        X_val = val_rows.drop(columns=lr_drop_cols).values
        y_train = fit_rows['target'].values
        y_val = val_rows['target'].values

        lr.fit(X_train, y_train)
        pred_lr_shop_id_val = lr.predict(X_val)

        rmse = sqrt(mean_squared_error(y_val, pred_lr_shop_id_val))
        print(f'Test rmse for {shop_id} Linear Regression is {rmse}')
        # Predictions are stored in (shop, cluster) iteration order.
        pred_lr_list_val.append(pred_lr_shop_id_val)

In [None]:
# Attach the per-pair Ridge validation predictions to the rows of test.csv
# and rebuild one frame ordered by ID.
test = pd.read_csv('test.csv')
month34_clusters = transactions_result.loc[transactions_result['date_block_num']==34, 'cluster_item']
test['cluster_item'] = month34_clusters.values

val_clus_list = []

# Same (shop, cluster) order in which the predictions were appended.
shop_cluster_pairs = [(shop, clus) for shop in shop_list for clus in clus_list]
for i, (shop_id, clus_id) in enumerate(shop_cluster_pairs):
    in_pair = (test['shop_id'] == shop_id) & (test['cluster_item'] == clus_id)
    val_clus = test.loc[in_pair].copy()
    val_clus['y_pred'] = pred_lr_list_val[i]
    val_clus_list.append(val_clus)

# Values below 0.5 become 0; each frame turns into a plain array here.
val_clus_list = [np.where(chunk < 0.5, 0, chunk) for chunk in val_clus_list]

stacked_val = np.vstack(val_clus_list)
y_pred_lr_val = pd.DataFrame(stacked_val, columns = ['ID','shop_id','item_id','cluster_item','y_pred'])
y_pred_lr_val = y_pred_lr_val.sort_values(by=['ID'])

In [None]:
# Overall month-33 RMSE of the assembled Ridge predictions.
# NOTE(review): y_val keeps df_train's row order while y_pred_lr_val is sorted
# by ID — this assumes the month-33 rows of df_train are already in ID order.
is_month_33 = df_train['date_block_num'] == 33
y_val = df_train.loc[is_month_33, 'target'].values
overall_rmse = sqrt(mean_squared_error(y_val, y_pred_lr_val['y_pred']))
print('Test rmse for LR is %f' % overall_rmse)

In [None]:
# Final run: one Ridge object is re-fitted for every (shop, item-cluster) pair
# on months 13..33 and used to predict month 34.
lr = linear_model.Ridge(alpha=350)

pred_lr_list = []

shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

# Same exclusion list as the Ridge validation loop.  'cluster_item' was missing
# here; it is constant inside each (shop, cluster) subset, so it carried no
# information and only made the two runs use different feature sets.
lr_drop_cols = ['target','ID','item_id','shop_id','date_block_num','cluster_item']

for shop_id in shop_list:
    for clus_id in clus_list:
        pair_mask = (df_train_lr['shop_id'] == shop_id) & (df_train_lr['cluster_item'] == clus_id)
        df_train_pred = df_train_lr.loc[pair_mask]

        month = df_train_pred['date_block_num']
        fit_rows = df_train_pred.loc[(month > 12) & (month < 34)]
        test_rows = df_train_pred.loc[month == 34]

        X_train = fit_rows.drop(lr_drop_cols, axis=1).values
        X_test = test_rows.drop(lr_drop_cols, axis=1).values
        y_train = fit_rows['target'].values

        lr.fit(X_train, y_train)
        pred_lr_shop_id = lr.predict(X_test)
        # Predictions are stored in (shop, cluster) iteration order.
        pred_lr_list.append(pred_lr_shop_id)
    # Progress marker: one line per finished shop.
    print(shop_id)

In [None]:
# Attach the per-pair Ridge month-34 predictions to the rows of test.csv
# and rebuild one frame ordered by ID.
test = pd.read_csv('test.csv')
month34_clusters = transactions_result.loc[transactions_result['date_block_num']==34, 'cluster_item']
test['cluster_item'] = month34_clusters.values

test_clus_list = []

# Same (shop, cluster) order in which the predictions were appended.
shop_cluster_pairs = [(shop, clus) for shop in shop_list for clus in clus_list]
for i, (shop_id, clus_id) in enumerate(shop_cluster_pairs):
    in_pair = (test['shop_id'] == shop_id) & (test['cluster_item'] == clus_id)
    test_clus = test.loc[in_pair].copy()
    test_clus['y_pred'] = pred_lr_list[i]
    test_clus_list.append(test_clus)

# Values below 0.5 become 0; each frame turns into a plain array here.
test_clus_list = [np.where(chunk < 0.5, 0, chunk) for chunk in test_clus_list]

stacked_test = np.vstack(test_clus_list)
y_pred_lr = pd.DataFrame(stacked_test, columns = ['ID','shop_id','item_id','cluster_item','y_pred'])
y_pred_lr = y_pred_lr.sort_values(by=['ID'])

In [None]:
# Validation run: one KNN regressor object is re-fitted for every
# (shop, item-cluster) pair on months 13..32 and scored on month 33.
KNN = KNeighborsRegressor(n_neighbors=3, weights='distance')
pred_KNN_list_val = []

shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

# Target, identifier and raw category columns that are not fed to the model.
knn_drop_cols = ['target','ID','item_id','shop_id','date_block_num','item_categories','cluster_item']

for shop_id in shop_list:
    for clus_id in clus_list:
        pair_mask = (df_train['shop_id'] == shop_id) & (df_train['cluster_item'] == clus_id)
        df_train_pred = df_train.loc[pair_mask]

        month = df_train_pred['date_block_num']
        fit_rows = df_train_pred.loc[(month > 12) & (month < 33)]
        val_rows = df_train_pred.loc[month == 33]

        X_train = fit_rows.drop(columns=knn_drop_cols).values
        X_val = val_rows.drop(columns=knn_drop_cols).values
        y_train = fit_rows['target'].values
        y_val = val_rows['target'].values

        KNN.fit(X_train, y_train)
        pred_KNN_shop_id_val = KNN.predict(X_val)

        rmse = sqrt(mean_squared_error(y_val, pred_KNN_shop_id_val))
        print(f'Test rmse for {shop_id} KNN is {rmse}')
        # Predictions are stored in (shop, cluster) iteration order.
        pred_KNN_list_val.append(pred_KNN_shop_id_val)

In [None]:
# Attach the per-pair KNN validation predictions to the rows of test.csv
# and rebuild one frame ordered by ID.
test = pd.read_csv('test.csv')
test['cluster_item'] = transactions_result.loc[transactions_result['date_block_num']==34]['cluster_item'].values
val_clus_list = []

# Walk the pairs in the same (shop, cluster) order in which the predictions
# were appended, so index i lines up with pred_KNN_list_val.
shop_cluster_pairs = [(shop, clus) for shop in shop_list for clus in clus_list]
for i, (shop_id, clus_id) in enumerate(shop_cluster_pairs):
    val_clus = test.loc[(test['shop_id']==shop_id)&(test['cluster_item']==clus_id)].copy()
    val_clus['y_pred'] = pred_KNN_list_val[i]
    val_clus_list.append(val_clus)

# Values below 0.5 become 0.  The loop is now bounded by val_clus_list itself;
# it used to take its length from test_clus_list, a different list built in
# another cell, which only worked while the two happened to be equally long.
for i in range(len(val_clus_list)):
    val_clus_list[i] = np.where(val_clus_list[i]<0.5, 0, val_clus_list[i])

pred_KNN_val = np.vstack(val_clus_list)
pred_KNN_val = pd.DataFrame(pred_KNN_val, columns = ['ID','shop_id','item_id','cluster_item','y_pred'])
pred_KNN_val = pred_KNN_val.sort_values(by=['ID'])

In [None]:
# Overall month-33 RMSE of the assembled KNN predictions.
# NOTE(review): y_val keeps df_train's row order while pred_KNN_val is sorted
# by ID — this assumes the month-33 rows of df_train are already in ID order.
is_month_33 = df_train['date_block_num'] == 33
y_val = df_train.loc[is_month_33, 'target'].values
overall_rmse = sqrt(mean_squared_error(y_val, pred_KNN_val['y_pred']))
print('Test rmse for KNN is %f' % overall_rmse)

In [None]:
# Final run: one KNN regressor object is re-fitted for every
# (shop, item-cluster) pair on months 13..33 and used to predict month 34.
KNN = KNeighborsRegressor(n_neighbors=3, weights='distance')
pred_KNN_list = []

shop_list = list(transactions_result['shop_id'].unique())
clus_list = list(transactions_result['cluster_item'].unique())

# Target, identifier and raw category columns that are not fed to the model.
knn_drop_cols = ['target','ID','item_id','shop_id','date_block_num','item_categories','cluster_item']

for shop_id in shop_list:
    for clus_id in clus_list:
        pair_mask = (df_train['shop_id'] == shop_id) & (df_train['cluster_item'] == clus_id)
        df_train_pred = df_train.loc[pair_mask]

        month = df_train_pred['date_block_num']
        fit_rows = df_train_pred.loc[(month > 12) & (month < 34)]
        test_rows = df_train_pred.loc[month == 34]

        X_train = fit_rows.drop(columns=knn_drop_cols).values
        X_test = test_rows.drop(columns=knn_drop_cols).values
        y_train = fit_rows['target'].values

        KNN.fit(X_train, y_train)
        pred_KNN_shop_id = KNN.predict(X_test)
        # Predictions are stored in (shop, cluster) iteration order.
        pred_KNN_list.append(pred_KNN_shop_id)
    # Progress marker: one line per finished shop.
    print(shop_id)

In [None]:
# Attach the per-pair KNN month-34 predictions to the rows of test.csv
# and rebuild one frame ordered by ID.
test = pd.read_csv('test.csv')
month34_clusters = transactions_result.loc[transactions_result['date_block_num']==34, 'cluster_item']
test['cluster_item'] = month34_clusters.values

test_clus_list = []

# Same (shop, cluster) order in which the predictions were appended.
shop_cluster_pairs = [(shop, clus) for shop in shop_list for clus in clus_list]
for i, (shop_id, clus_id) in enumerate(shop_cluster_pairs):
    in_pair = (test['shop_id'] == shop_id) & (test['cluster_item'] == clus_id)
    test_clus = test.loc[in_pair].copy()
    test_clus['y_pred'] = pred_KNN_list[i]
    test_clus_list.append(test_clus)

# Values below 0.5 become 0; each frame turns into a plain array here.
test_clus_list = [np.where(chunk < 0.5, 0, chunk) for chunk in test_clus_list]

stacked_test = np.vstack(test_clus_list)
pred_KNN = pd.DataFrame(stacked_test, columns = ['ID','shop_id','item_id','cluster_item','y_pred'])
pred_KNN = pred_KNN.sort_values(by=['ID'])

In [None]:
# Stacking inputs for month 33, in the order AdaBoost, KNN, Ridge; the true
# month-33 target (y_val) is appended as the last entry.
predictions = [y_pred_val['y_pred'].values, pred_KNN_val['y_pred'].values, y_pred_lr_val['y_pred'],y_val]

In [None]:
# One row per sample, one column per entry of the list above.
predictions = np.vstack(predictions).T

In [None]:
# Last column is the true target; the remaining columns are the base-model predictions.
y = predictions[:,-1]
X = predictions[:,:-1]

In [None]:
# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [None]:
# Materialise the folds as two tuples of index arrays (train parts, test parts).
# NOTE: this rebinds `test`, which earlier cells use for the test.csv DataFrame.
train, test = zip(*kf.split(X))

In [None]:
# Fit one linear stacking model per CV fold on the base models' month-33
# predictions and record each fold's hold-out RMSE.
n_folds = len(train)  # follows the splitter instead of a hard-coded 5
score_temp = np.empty([n_folds, 1])
y_predict = []
model_list = []

for i, (train_index, test_index) in enumerate(zip(train, test)):
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    # `normalize=True` is no longer accepted by LinearRegression in current
    # scikit-learn releases.  Ordinary least squares has no penalty term, so
    # rescaling the inputs does not change its predictions (up to
    # floating-point error) and the argument is simply dropped.
    model_list.append(linear_model.LinearRegression().fit(X_train, y_train))

    fold_pred = model_list[i].predict(X_test)
    # Values below 0.5 become 0, as for the base models.
    y_predict.append(np.where(fold_pred < 0.5, 0, fold_pred))
    score_temp[i] = sqrt(mean_squared_error(y_test, y_predict[i]))

In [None]:
# Display the per-fold hold-out RMSE values.
score_temp

In [None]:
# Stacking inputs for month 34, in the same column order as the month-33
# inputs (AdaBoost, KNN, Ridge); there is no target column here.
predictions = [y_pred['y_pred'].values, pred_KNN['y_pred'].values, y_pred_lr['y_pred']]
predictions = np.vstack(predictions).T
X = predictions

In [None]:
# Display the shape of the month-34 stacking matrix (X and predictions are the same array).
X.shape, predictions.shape

In [None]:
# Load sample_submission.csv as the template for the output files.
submission = pd.read_csv('sample_submission.csv')

In [None]:
# Write one submission file per fold model: submission_0.csv, submission_1.csv, ...
# Iterating over model_list keeps the loop in step with however many fold
# models were actually fitted, instead of assuming exactly five.
for i, fold_model in enumerate(model_list):
    pred = fold_model.predict(X)
    # Values below 0.5 become 0, as during validation.
    pred = np.where(pred<0.5, 0, pred)
    submission['item_cnt_month'] = pred
    submission.to_csv(f'submission_{i}.csv', index = False)