In [None]:
import time
import gc
import re
import csv
import string
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from scipy.sparse import hstack

import lightgbm as lgb
import xgboost as xgb

from contextlib import contextmanager


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from nltk.corpus import stopwords
import warnings

warnings.filterwarnings('ignore')
russian_stop = set(stopwords.words('russian'))


Path = "/home/dsxuser/.kaggle/competitions/avito-demand-prediction/"
# Path = "../input/"

target = 'deal_probability'
index = 'item_id'
FNAS = ''
V_SPLT = .2
encode = True
with open('/home/dsxuser/.kaggle/datasets/sophieg/wikipedia-language-iso639/unicode_vars.pkl', 'rb') as f:
    EMOJI, C_ALPHA = pickle.load(f)
EMOJI = "".join(EMOJI)
C_ALPHA = "".join(C_ALPHA)
PUNC = re.escape(string.punctuation)
NUM = string.digits
A_ALPHA = string.ascii_letters
WS = ' '
templ = '(?P<la>[{0}])(?P<not>[^ ])'
tempr = '(?P<not>[^ ])(?P<ra>[{0}])'
lrepl = r'\g<la> \g<not>'
rrepl = r'\g<not> \g<ra>'
lalpha = r'(?P<la>(?:[{0}{1}]+))(?P<not>[^ {0}{1}])'.format(A_ALPHA, C_ALPHA)
ralpha = r'(?P<not>[^ {0}{1}])(?P<ra>(?:[{0}{1}]+))'.format(A_ALPHA, C_ALPHA)
not_valid_char = u'[^{}{}{}{}{}{}]'.format(
    A_ALPHA, C_ALPHA, NUM, PUNC, EMOJI, WS)
repl_dict = {
    # if a Punctuation sequence is not followed by whitespace
    # add it
    # punct on the left
    templ.format(PUNC): lrepl,
    # if a Punctuation sequence is not preceeded by whitespace
    # add it
    # punct on the right
    tempr.format(PUNC): rrepl,
    # if a number sequence is not followed by whitespace
    # add it
    # nums on the left
    templ.format(NUM): lrepl,
    # if a number sequence is not preceeded by whitespace
    # add it
    # nums on the right
    tempr.format(NUM): rrepl,
    # if a number sequence is not followed by whitespace
    # add it
    # emoji on the left
    templ.format(EMOJI): lrepl,
    # if an emoji sequence is not preceeded by whitespace
    # add it
    # emoji on the right
    tempr.format(EMOJI): rrepl,
    # any sequence of alpha chars should end with a white space
    # alpha on the left
    lalpha: lrepl,
    # alpha on the right
    ralpha: rrepl,
    # remove extra whitespace
    r'\s+': ' '
}
text_cols = ['description', 'title',
             'param_1', 'param_2', 'param_3']

files = ['train_active.csv.zip',
         'test_active.csv.zip',
         'train.csv.zip',
         'test.csv.zip']


@contextmanager
def timer(name, verbose=True):
    t0 = time.time()
    yield
    if verbose:
        t1 = time.time()
        m = int((t1 - t0) // 60)
        s = int((t1 - t0) % 60)
        tm = time.strftime("%H:%M")
        print('{} [{}] done in {} m,{} s\n{}'.format(tm,name,m, s,"="*90))


data = {}
dtypes = {**{index: str}, **dict(zip(text_cols, [str] * len(text_cols)))}
for j, fn in enumerate(files):
    with timer('Reading {} data>'.format(fn)):
        f = '{}{}'.format(Path, fn)
        print(f)
        if fn in ['train.csv.zip', 'test.csv.zip']:
            data[fn] = pd.read_csv(f, usecols=[index, *text_cols],
                                   dtype=dtypes,
                                   delimiter=',',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL,
                                   na_filter=False,
                                   index_col=index)
        else:
            data[fn] = pd.read_csv(f, usecols=text_cols,
                                   dtype=dtypes,
                                   delimiter=',',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL,
                                   na_filter=False)
    for c in text_cols:
        with timer('Processing {} column glove_data/{}'.format(fn, c)):
            pass
        chunk_size = 4000000
        iters = int(np.ceil(data[fn].shape[0] / chunk_size))
        print('chunks total', iters)
        for i in range(iters):
            with timer('processing column {} {} chunk - {} / {}'.format(c,fn,i+1,iters)):
                start = i * chunk_size
                end = (i + 1) * chunk_size
                res = data[fn][c].iloc[start: end].str.split(
                    not_valid_char).str.join(' ').replace(
                    repl_dict,
                    regex=True)
            with timer('Writing column {} {} chunk - {} / {}'.format(c,fn,i+1,iters)):
                with open('pre_tfidf/{}_agg.txt'.format(c), mode='a',
                          newline='\n') as f:
                    f.write('   '.join(res.values))

                if fn in ['train.csv.zip', 'test.csv.zip']:
                    res.to_csv('pre_tfidf/{}_{}'.format(c,fn), index=True,
                               header=True, mode='a')
                del res
                gc.collect()
    del data[fn]
    gc.collect()


/home/dsxuser/.kaggle/competitions/avito-demand-prediction/train_active.csv.zip
07:06 [Reading train_active.csv.zip data>] done in 3 m,7 s
07:06 [Processing train_active.csv.zip column glove_data/description] done in 0 m,0 s
chunks total 4


In [None]:
!mkdir preprocessing

In [55]:
# setup
# !pip install kaggle
# s ='{"username":"sophieg","key":"10bad49b19d9ae80a9c0d7369d58f140"}'
# open('/home/dsxuser/.kaggle/kaggle.json','w').write(s)
# !chmod 600 /home/dsxuser/.kaggle/kaggle.json
# import nltk
# nltk.download('stopwords')
# !kaggle competitions download -c avito-demand-prediction -f train.csv.zip
# !kaggle competitions download -c avito-demand-prediction -f test.csv.zip
# !kaggle competitions download -c avito-demand-prediction -f train_active.csv.zip
# !kaggle competitions download -c avito-demand-prediction -f test_active.csv.zip
# !kaggle competitions download -c avito-demand-prediction -f periods_train.csv.zip
# !kaggle competitions download -c avito-demand-prediction -f periods_test.csv.zip
# !kaggle competitions download -c avito-demand-prediction -f sample_submission.csv
# !kaggle datasets download -d sophieg/wikipedia-language-iso639 -f unicode_vars.pkl


Requirement not upgraded as not directly required: kaggle in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: urllib3>=1.15 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from kaggle)
Requirement not upgraded as not directly required: six>=1.10 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from kaggle)
Requirement not upgraded as not directly required: python-dateutil in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from kaggle)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from kaggle)
Requirement not upgraded as not directly required: certifi in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from kaggle)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests->kaggle)
Requirement not upgraded as not directly re

In [None]:
# # ***********************text start****************************


cols = ['description', 'title',
        'param_1', 'param_2', 'param_3']
for i, c in enumerate(cols):
    print(c)
    fn = f'pre_tfidf/{c}_agg.txt'
    tk = TfidfVectorizer(tokenizer=lambda x: x.split(),
                         ngram_range=(1, 2),
                         norm='l2',
                         smooth_idf=True,
                         binary=False,
                         token_pattern=r'(?u)\b\S+\b',
                         lowercase=True,
                         strip_accents=None)
    # with timer(f'reading agg file {fn} > {c} column'):
    #     text = pd.read_csv(fn,
    #                        delimiter=',',
    #                        quotechar='"',
    #                        quoting=csv.QUOTE_MINIMAL,
    #                        na_filter=False,
    #                        squeeze=True)
    with timer(f'fitting tfidf vectoriser> {c} column'):
        with open(fn, 'r') as f:
            tk.fit(f)
    with timer(f'Transforming train via tfidf vectoriser> {c} column'):
        train = pd.read_csv(f'pre_tfidf/{c}_train.csv', index_col=0,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL,
                            na_filter=False)
        x = tk.transform(train).astype(np.float32)
        del train
        gc.collect()
    with timer(f'adding up train word vector> {c} column'):
        css = ['{}_{}'.format(c, w.replace(
            '<', 'less_than').replace(
            '[', 'open_bracket').replace(
            ']', 'close_bracket'))for w in tk.get_feature_names()]
        if i == 0:
            Xt = x
            txt_cols = css
            idf = tk.idf_
        else:
            Xt = hstack([Xt, x])
            txt_cols = [*txt_cols, *css]
            idf = hstack([idf, tk.idf_])
        txt_types = dict(zip(txt_cols, ['float'] * len(txt_cols)))
        del x
        gc.collect()
    with timer(f'Transforming test via tfidf  vectoriser> {c} column'):
        test = pd.read_csv(f'pre_tfidf/{c}_test.csv', index_col=0,
                           delimiter=',',
                           quotechar='"',
                           quoting=csv.QUOTE_MINIMAL,
                           na_filter=False)
        x = tk.transform(test).astype(np.float32)
        del test
        gc.collect()
    with timer(f'adding up test word vector> {c} column'):
        if i == 0:
            Xte = x
        else:
            Xte = hstack([Xte, x])
        del x
        gc.collect()
    del tk
    gc.collect()
np.save('Xt', Xt)
np.save('Xte', Xte)
del Xt, Xte
gc.collect()
# ***********************text ends****************************

In [None]:

# aggregate period info*************************
cols = ['item_id', 'date_from', 'date_to']
with timer('Reading train raw data /aggregated data'):
    train_periods = pd.read_csv(f'{Path}periods_train.csv',
                                parse_dates=['date_from', 'date_to'],
                                engine='c',
                                usecols=cols)
with timer('Reading test raw data/aggregated data'):
    test_periods = pd.read_csv(f'{Path}periods_test.csv',
                               parse_dates=['date_from', 'date_to'],
                               engine='c',
                               usecols=cols)
with timer('Concatenating all data'):
    p = train_periods.append(test_periods)
del train_periods, test_periods
gc.collect()
with timer('Calculating the up time'):
    conv_fact = 1. / ((10 ** 9) * 60. * 60. * 24.)  # convert msec to days
    p['duration'] = (p[['date_from', 'date_to']].astype(int).diff(
        axis=1)['date_to'] * conv_fact).astype(np.int32)
    p.drop(cols[1:], axis=1, inplace=True)
    gc.collect()
with timer('Aggregating period data'):
    grs = p.groupby('item_id')
    periods = grs['duration'].agg(
        ['sum',
         'count',
         'std']).rename(columns={'sum': 'total_up_time',
                                 'count': 'ad-freq',
                                 'std': 'spread'}).reset_index()
#     periods['spread'].fillna(0,inplace=True)
    p = p.merge(periods, on='item_id', how='left')
del periods, grs
gc.collect()

# aggregate period info end*************************
# ***************** agg floats ended **************************************
cats_dict = {'user_id': 'user',
             'parent_category_name': 'pcat',
             'category_name': 'cat'}
cols = ['total_up_time', 'ad-freq', 'spread']
for i, (g_col, g_col_ab) in enumerate(cats_dict.items()):
    used_cols = ['item_id', g_col]
    with timer(f'Reading data/aggregated data {g_col}'):
        train = pd.read_csv(f'{Path}train.csv',
                            usecols=used_cols,
                            engine='c')
        train_active = pd.read_csv(f'{Path}train_active.csv',
                                   usecols=used_cols,
                                   engine='c')
        test = pd.read_csv(f'{Path}test.csv',
                           usecols=used_cols,
                           engine='c')
        test_active = pd.read_csv(f'{Path}/test_active.csv',
                                  usecols=used_cols,
                                  engine='c')
        d = train.append(test).append(train_active).append(
            test_active).drop_duplicates(['item_id'])
        del train_active, test_active, test, train
        gc.collect()
    with timer(f'Collating aggregated data {g_col}'):
        df = p.merge(d, on='item_id', how='left')
        del d
        gc.collect()
        grs = df.groupby(g_col)
    with timer(f'Collating {g_col} stats'):
        agg = pd.concat([grs[cols].mean().rename(
            columns=lambda x: f'avg_{g_col_ab}_{x}'),
            grs[cols].median().rename(
            columns=lambda x: f'median_{g_col_ab}_{x}'),
            grs[cols].max().rename(
            columns=lambda x: f'max_{g_col_ab}_{x}'),
            grs[cols].min().rename(
            columns=lambda x: f'min_{g_col_ab}_{x}'),
            grs[cols].var().rename(
            columns=lambda x: f'spread_{g_col_ab}_{x}'),
            grs['item_id'].count().to_frame().rename(
            columns=lambda x: f'cnt_{g_col_ab}_{x}')],
            axis=1).reset_index()

    del grs, df
    gc.collect()
    with timer(f'Substituting {g_col} into data'):
        train = pd.read_csv(f'{Path}train.csv',
                            usecols=used_cols,
                            engine='c')
        r = train.merge(agg, on=g_col, how='left').set_index('item_id')
        r.drop(g_col, axis=1, inplace=True)
        if i == 0:
            X = r
        else:
            X = X.join(r)
        del r, train
        gc.collect()
        test = pd.read_csv(f'{Path}test.csv',
                           usecols=used_cols,
                           engine='c')
        r = test.merge(agg, on=g_col, how='left').set_index('item_id')
        r.drop(g_col, axis=1, inplace=True)
        r = r[r.columns[r.describe().loc['std'] > 0]]
        if i == 0:
            Xe = r
        else:
            Xe = Xe.join(r)
        del test, agg, r
        gc.collect()
flt_types = X.dtypes.str.replace(r'^int.*$', 'q').str.replace(
    r'^float.*$', 'float').todict()
# ***************** agg floats ended **************************************



In [None]:

# # ***********************start floats****************************
text_cols = ['title', 'description',
             'param_1', 'param_2', 'param_3']


cols = [index, 'price', *text_cols, target]

with timer('Reading raw data/ float columns'):
    train = pd.read_csv(f'{Path}/train.csv',
                        engine='c',
                        index_col=index,
                        usecols=cols)
    test = pd.read_csv(f'{Path}/test.csv',
                       engine='c',
                       index_col=index,
                       usecols=cols[:-1])
y = train[target]
train.drop(target, axis=1, inplace=True)
gc.collect()
with timer('starting text stats'):
    d = train.append(test)

with timer('total length stat'):
    df = d[text_cols].applymap(
        lambda x: len(x) if type(x) == str else x).astype(np.float32).rename(
        columns=lambda x: f"len_{x}")

with timer('word length stat'):
    r = d[text_cols].applymap(
        lambda x: len(x.split())
        if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"word_len_{x}")
    df = df.join(r)
with timer('unique word length stat'):
    r = d[text_cols].applymap(
        lambda x: len(list(set(x.split())))
        if type(x) == str else x).astype(np.float32).rename(
        columns=lambda x: f"unq_wrd_len_{x}")
    df = df.join(r)
with timer('number of new lines stat'):
    r = d[text_cols].applymap(
        lambda x: len(re.findall(r'\n', x))
        if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"new_lines_{x}")
    r = r[r.columns[r.describe().loc['std'] > 0]]
    df = df.join(r)
with timer('number of white spaces stat'):
    r = d[text_cols].applymap(
        lambda x: len(re.findall(r'\s', x))
        if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"white_space_{x}")
    r = r[r.columns[r.describe().loc['std'] > 0]]
    df = df.join(r)
with timer('number of caps stat'):
    r = d[text_cols].applymap(
        lambda x: len(re.findall(r'[A-ZА-Я]', x))
        if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"caps_{x}")
    r = r[r.columns[r.describe().loc['std'] > 0]]
    df = df.join(r)
with timer('number of smalls stat'):
    r = d[text_cols].applymap(
        lambda x: len(re.findall(r'[a-zа-я]', x))
        if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"smalls_{x}")
    r = r[r.columns[r.describe().loc['std'] > 0]]
    df = df.join(r)

with timer('number of digits stat'):
    r = d[text_cols].applymap(
        lambda x: len(re.findall(r'[0-9]', x))
        if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"dig_{x}")
    r = r[r.columns[r.describe().loc['std'] > 0]]
    df = df.join(r)
with timer('number of punctuation stat'):
    r = d[text_cols].applymap(
        lambda x: len(re.findall(fr'[{re.escape(string.punctuation)}]',
                                 x)) if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"punct_{x}")
    r = r[r.columns[r.describe().loc['std'] > 0]]
    df = df.join(r)

with timer('number of emoji stat'):
    r = d[text_cols].applymap(
        lambda x: len(re.findall(u'[\U00010000-\U000fffff]',
                                 x)) if type(x) == str else x).astype(
        np.float32).rename(columns=lambda x: f"emoji_{x}")
    r = r[r.columns[r.describe().loc['std'] > 0]]
    df = df.join(r)
ft = dict(zip(df.columns.tolist(), ['q'] * df.shape[1]))
ft['price'] = 'float'
flt_types = {**flt_types, **ft}
df['price'] = d['price'].astype(np.float32)
del d, r, ft
gc.collect()
X = X.join(df.loc[train.index])
Xe = Xe.join(df.loc[test.index])

flt_cols = X.columns.tolist()
flt_types = dict(zip(flt_cols, ['q'] * len(flt_cols)))

del df, train, test
gc.collect()
Xt = np.load('Xt.npy').item(0)
Xt = hstack([X.values, Xt])
np.save('Xt', Xt)
del X, Xt
gc.collect()
Xte = np.load('Xte.npy').item(0)
Xte = hstack([Xe.values, Xte])
np.save('Xte', Xte)
del Xe, Xte
gc.collect()

# # ***********************end floats****************************




In [None]:

# ***********************cat encoded starts****************************
cat_cols = ['region', 'city',
            'parent_category_name', 'category_name',
            'user_type', 'image_top_1',
            'item_seq_number', 'user_id']
null_cols = ['price', 'description', 'param_1',
             'param_2', 'param_3', 'image']

cols = [index, *cat_cols,
        *null_cols, 'activation_date']
with timer('Reading raw data > cat columns'):
    train = pd.read_csv(f'{Path}/train.csv',
                        parse_dates=['activation_date'],
                        engine='c',
                        usecols=cols,
                        index_col=index)
    test = pd.read_csv(f'{Path}/test.csv',
                       parse_dates=['activation_date'],
                       usecols=cols,
                       engine='c',
                       index_col=index)

train['city'] = train['region'] + '_' + train['city']
test['city'] = test['region'] + '_' + test['city']

with timer('Creating isnull columns'):
    cs = ['image_top_1', *null_cols]
    for c in cs:
        hc = f'null_{c}'
        train[hc] = hc + '_' + train[c].isnull().astype(int).astype(str)
        test[hc] = hc + '_' + test[c].isnull().astype(int).astype(str)
        null_cols.append(hc)
train.drop(null_cols, axis=1, inplace=True)
test.drop(null_cols, axis=1, inplace=True)
gc.collect()
with timer('Creating date statistics'):
    train['day'] = 'day_' + train["activation_date"].dt.day.astype(str)
    test['day'] = 'day_' + test["activation_date"].dt.day.astype(str)

    train['wday'] = 'wday_' + train["activation_date"].dt.weekday.astype(str)
    test['wday'] = 'wday_' + test["activation_date"].dt.weekday.astype(str)

    train['activation_date'] = 'yday_' + \
        train["activation_date"].dt.dayofyear.astype(str)
    test['activation_date'] = 'yday_' + \
        test["activation_date"].dt.dayofyear.astype(str)
date_cols = ['day', 'wday', 'activation_date']

for c in cat_cols:
    with timer(f'Preparing {c}'):
        if c == 'image_top_1':
            train[c] = f'{c}_' + train[c].fillna(FNAS).apply(
                lambda x: str(int(x)) if type(x) != str else x)
            test[c] = f'{c}_' + test[c].fillna(FNAS).apply(
                lambda x: str(int(x)) if type(x) != str else x)
        else:
            train[c] = f'{c}_' + train[c].fillna(FNAS).astype(str)
            test[c] = f'{c}_' + test[c].fillna(FNAS).astype(str)


datac = train.append(test)
with timer(f'Fitting all cat data with label encoder'):
    les = LabelEncoder().fit(datac.unstack())
    categories = dict(zip(les.classes_, range(len(les.classes_))))
    for c in datac.columns:
        with timer(f'transforming {c} with label encoder'):
            datac[c] = les.transform(datac[c])
if encode:
    with timer(f'Fitting hotencoder'):
        enc = OneHotEncoder(dtype=bool)
        enc.fit(datac)
        del datac
        gc.collect()
    with timer(f'Transforming train vian hotencoder'):
        x = enc.transform(train.apply(lambda col: les.transform(col)))
        cat_cols = train.columns.tolist()
        cat_type = dict(zip(cat_cols, ['i'] * len(cat_cols)))
        del train
        gc.collect()
        Xt = np.load('Xt.npy').item(0)
        Xt = hstack([Xt, x]).tocsr()
        np.save('Xt', Xt)
        del x, Xt
        gc.collect()
    with timer(f'Transforming test vian hotencoder'):
        x = enc.transform(test.apply(lambda col: les.transform(col)))
        del test
        gc.collect()
        Xte = np.load('Xte.npy').item(0)
        Xte = hstack([Xte, x]).tocsr()
        np.save('Xte', Xte)
        del x, Xte
        gc.collect()
else:
    with timer(f'Transforming train via label encoder'):
        train = train.apply(lambda col: les.transform(col).astype(np.uint32))
        cat_cols = train.columns.tolist()
        cat_types = dict(zip(cat_cols, ['i'] * len(cat_cols)))
        Xt = np.load('Xt.npy').item(0)
        Xt = hstack([Xt, train.values]).tocsr()
        np.save('Xt', Xt)
        del train, Xt
        gc.collect()
    with timer(f'Transforming test via label encoder'):
        test = test.apply(lambda col: les.transform(col).astype(np.uint32))
        Xte = np.load('Xte.npy').item(0)
        Xte = hstack([Xte, test.values]).tocsr()
        np.save('Xte', Xte)
        del test, Xte
        gc.collect()
# # ***********************cat ends****************************


In [None]:
dtypes = {**flt_types, **cat_types, **txt_types}
cols = [*flt_cols, *cat_cols, *txt_cols]


# # ***********************save starts***************************

dct = {'flt_cols': flt_cols,
       'cat_cols': cat_cols,
       'txt_cols': txt_cols,
       'flt_types': flt_types,
       'cat_types': cat_types,
       'txt_types': txt_types,
       'dtypes': dtypes,
       'cols': cols,
       'idf': idf,
       'y': y}
del flt_cols, cat_cols, txt_cols
del flt_types, cat_types, txt_types
del dtypes, cols, idf, y
gc.collect()

with open(f'dct_enc{int(encode)}.pkl', mode='wb') as f:
    pickle.dump(dct, f)
    f.close()
del dct
gc.collect()
# # ***********************save ends***************************

In [None]:
# # ******************load data starts***************************
with open('col_type.pkl', mode='rb') as f:
    dct = pickle.load(f)
    f.close()

Xt = np.load(f'Xt.npy').item(0)
Xte = np.load(f'Xte.npy').item(0)
# # ***********************load data  ends***************************


# ********************* prepare data for xgb training**************
with timer('Preparing data for training'):
    splt_index = list(range(len(dct['y'])))
    np.random.shuffle(splt_index)
    n = int(np.ceil(V_SPLT * len(dct['y'])))
    tr_ind = splt_index[:n]
    vl_ind = splt_index[n:]

    dtrain_x = xgb.DMatrix(data=Xt[tr_ind, :],
                           label=dct['y'].values[tr_ind],
                           feature_names=dct['cols'],
                           feature_types=dct['dtypes'].values())
    dvalid_x = xgb.DMatrix(data=Xt[vl_ind, :],
                           label=dct['y'].values[vl_ind],
                           feature_names=dct['cols'],
                           feature_types=dct['dtypes'].values())
    dtest_x = xgb.DMatrix(data=Xte,
                          feature_names=dct['cols'],
                          feature_types=dct['dtypes'].values())
param_x = dict(eval_metric="rmse",
               verbose=1,
               objective="reg:logistic",
               booster='gbtree',
               eta=0.05,
               max_depth=18,
               min_child_weight=11,
               gamma=0,
               subsample=0.85,
               colsample_bytree=0.7,
               reg_alpha=2.0,
               reg_lambda=0,
               n_jobs=16)

# ***********************prepare data for training end xgb**************

# ***********************training model xgb starts************************
with timer(f'training xgb model'):
    xgb_clf = xgb.train(param_x,
                        evals=[
                            #                         (dtrain_x,'train' ),
                            (dvalid_x, 'valid')],
                        dtrain=dtrain_x,
                        num_boost_round=80000,
                        verbose_eval=20,
                        early_stopping_rounds=2000)

# ***********************training model lgbm starts****************

# ***********************plot model lgbm ****************************
f, ax = plt.subplots(figsize=[7, 16])
xgb.plot_importance(xgb_clf, max_num_features=100, ax=ax)
plt.title("Xgboost Feature Importance")
plt.savefig('xgimport.png')
# ***********************create submission with model lgbm ***********
test = pd.read_csv(f'{Path}sample_submission.csv')
test[target] = xgb_clf.predict(dtest_x).clip(0.0, 1.0)
test.to_csv(f"xgb_agg_text{time.strftime('%d-%m_%H:%M')}.csv", index=False)

# ***********************training model xgb ends*********************
# LGBM
# ***********************prepare data lgbm for training**************
with timer('Preparing data for lgbm training'):
    splt_index = list(range(len(dct['y'])))
    np.random.shuffle(splt_index)
    n = int(np.ceil(V_SPLT * len(dct['y'])))
    tr_ind = splt_index[:n]
    vl_ind = splt_index[n:]
    lgtrain = lgb.Dataset(data=Xt[tr_ind, :],
                          label=dct['y'].values[tr_ind],
                          feature_name=dct['cols'],
                          categorical_feature=dct['cat_cols'])
    lgvalid = lgb.Dataset(data=Xt[vl_ind, :],
                          label=dct['y'].values[vl_ind],
                          feature_name=dct['cols'],
                          categorical_feature=dct['cat_cols'])
    lgtest = lgb.Dataset(data=Xte,
                         feature_name=dct['cols'],
                         categorical_feature=dct['cat_cols'])

    lgbm_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 32,
        'max_depth': 18,
        'learning_rate': 0.02,
        'feature_fraction': 0.6,
        'verbose': 0,
        'num_threads': 8
    }
# ***********************prepare data for lgbm training end *********
# ***********************training model lgbm starts*************
with timer(f'training lgbm model'):
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=16000,
        valid_sets=[lgvalid],
        valid_names=['train', 'valid'],
        early_stopping_rounds=500,
        verbose_eval=100
    )
# ***********************training model lgbm starts****************

# ***********************plot model lgbm ****************************
f, ax = plt.subplots(figsize=[7, 16])
lgb.plot_importance(lgb_clf, max_num_features=100, ax=ax)
plt.title("Light GBM Feature Importance")
plt.savefig('lgbimport.png')
# **************create submission with model lgbm ***************
test = pd.read_csv(f'{Path}sample_submission.csv')
test[target] = lgb_clf.predict(lgtest).clip(0.0, 1.0)
test.to_csv(f"lgb_agg_text{time.strftime('%d-%m_%H:%M')}.csv", index=False)


