In [188]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 

import sklearn
import scipy.sparse

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import seaborn as sns
from tqdm import tqdm_notebook

In [12]:
DATA_FOLDER = './readonly/'
RESULTS_FOLDER = './results/'

data = pd.read_excel(os.path.join(DATA_FOLDER, 'MFTRAVEL_HACKATHON.xlsx'), sheet_name=None)

In [13]:
countries = data['CountryDict']['CNTRY_CODE'].values.astype('int32')
data.keys()

odict_keys(['Data', 'CountryDict', 'FieldsDiscr'])

## ENV EDA

In [156]:
MaxFlightPrice = 100000
MaxHotelPrice = 100000
MaxHotelPrice = 100000
MaxRandDays = 100
MaxHotels = 500
MaxFlights = 100000

ZeroDate = pd.to_datetime('8/2/2020')

class Environment:
    def gen_dataset(self):
        k = dict()
        hotels = ['Baba', 'Bubu', 'Kata']
        hotel_ind = 0
        start_date = ZeroDate

        print('Generating country_to_hotels dataset...')
        
        for i in tqdm_notebook(countries):
            country_to_hotels = {'name' : [], 'day_price': [], 'available_from' : [], 'available_to' : []}

            for _ in range(MaxHotels):
                country_to_hotels['name'].append('{}_{}'.format(hotels[hotel_ind % 3], hotel_ind))
                country_to_hotels['day_price'].append(np.random.randint(MaxHotelPrice))

                delta1 = np.random.randint(MaxRandDays)
                delta2 = np.random.randint(MaxRandDays//2)
                country_to_hotels['available_from'].append(start_date + pd.DateOffset(days = delta1))
                country_to_hotels['available_to'].append(start_date + pd.DateOffset(days = delta1 + delta2))

                hotel_ind+=1

            k[i] = pd.DataFrame(data=country_to_hotels)
            
        self.country_to_hotels_ = k
        
    def __init__(self, country_to_hotels = None):
        print('Generating time_to_planes dataset...')
        
        d = {'time' : [], 'price': [], 'from': [], 'to' : []}
        start_date = ZeroDate
        
        for i in range(MaxFlights):
            if i % (MaxFlights // (MaxRandDays * 2)) == 0: 
                start_date += pd.DateOffset(days = 1)
                
            d['time'].append(start_date)
            d['price'].append(np.random.randint(MaxFlightPrice))
            d['from'].append(countries[np.random.randint(countries.size)])
            d['to'].append(countries[np.random.randint(countries.size)])

        self.planes_timetable_ = pd.DataFrame(data=d).set_index('time')
    
        if (country_to_hotels == None):
            self.gen_dataset()
        else:
            self.country_to_hotels_ = country_to_hotels
        
        
        print('Done')
        
    def get_tickets(self, time1, time2):
        return self.planes_timetable_[time1 : time2]
    
    def get_hotels(self, country_id, time1, time2): #datetime!!!
        return self.country_to_hotels_[country_id][
            (time1 > self.country_to_hotels_[country_id]['available_from']) &
            (time2 < self.country_to_hotels_[country_id]['available_to'])] 

In [149]:
save = env.country_to_hotels_

In [157]:
env = Environment(save)

Generating time_to_planes dataset...
Done


In [161]:
time1 = pd.to_datetime('28/8/2020')
time2 = pd.to_datetime('29/8/2020')

env.get_hotels(4, time1, time2)

Unnamed: 0,name,day_price,available_from,available_to
3,Kata_503,57931,2020-08-08,2020-09-26
6,Kata_506,84984,2020-08-06,2020-08-31
10,Baba_510,88833,2020-08-27,2020-09-19
23,Bubu_523,76752,2020-08-25,2020-09-06
43,Baba_543,27161,2020-08-25,2020-09-12
...,...,...,...,...
474,Kata_974,26532,2020-08-05,2020-09-14
475,Baba_975,41564,2020-08-24,2020-09-09
478,Baba_978,9088,2020-08-15,2020-09-07
486,Kata_986,13208,2020-08-15,2020-09-27


In [162]:
env.get_tickets(time1, time2)

Unnamed: 0_level_0,price,from,to
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-28,1519,554,208
2020-08-28,47774,104,104
2020-08-28,18892,520,404
2020-08-28,77592,232,64
2020-08-28,56376,344,-2147483648
...,...,...,...
2020-08-29,24364,-2147483648,876
2020-08-29,7228,694,860
2020-08-29,73598,4,694
2020-08-29,43070,238,124


## NN EDA

In [4]:
import tensorflow as tf
from sklearn import preprocessing

In [23]:
exp = data['Data'].copy()

def foo(x):
    if type(x) == int:
        return [x]
    return list(map(int, x.split('; ')))

exp['COUNTRIES_IN_TRIP'] = exp['COUNTRIES_IN_TRIP'].apply(foo)
# del data

In [24]:
exp['USING_INTERNET'] = exp['USING_INTERNET'].fillna(0).astype('bool')
le = preprocessing.LabelEncoder()

for column in ['GENDER', 'REGION', 'DEVICE_TYPE',
               'OS', 'SUBSAGE_MF_SEGMENT']:
    le.fit(list(exp[column].values))
    exp[column] = le.transform(list(exp[column].values))
    print('OKAY', column)

OKAY GENDER
OKAY REGION
OKAY DEVICE_TYPE
OKAY OS
OKAY SUBSAGE_MF_SEGMENT


In [95]:
countries

array([        895,           4,           8,          12,          16,
                20,          24,         660,          10,          28,
                32,          51,         533,          36,          40,
                31,          44,          48,          50,          52,
               112,          56,          84,         204,          60,
                64,          68,         535,          70,          72,
                74,          76,          92,          96,         100,
               854,         108,         116,         120,         124,
               132,         136,         140,         148,         152,
               156,         170,         174,         178,         184,
               188,         384,         191,         192,         531,
               196,         203,         408,         626,         180,
               208,          86,         262,         212,         214,
               218,         818,         222,         226,      

In [26]:
# IT"S HEAVY
visited_country = dict()
for i in countries:
    list_t = []
    for j in exp['COUNTRIES_IN_TRIP']:
        list_t.append(i in j)
    
    visited_country[i] = list_t
    
exp = exp.join(pd.DataFrame(visited_country))
exp.drop(['COUNTRIES_IN_TRIP'], inplace = True, axis=1)

In [63]:
exp['User_ID'] = exp['User_ID'].astype('int32')
exp['AGE'] = exp['AGE'].fillna(-1).astype('int8')
exp['GENDER'] = exp['GENDER'].astype('int8')
exp['REGION'] = exp['REGION'].astype('int8')

In [147]:
tempo = exp.groupby(['User_ID']).agg({'SMS_IN_CNT_M3' : ['mean', 'std'], 
                                      'TRIP_DURATION' : ['mean', 'std'],
                                      'MOU_IN_REVENUE_M3' : ['mean', 'std'],
                                      'MOU_OUT_REVENUE_M3' : ['mean', 'std'], 
                                      'DOU_DURATION_M3' : ['mean', 'std'], 
                                      'ARPU_M3' : ['mean', 'std']
                                     })
foo = lambda x : "_".join(x) if x[1]!="" else x[0]
tempo.columns = [foo(x) for x in tempo.columns.ravel()]

user_to_flights = exp.groupby(['User_ID'])[exp.columns[18:]].sum().values
#user_to_mostfreq_country = countries[user_to_flights.argsort(axis=1)[-3:][::-1]]
user_to_mostfreq_country = countries[user_to_flights.argmax(axis=1)]

tempo['most_freq'] = user_to_mostfreq_country
user_data = tempo.join(exp.groupby(['User_ID']).agg({'GENDER' : 'max', 'AGE' : 'max', 'REGION' : 'max',
                              'DEVICE_TYPE' : 'max', 'OS' : 'max', 'SUBSAGE_MF_SEGMENT' : 'max', 
                              'USING_INTERNET' : 'max'}))
del tempo
user_data.fillna(0, inplace=True)

In [152]:
user_data

Unnamed: 0_level_0,SMS_IN_CNT_M3_mean,SMS_IN_CNT_M3_std,TRIP_DURATION_mean,TRIP_DURATION_std,MOU_IN_REVENUE_M3_mean,MOU_IN_REVENUE_M3_std,MOU_OUT_REVENUE_M3_mean,MOU_OUT_REVENUE_M3_std,DOU_DURATION_M3_mean,DOU_DURATION_M3_std,ARPU_M3_mean,ARPU_M3_std,most_freq,GENDER,AGE,REGION,DEVICE_TYPE,OS,SUBSAGE_MF_SEGMENT,USING_INTERNET
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
80000,299.000000,1.154701,4.500,1.290994,532.116667,78.731334,466.699996,67.222821,5381.978539,1291.962179,2401.9950,995.831065,222,2,48,55,5,0,1,True
80001,395.050000,143.792788,4.650,4.749238,250.322501,107.936626,301.601666,79.642260,1857.871891,1692.410125,4072.1595,2428.142838,222,2,60,55,5,20,1,True
80002,164.500000,28.991378,7.000,8.485281,84.649999,19.963982,78.641666,15.026017,8816.631841,3352.524450,2872.4600,2044.655826,895,2,70,55,5,0,1,True
80003,249.666667,63.006878,5.500,4.929503,822.913890,417.014447,628.141667,537.880583,3695.561045,843.805226,3396.3950,2587.648900,222,0,55,55,5,20,1,True
80004,438.625000,109.247998,3.625,4.274091,295.881251,82.766410,353.275003,118.160949,2520.610484,962.627535,5280.9375,2836.995127,84,2,59,55,5,20,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180448,176.000000,0.000000,4.000,0.000000,173.050002,0.000000,280.716670,0.000000,1273.079104,0.000000,1675.1600,0.000000,895,0,-1,18,5,20,2,True
180449,0.000000,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,895,0,-1,0,9,21,0,False
180450,0.000000,0.000000,11.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,112,0,-1,0,9,21,0,False
180451,0.000000,0.000000,13.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,895,0,-1,0,9,21,0,False


In [159]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Reshape
from keras.layers import Conv2D, Conv2DTranspose, UpSampling2D
from keras.layers import LeakyReLU, Dropout
from keras.layers import BatchNormalization
from keras.optimizers import Adam, RMSprop

Using TensorFlow backend.


In [166]:
img_rows = 28
img_cols = 28
channel = 1

G = Sequential()
dropout = 0.4
depth = 64+64+64+64
dim = 7

G.add(Dense(dim*dim*depth, input_dim=100))
G.add(BatchNormalization(momentum=0.9))
G.add(Activation('relu'))
G.add(Reshape((dim, dim, depth)))
G.add(Dropout(dropout))

G.add(UpSampling2D())
G.add(Conv2DTranspose(int(depth/2), 5, padding='same'))
G.add(BatchNormalization(momentum=0.9))
G.add(Activation('relu'))

G.add(UpSampling2D())
G.add(Conv2DTranspose(int(depth/4), 5, padding='same'))
G.add(BatchNormalization(momentum=0.9))
G.add(Activation('relu'))

G.add(Conv2DTranspose(int(depth/8), 5, padding='same'))
G.add(BatchNormalization(momentum=0.9))
G.add(Activation('relu'))

G.add(Conv2DTranspose(1, 5, padding='same'))
G.add(Activation('sigmoid'))

In [167]:
D = Sequential()
depth = 64
dropout = 0.4

input_shape = (img_rows, img_cols, channel)
D.add(Conv2D(depth*1, 5, strides=2, input_shape=input_shape,\
    padding='same'))
D.add(LeakyReLU(alpha=0.2))
D.add(Dropout(dropout))

D.add(Conv2D(depth*2, 5, strides=2, padding='same'))
D.add(LeakyReLU(alpha=0.2))
D.add(Dropout(dropout))

D.add(Conv2D(depth*4, 5, strides=2, padding='same'))
D.add(LeakyReLU(alpha=0.2))
D.add(Dropout(dropout))

D.add(Conv2D(depth*8, 5, strides=1, padding='same'))
D.add(LeakyReLU(alpha=0.2))
D.add(Dropout(dropout))

# Out: 1-dim probability
D.add(Flatten())
D.add(Dense(1))
D.add(Activation('sigmoid'))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 14, 14, 64)        1664      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 14, 14, 64)        0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 14, 14, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 7, 7, 128)         204928    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 7, 7, 128)         0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 7, 7, 128)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 4, 4, 256)        

In [170]:
optimizer = RMSprop(lr=0.0002, decay=6e-8)
DM = Sequential([D])
DM.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

optimizer = RMSprop(lr=0.0001, decay=3e-8)
AM = Sequential([G, D])
AM.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [157]:
x_train = input_data.read_data_sets("mnist",\
    one_hot=True).train.images
x_train = x_train.reshape(-1, self.img_rows,\
    self.img_cols, 1).astype(np.float32)

self.discriminator =  DM
self.adversarial = AM
self.generator = G
        
images_train = self.x_train[np.random.randint(0,
                self.x_train.shape[0], size=batch_size), :, :, :]
noise = np.random.uniform(-1.0, 1.0, size=[batch_size, 100])
images_fake = self.generator.predict(noise)
x = np.concatenate((images_train, images_fake))
y = np.ones([2*batch_size, 1])
y[batch_size:, :] = 0
d_loss = self.discriminator.train_on_batch(x, y)

y = np.ones([batch_size, 1])
noise = np.random.uniform(-1.0, 1.0, size=[batch_size, 100])
a_loss = self.adversarial.train_on_batch(noise, y)

NameError: name 'self' is not defined

In [184]:
exp['start_month'] = exp['START_TRIP'].apply(lambda x : x.month)
exp['start_day'] = exp['START_TRIP'].apply(lambda x : x.day)
exp['start_year'] = exp['START_TRIP'].apply(lambda x : x.year)

exp['end_month'] = exp['END_TRIP'].apply(lambda x : x.month)
exp['end_day'] = exp['END_TRIP'].apply(lambda x : x.day)
exp['end_year'] = exp['END_TRIP'].apply(lambda x : x.year)

exp.drop(['END_TRIP', 'START_TRIP'], axis=1, inplace = True)

Unnamed: 0,User_ID,GENDER,AGE,REGION,DEVICE_TYPE,OS,SUBSAGE_MF_SEGMENT,USING_INTERNET,TRIP_DURATION,TRIP_MAIN_COUNTRY,...,732,887,894,716,start_month,start_day,start_year,end_month,end_day,end_year
0,80000,2,48,55,5,0,1,True,5.0,222,...,False,False,False,False,10,3,2019,10,7,2019
1,80000,2,48,55,5,0,1,True,6.0,85,...,False,False,False,False,10,10,2019,10,15,2019
2,80001,0,-1,55,5,0,1,True,2.0,213,...,False,False,False,False,12,1,2019,12,2,2019
3,80001,0,-1,55,5,0,1,True,3.0,23,...,False,False,False,False,12,18,2019,12,20,2019
4,80001,0,-1,55,5,0,1,True,5.0,202,...,False,False,False,False,12,27,2019,12,31,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378462,123215,0,-1,0,9,21,0,False,5.0,19,...,False,False,False,False,8,11,2019,8,15,2019
378463,177003,0,-1,0,9,21,0,False,1.0,175,...,False,False,False,False,3,8,2019,3,8,2019
378464,180451,0,-1,0,9,21,0,False,13.0,1,...,False,False,False,False,7,13,2019,7,25,2019
378465,156886,0,-1,33,5,20,2,True,12.0,12,...,False,False,False,False,2,8,2019,2,19,2019


In [189]:
time_list = []
for i in ['start', 'end']:
    for j in ['day', 'month', 'year']:
        time_list.append('{}_{}'.format(i, j))

In [206]:
ids_list = user_data.index.unique()

In [230]:
time_list = []
batch_size_noise = 64
batch_size_true = 128
ids_list = user_data.index.unique()

for i in ['start', 'end']:
    for j in ['day', 'month', 'year']:
        time_list.append('{}_{}'.format(i, j))
    
random_ids = np.random.choice(ids_list, batch_size_noise, replace=False)
fake_batch = pd.DataFrame(random_ids, columns=['User_ID']).join(user_data, on = 'User_ID').values

In [246]:
true_data = exp[['User_ID', 'TRIP_MAIN_COUNTRY'] + time_list].join(user_data, on = 'User_ID')
true_batch = true_data.loc[np.random.choice(true_data.shape[0], 
                                            batch_size_true, 
                                            replace=False)].values

In [247]:
true_batch.shape

(128, 28)

In [248]:
fake_batch.shape

(64, 21)