In [188]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 

import sklearn
import scipy.sparse

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import seaborn as sns
from tqdm import tqdm_notebook

In [12]:
DATA_FOLDER = './readonly/'
RESULTS_FOLDER = './results/'

data = pd.read_excel(os.path.join(DATA_FOLDER, 'MFTRAVEL_HACKATHON.xlsx'), sheet_name=None)

In [13]:
countries = data['CountryDict']['CNTRY_CODE'].values.astype('int32')
data.keys()

odict_keys(['Data', 'CountryDict', 'FieldsDiscr'])

## ENV EDA

In [156]:
MaxFlightPrice = 100000
MaxHotelPrice = 100000
MaxHotelPrice = 100000
MaxRandDays = 100
MaxHotels = 500
MaxFlights = 100000

ZeroDate = pd.to_datetime('8/2/2020')

class Environment:
    def gen_dataset(self):
        k = dict()
        hotels = ['Baba', 'Bubu', 'Kata']
        hotel_ind = 0
        start_date = ZeroDate

        print('Generating country_to_hotels dataset...')
        
        for i in tqdm_notebook(countries):
            country_to_hotels = {'name' : [], 'day_price': [], 'available_from' : [], 'available_to' : []}

            for _ in range(MaxHotels):
                country_to_hotels['name'].append('{}_{}'.format(hotels[hotel_ind % 3], hotel_ind))
                country_to_hotels['day_price'].append(np.random.randint(MaxHotelPrice))

                delta1 = np.random.randint(MaxRandDays)
                delta2 = np.random.randint(MaxRandDays//2)
                country_to_hotels['available_from'].append(start_date + pd.DateOffset(days = delta1))
                country_to_hotels['available_to'].append(start_date + pd.DateOffset(days = delta1 + delta2))

                hotel_ind+=1

            k[i] = pd.DataFrame(data=country_to_hotels)
            
        self.country_to_hotels_ = k
        
    def __init__(self, country_to_hotels = None):
        print('Generating time_to_planes dataset...')
        
        d = {'time' : [], 'price': [], 'from': [], 'to' : []}
        start_date = ZeroDate
        
        for i in range(MaxFlights):
            if i % (MaxFlights // (MaxRandDays * 2)) == 0: 
                start_date += pd.DateOffset(days = 1)
                
            d['time'].append(start_date)
            d['price'].append(np.random.randint(MaxFlightPrice))
            d['from'].append(countries[np.random.randint(countries.size)])
            d['to'].append(countries[np.random.randint(countries.size)])

        self.planes_timetable_ = pd.DataFrame(data=d).set_index('time')
    
        if (country_to_hotels == None):
            self.gen_dataset()
        else:
            self.country_to_hotels_ = country_to_hotels
        
        
        print('Done')
        
    def get_tickets(self, time1, time2):
        return self.planes_timetable_[time1 : time2]
    
    def get_hotels(self, country_id, time1, time2): #datetime!!!
        return self.country_to_hotels_[country_id][
            (time1 > self.country_to_hotels_[country_id]['available_from']) &
            (time2 < self.country_to_hotels_[country_id]['available_to'])] 

In [149]:
save = env.country_to_hotels_

In [157]:
env = Environment(save)

Generating time_to_planes dataset...
Done


In [161]:
time1 = pd.to_datetime('28/8/2020')
time2 = pd.to_datetime('29/8/2020')

env.get_hotels(4, time1, time2)

Unnamed: 0,name,day_price,available_from,available_to
3,Kata_503,57931,2020-08-08,2020-09-26
6,Kata_506,84984,2020-08-06,2020-08-31
10,Baba_510,88833,2020-08-27,2020-09-19
23,Bubu_523,76752,2020-08-25,2020-09-06
43,Baba_543,27161,2020-08-25,2020-09-12
...,...,...,...,...
474,Kata_974,26532,2020-08-05,2020-09-14
475,Baba_975,41564,2020-08-24,2020-09-09
478,Baba_978,9088,2020-08-15,2020-09-07
486,Kata_986,13208,2020-08-15,2020-09-27


In [162]:
env.get_tickets(time1, time2)

Unnamed: 0_level_0,price,from,to
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-28,1519,554,208
2020-08-28,47774,104,104
2020-08-28,18892,520,404
2020-08-28,77592,232,64
2020-08-28,56376,344,-2147483648
...,...,...,...
2020-08-29,24364,-2147483648,876
2020-08-29,7228,694,860
2020-08-29,73598,4,694
2020-08-29,43070,238,124


## NN EDA

In [341]:
import tensorflow as tf
from sklearn import preprocessing

In [23]:
exp = data['Data'].copy()

def foo(x):
    if type(x) == int:
        return [x]
    return list(map(int, x.split('; ')))

exp['COUNTRIES_IN_TRIP'] = exp['COUNTRIES_IN_TRIP'].apply(foo)
del data

In [24]:
exp['USING_INTERNET'] = exp['USING_INTERNET'].fillna(0).astype('bool')
le = preprocessing.LabelEncoder()

for column in ['GENDER', 'REGION', 'DEVICE_TYPE',
               'OS', 'SUBSAGE_MF_SEGMENT']:
    le.fit(list(exp[column].values))
    exp[column] = le.transform(list(exp[column].values))
    print('OKAY', column)

OKAY GENDER
OKAY REGION
OKAY DEVICE_TYPE
OKAY OS
OKAY SUBSAGE_MF_SEGMENT


In [26]:
# IT"S HEAVY
visited_country = dict()
for i in countries:
    list_t = []
    for j in exp['COUNTRIES_IN_TRIP']:
        list_t.append(i in j)
    
    visited_country[i] = list_t
    
exp = exp.join(pd.DataFrame(visited_country))
exp.drop(['COUNTRIES_IN_TRIP'], inplace = True, axis=1)

In [63]:
exp['User_ID'] = exp['User_ID'].astype('int32')
exp['AGE'] = exp['AGE'].fillna(-1).astype('int8')
exp['GENDER'] = exp['GENDER'].astype('int8')
exp['REGION'] = exp['REGION'].astype('int8')

In [147]:
tempo = exp.groupby(['User_ID']).agg({'SMS_IN_CNT_M3' : ['mean', 'std'], 
                                      'TRIP_DURATION' : ['mean', 'std'],
                                      'MOU_IN_REVENUE_M3' : ['mean', 'std'],
                                      'MOU_OUT_REVENUE_M3' : ['mean', 'std'], 
                                      'DOU_DURATION_M3' : ['mean', 'std'], 
                                      'ARPU_M3' : ['mean', 'std']
                                     })
foo = lambda x : "_".join(x) if x[1]!="" else x[0]
tempo.columns = [foo(x) for x in tempo.columns.ravel()]

user_to_flights = exp.groupby(['User_ID'])[exp.columns[18:]].sum().values
#user_to_mostfreq_country = countries[user_to_flights.argsort(axis=1)[-3:][::-1]]
user_to_mostfreq_country = countries[user_to_flights.argmax(axis=1)]

tempo['most_freq'] = user_to_mostfreq_country
user_data = tempo.join(exp.groupby(['User_ID']).agg({'GENDER' : 'max', 'AGE' : 'max', 'REGION' : 'max',
                              'DEVICE_TYPE' : 'max', 'OS' : 'max', 'SUBSAGE_MF_SEGMENT' : 'max', 
                              'USING_INTERNET' : 'max'}))
del tempo
user_data.fillna(0, inplace=True)

In [184]:
exp['start_month'] = exp['START_TRIP'].apply(lambda x : x.month)
exp['start_day'] = exp['START_TRIP'].apply(lambda x : x.day)
exp['start_year'] = exp['START_TRIP'].apply(lambda x : x.year)

exp['end_month'] = exp['END_TRIP'].apply(lambda x : x.month)
exp['end_day'] = exp['END_TRIP'].apply(lambda x : x.day)
exp['end_year'] = exp['END_TRIP'].apply(lambda x : x.year)

exp.drop(['END_TRIP', 'START_TRIP'], axis=1, inplace = True)

Unnamed: 0,User_ID,GENDER,AGE,REGION,DEVICE_TYPE,OS,SUBSAGE_MF_SEGMENT,USING_INTERNET,TRIP_DURATION,TRIP_MAIN_COUNTRY,...,732,887,894,716,start_month,start_day,start_year,end_month,end_day,end_year
0,80000,2,48,55,5,0,1,True,5.0,222,...,False,False,False,False,10,3,2019,10,7,2019
1,80000,2,48,55,5,0,1,True,6.0,85,...,False,False,False,False,10,10,2019,10,15,2019
2,80001,0,-1,55,5,0,1,True,2.0,213,...,False,False,False,False,12,1,2019,12,2,2019
3,80001,0,-1,55,5,0,1,True,3.0,23,...,False,False,False,False,12,18,2019,12,20,2019
4,80001,0,-1,55,5,0,1,True,5.0,202,...,False,False,False,False,12,27,2019,12,31,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378462,123215,0,-1,0,9,21,0,False,5.0,19,...,False,False,False,False,8,11,2019,8,15,2019
378463,177003,0,-1,0,9,21,0,False,1.0,175,...,False,False,False,False,3,8,2019,3,8,2019
378464,180451,0,-1,0,9,21,0,False,13.0,1,...,False,False,False,False,7,13,2019,7,25,2019
378465,156886,0,-1,33,5,20,2,True,12.0,12,...,False,False,False,False,2,8,2019,2,19,2019


In [348]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Reshape, Conv2D, Conv2DTranspose, UpSampling2D
from tensorflow.keras.layers import LeakyReLU, Dropout, ReLU, BatchNormalization, InputLayer, Concatenate
from tensorflow.keras.optimizers import Adam, RMSprop

In [581]:
dropout = 0.2

inp = tf.keras.layers.Input((22,))

k = Dense(32)(inp)
k = LeakyReLU(alpha=0.2)(k)
k = Dropout(dropout)(k)

k = Dense(64)(k)
k = LeakyReLU(alpha=0.2)(k)
k = Dropout(dropout)(k)

k = Dense(64)(k)
k = LeakyReLU(alpha=0.2)(k)
k = Dropout(dropout)(k)

k = Dense(32)(k)
k = LeakyReLU(alpha=0.2)(k)
k = Dropout(dropout)(k)

k = Dense(16)(k)
k = LeakyReLU(alpha=0.2)(k)
k = Dropout(dropout)(k)

k = Dense(1)(k)

D = tf.keras.Model(
            inputs=inp, 
            outputs=Activation('sigmoid')(k))

optimizer = RMSprop(lr=0.0002, decay=6e-8)
D.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [582]:
dropout = 0.2

inputs_ = tf.keras.layers.Input((21,))

batch_norm = BatchNormalization()(inputs_)

dense_1 = Dense(32)(batch_norm)
nonlin_1 = LeakyReLU(alpha=0.2)(dense_1)
drop_1 = Dropout(dropout)(nonlin_1)

dense_2 = Dense(64)(drop_1)
nonlin_2 = LeakyReLU(alpha=0.2)(dense_2)
drop_2 = Dropout(dropout)(nonlin_2)

dense_3 = Dense(32)(drop_2)
nonlin_3 = LeakyReLU(alpha=0.2)(dense_3)
drop_3 = Dropout(dropout)(nonlin_3)

predictions = ReLU()(Dense(1)(drop_3))

conc = tf.concat(axis=1,values=[inputs_, predictions])

optimizer = Adam(1e-4)
G = tf.keras.Model(
            inputs=inputs_, 
            outputs=conc)

def loss(y, y_pred):
    return tf.reduce_mean((y_pred[:, -1:]-y[:, -1:])**2)

G.compile(loss=loss, optimizer=optimizer)
#conc = Concatenate(axis=1)([inputs_, predictions.output])

res_D = D(conc)

AM =  tf.keras.Model(
            inputs=inputs_, 
            outputs=res_D)
optimizer = RMSprop(lr=0.0001, decay=3e-8)
AM.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [583]:
time_list = []
for i in ['start', 'end']:
    for j in ['day', 'month', 'year']:
        time_list.append('{}_{}'.format(i, j))
        
ids_list = user_data.index.unique()

epochs = 50
batch_size_noise = 64
batch_size_true = 128

#true_data = exp[['User_ID', 'TRIP_MAIN_COUNTRY'] + time_list].join(user_data, on = 'User_ID')
true_data = exp[['User_ID', 'TRIP_MAIN_COUNTRY']].join(user_data, on = 'User_ID')

cols = true_data.columns.tolist()
true_data = true_data[cols[0:1] + cols[2:] + cols[1:2] ]

In [585]:
#d_orig = dict()
#for i in time_list: 
#    d_orig[i] = true_data[i].max()
#    true_data[i] /= true_data[i].max()

In [589]:
for j in range(1000):
        r_f = np.random.choice(true_data.shape[0], 
                                            batch_size_true, 
                                            replace=False)
        true_flights = true_data.loc[r_f].values
        us_id = true_data.loc[r_f]['User_ID'].values
        
        fake_batch_df = pd.DataFrame(us_id, columns=['User_ID']).join(user_data, on = 'User_ID')
        fake_batch = fake_batch_df.values
        
        G.fit(fake_batch.astype('float32'), true_flights.astype('float32'))

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 

Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples
Train on 128 samples

Exception ignored in: 

KeyboardInterrupt: 

In [593]:
pd.DataFrame(true_flights[:, -1:])

Unnamed: 0,0
0,213
1,1
2,96
3,213
4,565
...,...
123,201
124,1
125,210
126,242


In [594]:
pd.DataFrame(G.predict(fake_batch.astype('float32'))[:, -1:])

Unnamed: 0,0
0,109.358582
1,103.775558
2,110.010330
3,111.509903
4,94.827812
...,...
123,92.913284
124,145.587753
125,126.711304
126,150.940582


In [595]:
for j in range(50):
    random_ids = np.random.choice(ids_list, batch_size_noise, replace=False)
    fake_batch_df = pd.DataFrame(random_ids, columns=['User_ID']).join(user_data, on = 'User_ID')
    fake_batch = fake_batch_df.values

    fake_flights = G.predict(fake_batch.astype('float32'))

    x = np.concatenate((true_flights, fake_flights))
    y = np.ones([batch_size_true + batch_size_noise, 1])
    y[batch_size_true:, :] = 0
    d_loss = D.fit(x.astype('float32'), y.astype('float32'))

Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples


Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples


In [596]:
for i in tqdm_notebook(range(epochs)):
    true_flights = true_data.loc[np.random.choice(true_data.shape[0], 
                                            batch_size_true, 
                                            replace=False)].values
    
    for j in range(3):
        random_ids = np.random.choice(ids_list, batch_size_noise, replace=False)
        fake_batch_df = pd.DataFrame(random_ids, columns=['User_ID']).join(user_data, on = 'User_ID')
        fake_batch = fake_batch_df.values

        fake_flights = G.predict(fake_batch.astype('float32'))

        x = np.concatenate((true_flights, fake_flights))
        y = np.ones([batch_size_true + batch_size_noise, 1])
        y[batch_size_true:] = 0
        d_loss = D.fit(x.astype('float32'), y.astype('float32'))
    
    for j in range(5):
        random_ids = np.random.choice(ids_list, batch_size_noise, replace=False)
        fake_batch_df = pd.DataFrame(random_ids, columns=['User_ID']).join(user_data, on = 'User_ID')
        fake_batch = fake_batch_df.values

        y = np.ones([batch_size_noise, 1])
        a_loss = AM.fit(fake_batch.astype('float32'), y.astype('float32'))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples


Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Tr

Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Tr

Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Tr

Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Tr

Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 192 samples
Train on 192 samples
Train on 192 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples
Train on 64 samples


KeyboardInterrupt: 

In [597]:
pd.DataFrame(fake_flights)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,138023.0,105.000000,0.000000,1.000000,0.000000,43.899998,0.000000,210.949997,0.000000,7568.087891,...,0.000000,895.0,2.0,57.0,71.0,5.0,0.0,1.0,1.0,121.076118
1,107211.0,347.000000,0.000000,8.000000,0.000000,45.083332,0.000000,66.650002,0.000000,44987.269531,...,0.000000,895.0,1.0,41.0,53.0,5.0,20.0,1.0,1.0,90.299461
2,121571.0,369.000000,165.462982,9.500000,9.192389,203.425003,92.996330,243.066666,81.788689,3831.249023,...,235.791824,76.0,2.0,38.0,33.0,5.0,0.0,1.0,1.0,107.514160
3,96513.0,602.500000,67.175148,4.000000,0.000000,579.691650,55.755367,1390.541626,60.728687,5354.669434,...,562.793335,895.0,2.0,51.0,37.0,5.0,0.0,1.0,1.0,199.094711
4,125621.0,284.333344,23.094011,6.000000,2.000000,394.622223,151.140686,300.166656,191.709152,17103.035156,...,315.614288,895.0,1.0,35.0,33.0,5.0,20.0,1.0,1.0,96.595757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,85911.0,219.666672,62.500668,7.333333,6.658328,332.927765,126.148186,418.277771,74.786232,2604.202881,...,151.872513,895.0,2.0,49.0,11.0,5.0,0.0,1.0,1.0,164.987610
60,112658.0,392.000000,129.903809,4.666667,3.785939,699.127808,45.283504,493.011108,47.236874,516.029602,...,1200.334351,895.0,1.0,59.0,54.0,5.0,0.0,1.0,1.0,140.012207
61,171731.0,249.000000,0.000000,8.000000,0.000000,205.666672,0.000000,217.466660,0.000000,3890.547852,...,0.000000,895.0,0.0,59.0,52.0,5.0,0.0,1.0,1.0,102.315720
62,115473.0,226.000000,0.000000,4.000000,0.000000,369.700012,0.000000,304.750000,0.000000,2095.425781,...,0.000000,895.0,2.0,40.0,54.0,5.0,0.0,1.0,1.0,157.290680


In [598]:
true_data

Unnamed: 0,User_ID,SMS_IN_CNT_M3_mean,SMS_IN_CNT_M3_std,TRIP_DURATION_mean,TRIP_DURATION_std,MOU_IN_REVENUE_M3_mean,MOU_IN_REVENUE_M3_std,MOU_OUT_REVENUE_M3_mean,MOU_OUT_REVENUE_M3_std,DOU_DURATION_M3_mean,...,ARPU_M3_std,most_freq,GENDER,AGE,REGION,DEVICE_TYPE,OS,SUBSAGE_MF_SEGMENT,USING_INTERNET,TRIP_MAIN_COUNTRY
0,80000,299.000000,1.154701,4.500000,1.290994,532.116667,78.731334,466.699996,67.222821,5381.978539,...,995.831065,222,2,48,55,5,0,1,True,222
1,80000,299.000000,1.154701,4.500000,1.290994,532.116667,78.731334,466.699996,67.222821,5381.978539,...,995.831065,222,2,48,55,5,0,1,True,85
2,80001,395.050000,143.792788,4.650000,4.749238,250.322501,107.936626,301.601666,79.642260,1857.871891,...,2428.142838,222,2,60,55,5,20,1,True,213
3,80001,395.050000,143.792788,4.650000,4.749238,250.322501,107.936626,301.601666,79.642260,1857.871891,...,2428.142838,222,2,60,55,5,20,1,True,23
4,80001,395.050000,143.792788,4.650000,4.749238,250.322501,107.936626,301.601666,79.642260,1857.871891,...,2428.142838,222,2,60,55,5,20,1,True,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378462,123215,141.714286,179.504609,3.285714,2.984085,50.095238,63.497327,53.183333,67.300009,3864.477401,...,2443.794135,895,2,36,33,9,21,1,True,19
378463,177003,58.000000,50.229473,5.333333,6.658328,241.600000,209.231738,163.577776,141.662510,6933.653649,...,7040.220730,895,0,33,70,9,21,1,True,175
378464,180451,0.000000,0.000000,13.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,895,0,-1,0,9,21,0,False,1
378465,156886,35.333333,18.230012,12.666667,2.081666,192.750000,163.797605,419.761113,214.269399,1033.710614,...,624.788358,12,0,-1,33,5,20,4,True,12


In [603]:
toto = G.predict(pd.DataFrame(ids_list, columns=['User_ID']).join(user_data, on = 'User_ID').values.astype('float32'))[:, -1:]

In [609]:
toto[toto.astype('int32').reshape(-1) == 40]

array([[40.20306 ],
       [40.140156],
       [40.668377],
       [40.842495],
       [40.715103],
       [40.131947],
       [40.285076],
       [40.405567],
       [40.714012],
       [40.302696],
       [40.590664],
       [40.976707],
       [40.60391 ],
       [40.603962],
       [40.012596],
       [40.378994]], dtype=float32)