In [None]:
%%capture
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from fastprogress import progress_bar
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from scipy.stats import skew
from scipy.signal import find_peaks
import matplotlib.patches as mpatches
from collections import Counter
import os
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import time
from numpy.random import choice
import numba
import random 
import ctypes
import multiprocessing as mp
!pip install wandb
import wandb
import math
import torch.nn.functional as F
import sklearn

In [None]:


class encoder():
  """Integer-encodes the string columns of a pandas DataFrame.

  A column is treated as a string column when its first value (by position)
  is a ``str``. Every distinct value of such a column gets an integer code,
  stored in ``encode_dict`` under the key ``str(value)+column_name``.
  """

  def __init__(self):
    self.encode_dict={}  # maps str(value)+column_name -> integer code
    self.last_value=None
    self.last_table= pd.DataFrame() #cache for fast dataloading (if we decide to use it as a model attribute)
    self.use_cache_flag=False
    self.codes_per_column=[]  # number of codes per column (0 for non-string columns)

  def use_cache(self,flag=True):
    """Enable (default) or disable returning the cached table from ``encode_table``."""
    # The flag argument used to be ignored, so the cache could never be switched off.
    self.use_cache_flag=bool(flag)

  def get_encode_dict(self):
    """Return the encoding dictionary."""
    return self.encode_dict

  def set_encode_dict(self,encode_dict):
    """Replace the encoding dictionary."""
    self.encode_dict=encode_dict

  @staticmethod
  def _is_string_column(table,column):
    """True when the first value of ``table[column]`` (by position) is a ``str``."""
    # .iloc keeps the check positional, so tables whose index does not contain
    # the label 0 work too; an empty table has no string columns.
    return len(table)>0 and isinstance(table[column].iloc[0],str)

  def encode_column(self,table,column_name):
    """Return the list of integer codes for ``table[column_name]`` in row order.

    The dictionary must have been created beforehand; a value without a code
    raises ``KeyError``.
    """
    # Iterate over the values themselves rather than table[column_name][i]:
    # label-based lookups return rows in the wrong order (or fail) whenever
    # the index is not 0..n-1.
    return [self.encode_dict[str(value)+column_name] for value in table[column_name]]

  def create_dict(self,table):
    """Assign codes to every distinct value of each string column of ``table``.

    Existing ``encode_dict`` entries are kept (new ones are added or
    overwritten); ``codes_per_column`` is rebuilt from scratch.
    """
    self.codes_per_column=[]
    for column in table.columns:
      values=[]
      if self._is_string_column(table,column):
        values=table[column].unique()
        for idx,name in enumerate(values):
          self.encode_dict[str(name)+column]=idx
      self.codes_per_column.append(len(values))

  def encode_table(self,table,update_cache=False):
    """Encode the string columns of ``table``.

    Returns a tuple ``(codes, encoded)`` where ``codes`` is a tensor built
    from ``codes_per_column`` and ``encoded`` is a copy of ``table`` with its
    string columns replaced by integer codes. The dictionary is created from
    ``table`` when it is still empty.

    NOTE(review): when the cache is enabled the bare cached DataFrame is
    returned instead of the tuple. This is kept as-is for backward
    compatibility; callers that unpack the result must not enable the cache.
    """
    if self.use_cache_flag: #using cached value
      return self.last_table

    if not self.encode_dict: #create encoding dictionary if there isn't one
      self.create_dict(table)

    out=table.copy()
    for column in table.columns:
      if self._is_string_column(table,column): #encode string columns
        out[column]=self.encode_column(table,column)

    #updating cache
    if self.last_table.empty or update_cache:
      self.last_table=out

    return torch.tensor(self.codes_per_column),out





def age_enc(table):
  """Turn age-group labels into integers.

  For each label, the last two characters are used when they are digits;
  otherwise the first two characters are used. Returns a list of ints in
  the same order as ``table``.
  """
  def _to_age(label):
    suffix=label[-2:]
    return int(suffix) if suffix.isdigit() else int(label[:2])

  return [_to_age(label) for label in table]



# frequency encoding
def encode_FE(df1, df2, cols):
    """Add frequency-encoded columns to both frames in place.

    For every column in ``cols`` the relative frequency of each value is
    computed over the rows of ``df1`` and ``df2`` together (NaN excluded) and
    written to a new float32 column named ``<col>_FE`` in both frames. The
    value -1 is always mapped to -1. Prints each new column name.
    """
    for column in cols:
        pooled = pd.concat([df1[column], df2[column]])
        frequency_of = pooled.value_counts(dropna=True, normalize=True).to_dict()
        frequency_of[-1] = -1  # the -1 sentinel maps to itself
        encoded_name = column + '_FE'

        for frame in (df1, df2):
            frame[encoded_name] = frame[column].map(frequency_of).astype('float32')
        print(encoded_name, ', ', end='')


def encode_AG(main_columns, uids,train_df,test_df,aggregations=('mean',),fillna=True, usena=False):
    """Add group-aggregate features to ``train_df`` and ``test_df`` in place.

    For every ``main_column`` / ``uid`` / aggregation triple, the statistic of
    ``main_column`` is computed per ``uid`` group over train and test rows
    together and written to a float32 column named
    ``<main_column>_<uid>_<aggregation>`` in both frames.

    Parameters
    ----------
    main_columns : iterable of str
        Columns whose values are aggregated.
    uids : iterable of str
        Columns used as grouping keys.
    train_df, test_df : pandas.DataFrame
        Frames that receive the new columns.
    aggregations : iterable of str
        Aggregation names understood by pandas ``groupby().agg`` (e.g. 'mean').
    fillna : bool
        When true, missing results are replaced with -1.
    usena : bool
        When true, -1 in ``main_column`` is treated as missing before aggregating.
    """
    for main_column in main_columns:
        for col in uids:
            # The pooled frame does not depend on the aggregation, so build it once.
            pooled = pd.concat([train_df[[col, main_column]], test_df[[col,main_column]]])
            if usena:
                # mask() upcasts integer columns cleanly instead of writing NaN
                # into them through .loc.
                pooled[main_column] = pooled[main_column].mask(pooled[main_column]==-1)
            grouped = pooled.groupby(col)[main_column]

            for agg_type in aggregations:
                new_col_name = main_column+'_'+col+'_'+agg_type
                group_to_stat = grouped.agg(agg_type).to_dict()

                for frame in (train_df, test_df):
                    encoded = frame[col].map(group_to_stat).astype('float32')
                    if fillna:
                        # Assign the filled result back: a chained
                        # frame[name].fillna(..., inplace=True) does not update
                        # the frame under pandas Copy-on-Write.
                        encoded = encoded.fillna(-1)
                    frame[new_col_name] = encoded

                print("'"+new_col_name+"'",', ',end='')

def combine(col1,col2, train_df, test_df, fillna=True):
  """Add a string column joining ``col1`` and ``col2`` to both frames.

  Missing values in either source column are replaced with -1, both columns
  are converted to strings and joined with '_'. The new column is named
  ``<col1>_<col2>`` and that name is returned. ``fillna`` is accepted but not
  used.
  """
  new_name=col1+'_'+col2
  for frame in (train_df,test_df):
    left=frame[col1].fillna(-1).astype('str')
    right=frame[col2].fillna(-1).astype('str')
    frame[new_name]=left+'_'+right

  return new_name

def update_weights(accuracy,allocated):
  """Convert accuracies into float32 weights that sum to ``allocated``.

  Accuracies are inverted (with a small offset to avoid division by zero),
  normalised, passed through ``exp(10 * x)`` and rescaled so that the result
  sums to ``allocated``. Lower accuracy yields a larger weight.
  """
  inverse=1/(accuracy+0.00001)
  sharpened=np.exp(10*inverse/inverse.sum())
  weights=allocated*sharpened/sharpened.sum()

  return torch.tensor(weights).float()

In [None]:
input_path='../input/ai4d-dataset'
#@title Import tables and set-up the encoder

# Read the raw CSV files and replace every missing value with the sentinel -1.
train_orig=pd.read_csv(os.path.join(input_path,"Train.csv")).fillna(-1)
test_orig=pd.read_csv(os.path.join(input_path,"Test.csv")).fillna(-1)

# train_table / test_table are the SAME objects as *_orig (no copy is made),
# so in-place changes made through one name are visible through the other.
train_table=train_orig
test_table=test_orig

encoder_=encoder() #encoder


Feature engineering and feature selection

In [None]:
# 'country', 'age_group', 'travel_with', 'total_female',
#        'total_male', 'purpose', 'main_activity', 'info_source',
#        'tour_arrangement','first_trip_tz'

#feature engineering
# Detach the target so that only feature columns are manipulated below.
tar=train_table.pop('cost_category')


def _add_base_features(table):
  """Add the derived numeric columns to ``table`` in place.

  Train and test must receive exactly the same columns in the same order:
  the tables are later converted to arrays, so features are matched by
  position, not by name.
  """
  ages=age_enc(table.age_group)
  table["has_more_money"]=[1 if age>18 else 0 for age in ages]
  table["length"]=table['night_mainland']+table['night_zanzibar']
  table["total_num"]=table['total_female']+table['total_male']
  # A zero head count gives inf (x/0) or NaN (0/0); turn inf into NaN so it is
  # handled like any other missing value downstream.
  table["ratio_male"]=(table['total_male']/table['total_num']).replace([np.inf,-np.inf],np.nan)


# The same helper is applied to both tables. Previously train received
# "ratio_male" while test received "fem2male" (female minus male count) in the
# same column position, so the model was trained and applied on different features.
for _table in (train_table,test_table):
  _add_base_features(_table)
fe_combo_names,ag_combo_names=[],[]

# Pairwise combination features, listed in creation order. The order matters:
# each call appends a column to both tables, which fixes the column layout.
# Each entry is (list that records the new column name, first column, second column).
_combo_plan=[
  (ag_combo_names,'tour_arrangement','length'),
  (ag_combo_names,'tour_arrangement','total_num'),
  (ag_combo_names,'purpose','main_activity'),
  (fe_combo_names,'package_accomodation','total_num'),
  (ag_combo_names,'package_accomodation','length'),
]
for _names,_first,_second in _combo_plan:
  _names.append(combine(_first,_second, train_df=train_table,test_df=test_table))

# Disabled experiments (pairs that were tried and are currently switched off):
#   travel_with+ratio, tour_arrangement+first_trip_tz, country+total_num,
#   info_source+tour_arrangement, package_guided_tour+main_activity,
#   package_sightseeing+main_activity, package_guided_tour+total_num,
#   package_guided_tour+length, total_num+age_group, length+total_num

# Package flags are combined in two stages: two pairs, then the pair of pairs.
name1=combine('package_accomodation','package_food', train_df=train_table,test_df=test_table)
name2=combine('package_transport_tz','package_transport_int', train_df=train_table,test_df=test_table)
name3=combine(name1,name2, train_df=train_table,test_df=test_table)

# Columns removed from both tables once the encodings derived from them exist.
features_to_drop=[name1,name2,'purpose','main_activity','info_source']

ag_combo_names.append(combine('age_group','travel_with', train_df=train_table,test_df=test_table))


# train_table.drop(['night_mainland','night_zanzibar','total_female','total_male','purpose','main_activity'], inplace=True, axis=1)
# test_table.drop(['night_mainland','night_zanzibar','total_female','total_male','purpose','main_activity'], inplace=True, axis=1)


# # combo_names.append(combine('main_activity','tour_arrangement',train_df=train_table,test_df=test_table))
# #concatenate the two table to get encoding dict

# Re-attach the target as the last column of the training table.
train_table['cost_category']=tar
# Stack the train features (every column except the last one, the target) on
# top of the test table so the encoder sees the values of both tables.
test_tmp=pd.concat([train_table.iloc[:,:-1],test_table])
# Give the stacked table a target column: the train targets followed by the
# first len(test_table) train targets, used as placeholders for the test rows.
# NOTE(review): this appears to rely on test_table not having more rows than train_table — confirm.
test_tmp[train_table.columns[-1]]=pd.concat([train_table.iloc[:,-1],train_table.iloc[:len(test_table),-1]])
test_tmp=test_tmp.reset_index(drop=True)

# encode_table returns (codes-per-column tensor, encoded copy). If encoder_ has
# no dictionary yet, this call builds it from the stacked table; only the
# tensor is kept here. iloc[:,1:] skips the first column
# (presumably an identifier column — TODO confirm).
num_codes,_=encoder_.encode_table(test_tmp.iloc[:,1:]) # encode table

# Encode each table separately with the dictionary built above, keeping only
# the encoded DataFrame (element [1] of the returned tuple).
train_table=encoder_.encode_table(train_table.iloc[:,1:].fillna(-1))[1] # encode table
test_table=encoder_.encode_table(test_table.iloc[:,1:].fillna(-1))[1]





# First frequency-encoding pass; the new *_FE columns are appended to both tables.
_fe_first_pass=[name1,name2,'first_trip_tz']
encode_FE(train_table,test_table,_fe_first_pass)

# Detach the target so that both tables have the same columns for the
# remaining encodings and for the column drop below.
tar=train_table.pop('cost_category')

_fe_second_pass=['travel_with','purpose']
encode_FE(train_table,test_table,_fe_second_pass)

# Per-country mean of the selected columns, computed over train and test together.
# (Earlier aggregation experiments over other column sets are disabled.)
encode_AG(
  ['package_accomodation','package_food','purpose_main_activity'],
  ['country'],
  train_df=train_table,
  test_df=test_table,
  aggregations=['mean'],
)

# Remove the source columns listed in features_to_drop from both tables.
train_table=train_table.drop(columns=features_to_drop)
test_table=test_table.drop(columns=features_to_drop)

# Target goes back in as the last training column.
train_table['cost_category']=tar





##########################################################################

# Convert both tables to tensors; the last training column is the target.
train_data=torch.tensor(train_table.to_numpy())
test_data=torch.tensor(test_table.to_numpy())

# Random hold-out split: `ratio` of the training rows go to validation.
# NOTE(review): no seed is set before randperm, so the split changes between runs.
ratio=0.2
num_val=int(ratio*len(train_data))
num_train=len(train_data)-num_val
idx=torch.randperm(train_data.shape[0])
# Slice with explicit positive bounds: idx[:-k] / idx[-k:] give an empty
# training set and an all-rows validation set when k == 0.
train_idx,val_idx=idx[:num_train],idx[num_train:]

train_targets,val_targets=train_data[train_idx,-1],train_data[val_idx,-1]
val_data=train_data[val_idx,:-1]
train_data=train_data[train_idx,:-1]

column_names=test_table.columns
print(column_names)

In [None]:
from xgboost import XGBRegressor

# Drawn but not used below (the model uses a fixed random_state).
state=np.random.randint(0,10000)

# NOTE(review): a regressor class is configured with a multi-class objective
# ('multi:softprob', 6 classes) — confirm whether XGBClassifier was intended.
xgb_params=dict(
    n_estimators=3000,
    max_depth=12,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,  # the -1 fill value is treated as missing
    random_state=912,#5106
    eval_metric=["mlogloss"],
    objective='multi:softprob',
    num_class=6,
    tree_method='gpu_hist',
)
model = XGBRegressor(**xgb_params)

# NumPy views of the train / validation tensors.
X_train,y_train=train_data.numpy(),train_targets.numpy()
X_val,y_val=val_data.numpy(),val_targets.numpy()

# Train with early stopping monitored on the validation split.
model.fit(X_train,y_train,verbose=25,eval_set=[(X_val,y_val)],early_stopping_rounds=200)
predicted = model.predict(X_val)
print(predicted)