# Function Definition of FUNCTION1() and FUNCTION2()

In [1]:
# all imports
import numpy as np
import pandas as pd 
import joblib, xgboost, warnings 
from datetime import datetime, date
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# https://joblib.readthedocs.io/en/latest/
# Load the shared inputs used by the functions and cells below:
#   sess_csv : session log table read from 'sessions.csv'; later cells filter it by 'user_id'.
#   in_dict  : object saved with joblib; the code below reads its "actions", "frequency",
#              "features", "StandardScaler" and "lab_encoder" entries.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load files from a trusted source.
sess_csv=pd.read_csv('sessions.csv')
in_dict=joblib.load("in_dict")

# FUNCTION-1 DEFINITION
def FUNCTION1(data, prob_label="label"):
    """
    Return model predictions for raw user data and optional session data.

    Steps
    -----
    1. Build one row of session features per user id, or a constant
       placeholder block when no session DataFrame is supplied.
    2. Engineer date, age and one-hot features from the user DataFrame.
    3. Align the columns with ``in_dict["features"]["fin_cols"]``, select
       ``in_dict["features"]["imp_features"]``, scale with
       ``in_dict["StandardScaler"]`` and predict with the model stored in
       the file ``"xgb_8000_best"``.

    Parameters
    ----------
    data : list
        ``data[0]`` is the user DataFrame (must contain ``id`` and the raw
        columns read below). ``data[1]`` is the session DataFrame, or any
        non-DataFrame value (e.g. ``None``) when no session data is used.
    prob_label : str
        ``"label"`` for decoded class labels or ``"probability"`` for class
        probabilities. The legacy spelling ``"probality"`` is still accepted.

    Returns
    -------
    ndarray
        ``lab_encoder.inverse_transform(model.predict(X))`` for ``"label"``,
        otherwise ``model.predict_proba(X)``.

    Raises
    ------
    ValueError
        If ``prob_label`` is not one of the accepted values.

    Notes
    -----
    When ``data[1]`` is a DataFrame an ``id`` column (copy of ``user_id``)
    is added to that frame in place; this side effect is kept because
    callers may rely on it. The user DataFrame is not modified.
    """

    # Fail fast on an unknown output type instead of doing all the feature
    # work and then hitting an unbound result variable.
    if prob_label not in ("label", "probability", "probality"):
        raise ValueError(
            "prob_label must be 'label' or 'probability', got {!r}".format(prob_label)
        )

    # load test samples
    test_dataframe = data[0]
    s_id = test_dataframe["id"]

    if isinstance(data[1], pd.DataFrame):
        # ---- session data available: build per-user session features ----
        sessions_dataframe = data[1]
        # NOTE: adds 'id' to the caller's frame in place (kept for backward compatibility).
        sessions_dataframe['id'] = sessions_dataframe['user_id']
        sessions_dataframe = sessions_dataframe.drop(['user_id'], axis=1)

        # Replace nulls in the categorical session columns with the literal "NAN"
        sessions_dataframe.action = sessions_dataframe.action.fillna('NAN')
        sessions_dataframe.action_type = sessions_dataframe.action_type.fillna('NAN')
        sessions_dataframe.action_detail = sessions_dataframe.action_detail.fillna('NAN')
        sessions_dataframe.device_type = sessions_dataframe.device_type.fillna('NAN')

        # Actions whose stored count is below the threshold are collapsed into 'OTHER'
        action_threshold = 100
        actions = in_dict["actions"]
        sessions_dataframe.action = sessions_dataframe.action.apply(
            lambda x: 'OTHER' if actions[x] < action_threshold else x)

        # category -> column position lookups saved alongside the model
        action_frequency = in_dict["frequency"][0]
        action_detail_frequency = in_dict["frequency"][1]
        action_type_frequency = in_dict["frequency"][2]
        device_type_frequency = in_dict["frequency"][3]

        # Group by the column name (not a one-element list) so each key is the
        # scalar id on every pandas version; a list grouper yields 1-tuples on pandas >= 2.
        session_group = sessions_dataframe.groupby('id')

        matrix = []
        for user_id, group_set in session_group:
            # row layout: [id, number of session rows, ...feature blocks...]
            features = [user_id, len(group_set)]
            secs = group_set.secs_elapsed.fillna(0).values

            # action: per-category counts, then (n unique, mean, std) of the counts
            action_count = [0] * len(action_frequency)
            for v in group_set.action.values:
                action_count[action_frequency[v]] += 1
            _, action_unique_count = np.unique(group_set.action.values, return_counts=True)
            action_count += [len(action_unique_count), np.mean(action_unique_count), np.std(action_unique_count)]
            features = features + action_count

            # action_detail: per-category counts, then (n unique, mean, std)
            action_detail_count = [0] * len(action_detail_frequency)
            for v in group_set.action_detail.values:
                action_detail_count[action_detail_frequency[v]] += 1
            _, action_detail_unique_count = np.unique(group_set.action_detail.values, return_counts=True)
            action_detail_count += [len(action_detail_unique_count), np.mean(action_detail_unique_count), np.std(action_detail_unique_count)]
            features = features + action_detail_count

            # action_type: per-category counts and log(1 + summed secs_elapsed), then (n unique, mean, std)
            action_type_secs = [0] * len(action_type_frequency)
            action_type_count = [0] * len(action_type_frequency)
            for i, v in enumerate(group_set.action_type.values):
                action_type_secs[action_type_frequency[v]] += secs[i]
                action_type_count[action_type_frequency[v]] += 1
            action_type_secs = np.log(1 + np.array(action_type_secs)).tolist()
            _, action_type_unique_count = np.unique(group_set.action_type.values, return_counts=True)
            action_type_count += [len(action_type_unique_count), np.mean(action_type_unique_count), np.std(action_type_unique_count)]
            features = features + action_type_count + action_type_secs

            # device_type: per-category counts, then n unique, then (n unique, mean, std).
            # The unique count is emitted twice; kept as-is, presumably so the column
            # layout matches what the saved scaler/model expect -- TODO confirm.
            device_type_count = [0] * len(device_type_frequency)
            for v in group_set.device_type.values:
                device_type_count[device_type_frequency[v]] += 1
            device_type_count.append(len(np.unique(group_set.device_type.values)))
            _, device_type_unique = np.unique(group_set.device_type.values, return_counts=True)
            device_type_count += [len(device_type_unique), np.mean(device_type_unique), np.std(device_type_unique)]
            features = features + device_type_count

            # secs_elapsed summaries: log(1+sum), log(1+mean), log(1+std), log(1+median),
            # and log(1+sum) divided by the number of session rows
            secs_features = [0] * 5
            log_bin = [0] * 15
            if len(secs) > 0:
                secs_features[0] = np.log(1 + np.sum(secs))
                secs_features[1] = np.log(1 + np.mean(secs))
                secs_features[2] = np.log(1 + np.std(secs))
                secs_features[3] = np.log(1 + np.median(secs))
                secs_features[4] = secs_features[0] / float(features[1])

                # histogram of int(log(1 + secs)) with at least 15 bins
                secs_log = np.log(1 + secs).astype(int)
                log_bin = np.bincount(secs_log, minlength=15).tolist()
            features = features + secs_features + log_bin
            matrix.append(features)
    else:
        # ---- no session data: id column followed by 457 placeholder features of -2 ----
        matrix = np.concatenate(
            (np.array(s_id).reshape(s_id.shape[0], 1),
             np.full((s_id.shape[0], 457), -2)),
            axis=1)

    # one generated name per session feature (everything except the leading id)
    feat_names = ['feat_' + str(i) for i in range(len(matrix[0]) - 1)]

    # final feature matrix
    matrix = np.array(matrix)
    matrix_array = matrix[:, 1:].astype(np.float16)
    matrix_id = matrix[:, 0]

    # creating dataframe from array matrix
    session_matrix_dataframe = pd.DataFrame(matrix_array, columns=feat_names)
    session_matrix_dataframe['id'] = matrix_id
    session_matrix_dataframe.index = session_matrix_dataframe.id

    # Pre-processing test data (work on a copy so the caller's index is left alone)
    dataframe_tt = test_dataframe.copy()
    dataframe_tt.index = dataframe_tt.id
    dataframe_tt = dataframe_tt.fillna(-1)
    dataframe_tt = dataframe_tt.replace('-unknown-', -1)

    # Drop the booking date and count the -1 (missing/unknown) markers per row
    dataframe_tt = dataframe_tt.drop(['date_first_booking'], axis=1)
    dataframe_tt['n_null'] = np.array([sum(r == -1) for r in dataframe_tt.values])

    # Feature extraction from 'date_account_created' (split on '-' into year, month, day)
    date_acc_crt = np.vstack(dataframe_tt.date_account_created.astype(str).apply(lambda x: x.split('-')).values)
    date_acc_crt = date_acc_crt.astype(int)
    dataframe_tt['dac_year'] = date_acc_crt[:, 0]
    dataframe_tt['dac_month'] = date_acc_crt[:, 1]
    dataframe_tt['dac_day'] = date_acc_crt[:, 2]

    acc_cret_dates = [datetime(x[0], x[1], x[2]) for x in date_acc_crt]
    dataframe_tt['dac_week_number'] = np.array([d.isocalendar()[1] for d in acc_cret_dates])
    dataframe_tt['dac_week_day'] = np.array([d.weekday() for d in acc_cret_dates])

    dataFrame_tt_wd = pd.get_dummies(dataframe_tt.dac_week_day, prefix='dac_week_day')
    dataframe_tt = dataframe_tt.drop(['date_account_created', 'dac_week_day'], axis=1)
    dataframe_tt = pd.concat((dataframe_tt, dataFrame_tt_wd), axis=1)

    def func(s):
        """Parse a 'YYYYMMDDHHMMSS' value (any type, stringified first) into a datetime."""
        s = str(s)
        return datetime(year=int(s[0:4]), month=int(s[4:6]), day=int(s[6:8]),
                        hour=int(s[8:10]), minute=int(s[10:12]), second=int(s[12:]))

    # Feature extraction from 'timestamp_first_active'
    dataframe_tt['timestamp_first_active'] = pd.to_datetime(dataframe_tt.timestamp_first_active.apply(func))
    first_acc_dates = list(dataframe_tt['timestamp_first_active'])

    dataframe_tt['tfa_day'] = dataframe_tt.timestamp_first_active.dt.day
    dataframe_tt['tfa_month'] = dataframe_tt.timestamp_first_active.dt.month
    dataframe_tt['tfa_year'] = dataframe_tt.timestamp_first_active.dt.year
    dataframe_tt['tfa_hour'] = dataframe_tt.timestamp_first_active.dt.hour
    dataframe_tt['tfa_week_number'] = np.array([d.isocalendar()[1] for d in first_acc_dates])
    dataframe_tt['tfa_week_day'] = np.array([d.weekday() for d in first_acc_dates])

    dataFrame_tt_wd = pd.get_dummies(dataframe_tt.tfa_week_day, prefix='tfa_week_day')
    dataframe_tt = dataframe_tt.drop(['timestamp_first_active', 'tfa_week_day'], axis=1)
    dataframe_tt = pd.concat((dataframe_tt, dataFrame_tt_wd), axis=1)

    # log(1 + |seconds|) and sign of (account created - first active)
    dataframe_tt['dac_tfa_secs'] = np.array([np.log(1+abs((acc_cret_dates[i]-first_acc_dates[i]).total_seconds())) for i in range(len(acc_cret_dates))])
    dataframe_tt['sig_dac_tfa'] = np.array([np.sign((acc_cret_dates[i]-first_acc_dates[i]).total_seconds()) for i in range(len(acc_cret_dates))])

    def indicator_season(date_for_season):
        """Map a datetime to 0 (winter), 1 (spring), 2 (summer) or 3 (autumn) using fixed date ranges."""
        # Move every date into the same (leap) year so only month/day decide the season
        date_for_season = date_for_season.date().replace(year=2000)

        winter = [date(2000,  1,  1), date(2000,  3, 20),
                  date(2000, 12, 21), date(2000, 12, 31)]
        spring = [date(2000,  3, 21), date(2000,  6, 20)]
        summer = [date(2000,  6, 21), date(2000,  9, 22)]
        autumn = [date(2000,  9, 23), date(2000, 12, 20)]

        if (winter[0] <= date_for_season <= winter[1]) or (winter[2] <= date_for_season <= winter[3]):
            sesn = 0
        elif spring[0] <= date_for_season <= spring[1]:
            sesn = 1
        elif summer[0] <= date_for_season <= summer[1]:
            sesn = 2
        elif autumn[0] <= date_for_season <= autumn[1]:
            sesn = 3
        return sesn

    # Season feature for the account-created and first-active dates
    dataframe_tt['season_dac'] = np.array([indicator_season(date_for_season) for date_for_season in acc_cret_dates])
    dataframe_tt['season_tfa'] = np.array([indicator_season(date_for_season) for date_for_season in first_acc_dates])

    # Pre-processing 'age': values in (1900, 2000) become 2014 - value (apparently
    # birth years -- assumption), then out-of-range values are mapped to fixed codes
    age_value = dataframe_tt.age.values
    age_value = np.where((age_value < 2000) & (age_value > 1900), 2014 - age_value, age_value)
    age_value = np.where((age_value < 14) & (age_value > 0), 4, age_value)
    age_value = np.where((age_value < 2016) & (age_value > 2010), 9, age_value)
    age_value = np.where(age_value > 99, 110, age_value)
    dataframe_tt['age'] = age_value

    # 5-year bucket boundaries 0, 5, ..., 100
    age_interval = [i for i in range(0, 101, 5)]

    def get_interv_value(age):
        """Return the index of the first boundary greater than `age`, or 20 if none is."""
        interval_value = 20
        for i in range(len(age_interval)):
            if age < age_interval[i]:
                interval_value = i
                break
        return interval_value

    dataframe_tt['age_interv'] = dataframe_tt.age.apply(lambda x: get_interv_value(x))
    dataFrame_age_interval = pd.get_dummies(dataframe_tt.age_interv, prefix='age_interv')
    dataframe_tt = dataframe_tt.drop(['age_interv'], axis=1)
    dataframe_tt = pd.concat((dataframe_tt, dataFrame_age_interval), axis=1)

    # One-hot encode the categorical user columns
    one_hot_features = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
    for feature in one_hot_features:
        dataFrame_tt_dummy = pd.get_dummies(dataframe_tt[feature], prefix=feature)
        dataframe_tt = dataframe_tt.drop([feature], axis=1)
        dataframe_tt = pd.concat((dataframe_tt, dataFrame_tt_dummy), axis=1)

    # Merge user features with session features on 'id'; users without sessions get -2
    dataframe_tt.reset_index(drop=True, inplace=True)
    session_matrix_dataframe.reset_index(drop=True, inplace=True)
    final_dataFrame = pd.merge(dataframe_tt, session_matrix_dataframe, on="id", how='left')
    final_dataFrame = final_dataFrame.fillna(-2)
    final_dataFrame['all_null'] = np.array([sum(r < 0) for r in final_dataFrame.drop(['id'], axis=1).values])

    # Start from an all-zero frame with the saved column order, then overwrite the
    # columns that exist for this input; columns absent here stay at zero
    tr_cols = in_dict["features"]["fin_cols"]
    ts_cols = list(final_dataFrame.columns)
    test_formated = pd.DataFrame(data=np.zeros((s_id.shape[0], len(tr_cols))), columns=tr_cols)
    for i in ts_cols:
        test_formated[i] = final_dataFrame[i]

    # Drop the first column, keep the saved feature positions, then scale
    feature_values = test_formated.values[:, 1:]
    feature_values = feature_values[:, in_dict["features"]["imp_features"]]
    X_test = in_dict["StandardScaler"].transform(feature_values)

    # load model and label encoder
    xgb = joblib.load("xgb_8000_best")
    lab_encoder = in_dict["lab_encoder"]

    # final predictions
    if prob_label == "label":
        return lab_encoder.inverse_transform(xgb.predict(X_test))
    return xgb.predict_proba(X_test)


# FUNCTION-2 DEFINITION
def FUNCTION2(prediction,Target_Labels):
    """
    Compute the mean NDCG@5 of predicted class probabilities.

    Parameters
    ----------
    prediction : ndarray
        Per-sample class scores (one row per sample). A 1-D array is
        rejected: a message is printed and 0.0 is returned.
    Target_Labels : pd.Series
        True labels; encoded with ``in_dict["lab_encoder"].transform``.

    Returns
    -------
    float
        Mean of the per-sample NDCG@5 values, or 0.0 for 1-D input.
    """

    # A 1-D array cannot be ranked per sample, so bail out early
    if len(prediction.shape) == 1:
        print("Error: FUNCTION2() needs Target probability for Score please Rerun FUNCTION1() with Output_choice as 2.")
        return 0.0

    encoded_truth = in_dict["lab_encoder"].transform(Target_Labels)

    def _dcg_at_k(relevance, ranking_scores, k=5):
        """DCG of `relevance` taken in descending order of `ranking_scores`, cut at k."""
        top_positions = np.argsort(ranking_scores)[::-1][:k]
        gains = 2 ** np.take(relevance, top_positions) - 1
        discounts = np.log2(np.arange(len(gains)) + 2)
        return np.sum(gains / discounts)

    def _mean_ndcg(truth, predictions, k=5):
        """Average over samples of DCG@k divided by the ideal DCG@k."""
        # One-hot relevance rows; the binarizer is fitted on 0..n_classes inclusive
        binarizer = LabelBinarizer()
        binarizer.fit(range(predictions.shape[1] + 1))
        relevance_rows = binarizer.transform(truth)

        per_sample = [
            float(_dcg_at_k(relevance, scores, k)) / float(_dcg_at_k(relevance, relevance, k))
            for relevance, scores in zip(relevance_rows, predictions)
        ]
        return np.mean(per_sample)

    return _mean_ndcg(encoded_truth, prediction, 5)

# Run the cell below to invoke FUNCTION-1()

In [10]:
# RUN THIS CELL TO CALL FUNCTION-1()

# Draws random rows from 'train_users.csv' (and, for Data choice 2, the matching
# rows of sess_csv) according to the choices entered below, then uses them as
# the test samples passed to FUNCTION1().

# Code to obtain various choices to perform required action
print("Enter number 1 or 2 based on your choice")
output_choice=int(input("1.Enter Output choice: 1.Target labels  2.Target probability:>> "))
print("---------------------- "*5)
prediction_choice=int(input("2.Enter Prediction choice:  1.Predict for single sample point  2.Predict for Random set of samples points:>> "))
print("---------------------- "*5)
Data_choice=int(input("3.Enter Data choice:  1.Predict using only train data  2.Predict using both train and sess_log data:>> "))
print("---------------------- "*5)

# An unrecognised choice would otherwise leave tr_data/sess_data undefined, or
# silently reuse the values of an earlier run of this cell.
if prediction_choice not in (1, 2):
    raise ValueError("Prediction choice must be 1 or 2, got {}".format(prediction_choice))
if Data_choice not in (1, 2):
    raise ValueError("Data choice must be 1 or 2, got {}".format(Data_choice))

# number of user rows to sample
if prediction_choice==1:
    n=1
else:
    n=int(input("4.Select the size for Random set of samples points for Prediction: "))
    print("---------------------- "*5)

# read the user table once; the retry loop below only re-samples it
users_csv=pd.read_csv('train_users.csv')

if Data_choice==1:
    tr_data, sess_data=users_csv.sample(n), None
else:
    # re-sample until at least one sampled user has session rows
    while True:
        tr_data=users_csv.sample(n)
        sess_data=sess_csv[sess_csv['user_id'].isin(list(tr_data["id"].values))]
        if sess_data.shape[0]!=0:
            break

# split off the target column when the sampled file carries it
if 'country_destination' in tr_data.columns:
    Target_Labels=tr_data['country_destination']
    tr_data=tr_data.drop(['country_destination'], axis=1)
else:
    # avoid scoring against labels left over from a previous run
    Target_Labels=None


# printing Test_set info
print("--------------"*2)
print("|  <<Selected Data Info>>  |")
print("--------------"*2)
print("| Test_Data_size:  {} |".format(tr_data.shape))
if Data_choice!=1: print("| Sess_Data_size: {} |".format(sess_data.shape))
# "probality" (sic) is the spelling FUNCTION1 compares against
prob_label= "probality" if output_choice==2 else "label"


# Invoke FUNCTION-1()
prediction=FUNCTION1([tr_data, sess_data], prob_label)

print("| Target_Prediction_size: {}|".format(len(prediction)))
print("--------------"*2)
print("----------"*11)


# print Formated Final Predictions
if (prob_label=="label"):
    print("\nPrediction Samples:",prediction[0:5])

else:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html
    # show the class probabilities of the first sample as a dict and as a bar chart
    single_out=dict(zip(in_dict["lab_encoder"].classes_, np.round(prediction[0],3)))
    print("\nPrediction Sample:",single_out)
    df=pd.DataFrame({"Destinations":in_dict["lab_encoder"].classes_,"Probabilites":np.round(prediction[0],5)})
    df.plot.bar(x="Destinations", y="Probabilites",title="Probabilistic Prediction for a point",cmap="Dark2",figsize=(10,6))

Enter number 1 or 2 based on your choice
1.Enter Output choice: 1.Target labels  2.Target probability:>> 1
---------------------- ---------------------- ---------------------- ---------------------- ---------------------- 
2.Enter Prediction choice:  1.Predict for single sample point  2.Predict for Random set of samples points:>> 2
---------------------- ---------------------- ---------------------- ---------------------- ---------------------- 
3.Enter Data choice:  1.Predict using only train data  2.Predict using both train and sess_log data:>> 1
---------------------- ---------------------- ---------------------- ---------------------- ---------------------- 
4.Select the size for Random set of samples points for Prediction: 50
---------------------- ---------------------- ---------------------- ---------------------- ---------------------- 
----------------------------
|  <<Selected Data Info>>  |
----------------------------
| Test_Data_size:  (50, 15) |
| Target_Prediction_size: 

# Run the cell below to invoke FUNCTION-2() (requires FUNCTION-1() to have been run with Output choice 2, i.e. target probabilities)

In [4]:
# RUN THIS CELL TO CALL FUNCTION-2()
# `prediction` and `Target_Labels` come from the FUNCTION-1 cell. If `prediction`
# is 1-D, FUNCTION2 prints an error message and returns 0.0 instead of a score.
NDCG_Score=FUNCTION2(prediction,Target_Labels)
print("NDCG_Score:",np.round(NDCG_Score,5))

NDCG_Score: 0.80349


## Run the cells below to see the test data that was randomly selected for prediction

In [5]:
# Renumber the sampled user rows 0..n-1 (in place, discarding the old index) and show the first four.
tr_data.reset_index(drop=True,inplace=True) 
tr_data.head(4)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,l2i8p7kf7m,2012-01-28,20120128101544,,-unknown-,,basic,1,en,sem-non-brand,google,,Web,Other/Unknown,-unknown-
1,e9d4umazv6,2013-06-04,20130604193141,,FEMALE,40.0,basic,0,en,direct,direct,untracked,Web,Windows Desktop,Chrome
2,v7ovrhq7ux,2014-03-10,20140310071224,,FEMALE,29.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
3,5s28lmpgu1,2013-12-27,20131227164631,2014-03-05,FEMALE,42.0,facebook,0,en,sem-brand,google,linked,Web,Mac Desktop,Safari


In [6]:
# Column names of the sampled user rows ('country_destination' was removed in the FUNCTION-1 cell when present).
tr_data.columns

Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser'],
      dtype='object')

In [7]:
# Preview the session rows that were passed to FUNCTION-1().
# FUNCTION1 adds an 'id' column to the session frame it receives; remove it here.
# errors="ignore" keeps the cell re-runnable once 'id' is gone, and the None check
# covers Data choice 1, where no session data is selected.
if sess_data is None:
    print("No session data was selected (Data choice 1).")
else:
    sess_data.drop(columns=["id"],inplace=True,errors="ignore")
sess_data.head(4) if sess_data is not None else None

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
174502,ldgtf9goal,dashboard,view,dashboard,Mac Desktop,232564.0
174503,ldgtf9goal,authenticate,submit,login,Mac Desktop,
174504,ldgtf9goal,header_userpic,data,header_userpic,Mac Desktop,336.0
174505,ldgtf9goal,personalize,data,wishlist_content_update,Mac Desktop,138.0


In [8]:
# Column names of the selected session rows. sess_data is None when Data choice 1
# was entered in the FUNCTION-1 cell, in which case this attribute access raises.
sess_data.columns

Index(['user_id', 'action', 'action_type', 'action_detail', 'device_type',
       'secs_elapsed'],
      dtype='object')