## Importing all required Modules

In [1]:
# all imports
import numpy as np
import pandas as pd
import xgboost, joblib, warnings
import matplotlib.pyplot as plt
from datetime import datetime, date
from sklearn.metrics import log_loss
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,StandardScaler
# Load the persisted artefact dictionary from the file "in_dict" in the working directory.
# Functions below read in_dict["features"]["fin_cols"],
# in_dict["features"]["selected_features_names"] and in_dict["Best_Model"][...] from it.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load this file from a trusted source.
in_dict=joblib.load("in_dict")

## Setting up the paths to the CSV data files

In [0]:
# Locations of the three input CSV files
train_path = 'train_users.csv'
test_path = 'test_users.csv'
sess_path = 'sessions.csv'

# Lookup handed to Data_Preparation: dataset name -> CSV path
Data_Path = dict(train=train_path, test=test_path, session=sess_path)

## Data Preprocessing and Feature extraction

In [1]:
# Function for Data Preprocessing and feature extraction

def Data_Preparation(Data_Path,Invoke_by_module=False):
    
    """
    -----------------------------------------------------------
    Perform Data Preprocessing and Feature Extraction
        1.Read Csv Files   
        2.Preprocess Data   
        3.Extract Features  
        4.Format Data      
        5.Return Formated Data
    -----------------------------------------------------------
        Parameters
        ----------
        Data_Path <Dictionary>    : Must contain the keys "train", "test" and "session".
                                    "test" and "session" are paths passed to pd.read_csv.
                                    "train" is either a path passed to pd.read_csv or an
                                    already loaded DataFrame.
        Invoke_by_module <Boolean>: Only selects which banner line is printed
                                    (True: "Data_Preparation Module").
    
        returns 
        --------
        Final_Formated <DataFrame>  : Train rows followed by test rows, user features merged
                                      (left join on 'id') with the per-user session features.
                                      Starts from the columns listed in
                                      in_dict["features"]["fin_cols"] (zero filled) and is then
                                      overwritten/extended with every column computed here.
        Target_Labels <pd.series>   : 'country_destination' column of the train data.

        Notes
        -----
        Relies on the module-level `in_dict` and prints progress to stdout.
    -----------------------------------------------------------

    """

    if (Invoke_by_module):
        print("Data_Preparation Module")
    else:
        print("Invoking Function for Data Preparaion.....")
    
    print("1.Reading Csv Files    ..",end =" ")
    
    # ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    # reading csv files
    # FIX: the train entry used to be kept as the raw path string, which made the
    # column access below fail. Read it like the other two files, but still accept
    # a DataFrame for callers that pass already-loaded data.
    train_source = Data_Path["train"]
    if isinstance(train_source, pd.DataFrame):
        train_dataframe = train_source
    else:
        train_dataframe = pd.read_csv(train_source)
    print("..",end =" ")
    test_dataframe = pd.read_csv(Data_Path["test"])
    print("..",end =" ")
    sessions_dataframe = pd.read_csv(Data_Path["session"])
    print(">>> |Done| <1/5>")
    
    print("2.Preprocessing Data   ..",end =" ")
    
    # storing target; exposing the session user_id under the name 'id' so it can be merged later
    Target_Labels = train_dataframe['country_destination']
    sessions_dataframe['id'] = sessions_dataframe['user_id']
    print("..",end =" ")

    # droping columns from dataframe
    train_dataframe = train_dataframe.drop(['country_destination'], axis=1)
    sessions_dataframe = sessions_dataframe.drop(['user_id'],axis=1) 
    print("..",end =" ")

    # Pre-processing Session data 
    # ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
    # Replacing all null value with the string "NAN" so that missing is its own category
    sessions_dataframe.action = sessions_dataframe.action.fillna('NAN')
    sessions_dataframe.action_type = sessions_dataframe.action_type.fillna('NAN')
    sessions_dataframe.action_detail = sessions_dataframe.action_detail.fillna('NAN')
    sessions_dataframe.device_type = sessions_dataframe.device_type.fillna('NAN')
    print(">>> |Done| <2/5>")
    
    print("3.Extracting Features ",end =" ")
    
    # Keeping Thresold value as 100 
    action_threshold = 100 

    # Any action occurring fewer than 100 times overall is replaced by 'OTHER'
    actions = dict(zip(*np.unique(sessions_dataframe.action, return_counts=True)))
    sessions_dataframe.action = sessions_dataframe.action.apply(lambda x: 'OTHER' if actions[x] < action_threshold else x)
    
    # ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html
    # value_counts().argsort() yields a Series indexed by category value whose entries are a
    # permutation of 0..n-1; it is used below as "category -> slot in the count vector"
    action_frequency = sessions_dataframe.action.value_counts().argsort()
    action_detail_frequency = sessions_dataframe.action_detail.value_counts().argsort()
    action_type_frequency = sessions_dataframe.action_type.value_counts().argsort()
    device_type_frequency = sessions_dataframe.device_type.value_counts().argsort()
    
    # ref:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html
    # Grouping session data by 'id' 
    session_group = sessions_dataframe.groupby(['id'])

    # one feature row per user id
    matrix = []
    print("..",end =" ")
    
    # iterating through individual groups; each item is (group key, group rows)
    for group in session_group:

        # single group by id
        group_set = group[1]

        # feature extraction
        features = []    
        features.append(group[0]) # id
        features.append(len(group_set)) # length of current group

        # replacing all nan value by 0 in 'secs_elapsed' feature
        secs = group_set.secs_elapsed.fillna(0).values   

        # ref: https://stackoverflow.com/questions/28663856/how-to-count-the-occurrence-of-certain-item-in-an-ndarray-in-python
        # ref: https://blog.csdn.net/Datawhale/article/details/80847662.
        # Action feature value counts, no of unique actions, mean and std.  
        action_count = [0] * len(action_frequency)
        for i,v in enumerate(group_set.action.values):
            action_count[action_frequency[v]] += 1
        _, action_unique_count = np.unique(group_set.action.values, return_counts=True)
        action_count += [len(action_unique_count), np.mean(action_unique_count), np.std(action_unique_count)]
        features = features + action_count

        # Action_detail feature value counts, no of unique Action_details, mean and std.     
        action_detail_count = [0] * len(action_detail_frequency)
        for i,v in enumerate(group_set.action_detail.values):
            action_detail_count[action_detail_frequency[v]] += 1 
        _, action_detail_unique_count = np.unique(group_set.action_detail.values, return_counts=True)
        action_detail_count += [len(action_detail_unique_count), np.mean(action_detail_unique_count), np.std(action_detail_unique_count)]
        features = features + action_detail_count

        # Action_type feature value counts, no of unique Action_type, mean and std, log(1 + sum of secs_elapsed) 
        action_type_secs = [0] * len(action_type_frequency)
        action_type_count = [0] * len(action_type_frequency)
        for i,v in enumerate(group_set.action_type.values):
            action_type_secs[action_type_frequency[v]] += secs[i]   
            action_type_count[action_type_frequency[v]] += 1  
        action_type_secs = np.log(1 + np.array(action_type_secs)).tolist()
        _, action_type_unique_count = np.unique(group_set.action_type.values, return_counts=True)
        action_type_count += [len(action_type_unique_count), np.mean(action_type_unique_count), np.std(action_type_unique_count)]
        features = features + action_type_count + action_type_secs    

        # device_type feature value counts, no of unique device_type, mean and std.     
        device_type_count  = [0] * len(device_type_frequency)
        for i,v in enumerate(group_set.device_type .values):
            device_type_count[device_type_frequency[v]] += 1 
        # NOTE(review): the number of unique device types ends up in the row twice (here and
        # as the first entry of the list added below). Kept on purpose: removing it would
        # shift every following feat_N column.
        device_type_count.append(len(np.unique(group_set.device_type.values)))
        _, device_type_unique = np.unique(group_set.device_type.values, return_counts=True)
        device_type_count += [len(device_type_unique), np.mean(device_type_unique), np.std(device_type_unique)]        
        features = features + device_type_count    

        # creating features from 'secs_elapsed' feature
        secs_features = [0] * 5 
        log_bin = [0] * 15

        # stats features: log(1 + x) of sum, mean, std and median, plus log-sum per session row
        if len(secs) > 0:
            secs_features[0] = np.log(1 + np.sum(secs))
            secs_features[1] = np.log(1 + np.mean(secs)) 
            secs_features[2] = np.log(1 + np.std(secs))
            secs_features[3] = np.log(1 + np.median(secs))
            secs_features[4] = secs_features[0] / float(features[1])

            # bined features  ref: https://docs.scipy.org/doc/numpy/reference/generated/numpy.bincount.html
            # histogram of int(log(1 + secs)) with at least 15 bins
            secs_log = np.log(1 + secs).astype(int)
            log_bin = np.bincount(minlength=15, x=secs_log).tolist()                      
        features = features + secs_features + log_bin

        # final feature matrix
        matrix.append(features)
    print("..",end =" ")
    
    # creating feature names for matrix (one per column except the leading id)
    feat_names = []    
    for i in range(len(matrix[0])-1):
        feat_names.append('feat_' + str(i)) 

    # converting feature matrix to array; column 0 is the id, the rest is cast to float16
    matrix = np.array(matrix)
    matrix_array = matrix[:, 1:].astype(np.float16)
    matrix_id = matrix[:, 0]   

    # creating dataframe from array matrix
    session_matrix_dataframe = pd.DataFrame(matrix_array, columns=feat_names)
    session_matrix_dataframe['id'] = matrix_id
    session_matrix_dataframe.index = session_matrix_dataframe.id

    # ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html
    # Train and test dataframes concatination (train rows first)
    dataframe_tt = pd.concat((train_dataframe, test_dataframe), axis=0, ignore_index=True)
    dataframe_tt.index = dataframe_tt.id

    # Pre-processing user data: nulls and the '-unknown-' marker both become -1
    dataframe_tt = dataframe_tt.fillna(-1)  
    dataframe_tt = dataframe_tt.replace('-unknown-', -1)
    dataframe_tt = dataframe_tt.drop(['date_first_booking'], axis=1)

    # Feature extration from timestamp feature 'date_account_created'
    dataframe_tt['n_null'] = np.array([sum(r == -1) for r in dataframe_tt.values]) # no of -1 (missing) entries per row
    date_acc_crt = np.vstack(dataframe_tt.date_account_created.astype(str).apply(lambda x: x.split('-')).values)
    date_acc_crt=date_acc_crt.astype(int) 
    dataframe_tt['dac_year'] = date_acc_crt[:,0]  # date_account_created year
    dataframe_tt['dac_month'] = date_acc_crt[:,1] # date_account_created month
    dataframe_tt['dac_day'] = date_acc_crt[:,2]   # date_account_created day
    
    # ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.isocalendar.html
    acc_cret_dates = [datetime(x[0],x[1],x[2]) for x in date_acc_crt]
    dataframe_tt['dac_week_number'] = np.array([d.isocalendar()[1] for d in acc_cret_dates]) # date_account_created week_number
    dataframe_tt['dac_week_day'] = np.array([d.weekday() for d in acc_cret_dates]) # date_account_created week_day

    # one-hot-encoding for week_day
    dataFrame_tt_wd = pd.get_dummies(dataframe_tt.dac_week_day, prefix='dac_week_day')  
    dataframe_tt = dataframe_tt.drop(['date_account_created', 'dac_week_day'], axis=1)
    dataframe_tt = pd.concat((dataframe_tt, dataFrame_tt_wd), axis=1)
    print("..",end =" ")
    
    # function
    def func(s):
        
        """
        Convert a YYYYMMDDHHMMSS value (integer or string) to a datetime.
        Input type:  Integer
        return type: Datetime
        """
        
        s=str(s)
        return datetime(year=int(s[0:4]), month=int(s[4:6]), day=int(s[6:8]),
               hour=int(s[8:10]), minute=int(s[10:12]), second=int(s[12:]))

    # Feature extration from feature 'timestamp_first_active'
    dataframe_tt['timestamp_first_active'] = pd.to_datetime(dataframe_tt.timestamp_first_active.apply(func))
    first_acc_dates= list(dataframe_tt['timestamp_first_active'])
    dataframe_tt['tfa_day'] = dataframe_tt.timestamp_first_active.dt.day     # first_active day
    dataframe_tt['tfa_month'] = dataframe_tt.timestamp_first_active.dt.month # first_active month
    dataframe_tt['tfa_year'] = dataframe_tt.timestamp_first_active.dt.year   # first_active year
    dataframe_tt['tfa_hour'] = dataframe_tt.timestamp_first_active.dt.hour   # first_active hour
    dataframe_tt['tfa_week_number'] = np.array([d.isocalendar()[1] for d in first_acc_dates]) # first_active week_number
    dataframe_tt['tfa_week_day'] = np.array([d.weekday() for d in first_acc_dates]) # first_active week_day

    # one-hot-encoding for week_day 
    dataFrame_tt_wd = pd.get_dummies(dataframe_tt.tfa_week_day, prefix='tfa_week_day') 
    dataframe_tt = dataframe_tt.drop(['timestamp_first_active', 'tfa_week_day'], axis=1)
    dataframe_tt = pd.concat((dataframe_tt, dataFrame_tt_wd), axis=1)
    
    # ref: https://drive.google.com/file/d/1NIq_IuTgmPEWUKYBv_nfU4rwIUoykYLE/view?usp=sharing
    # log(1 + |gap in seconds|) and sign of the gap between account_created and first_active
    dataframe_tt['dac_tfa_secs'] = np.array([np.log(1+abs((acc_cret_dates[i]-first_acc_dates[i]).total_seconds())) for i in range(len(acc_cret_dates))])
    dataframe_tt['sig_dac_tfa'] = np.array([np.sign((acc_cret_dates[i]-first_acc_dates[i]).total_seconds()) for i in range(len(acc_cret_dates))])
    print(">>> |Done| <3/5>")
    
    # function indicator_season
    def indicator_season(date_for_season):
        
        """
        Map a datetime to a season code, ignoring its year:
        0 = winter, 1 = spring, 2 = summer, 3 = autumn.
        Input type:  datatime type
        return type: Integer
        """
        
        # project every date onto the (leap) year 2000 so only month/day matter
        date_for_season=date_for_season.date().replace(year=2000)

        # winter wraps around the year end, hence two ranges
        winter=[date(2000,  1,  1),date(2000,  3, 20),
                date(2000, 12, 21),date(2000, 12, 31)]    
        spring=[date(2000,  3, 21),  date(2000,  6, 20)]  
        summer=[date(2000,  6, 21),  date(2000,  9, 22)]
        autumn=[date(2000,  9, 23),  date(2000, 12, 20)]  

        if (winter[0]<=date_for_season<=winter[1]) or (winter[2]<=date_for_season<=winter[3]):
            sesn=0
        elif spring[0]<=date_for_season<=spring[1]:
            sesn=1
        elif summer[0]<=date_for_season<=summer[1]:
            sesn=2
        elif autumn[0]<=date_for_season<=autumn[1]:
            sesn=3
        return sesn

    print("4.Formatting Data      ..",end =" ")
    
    # Extracting Season feature from account_created and first_active dates
    dataframe_tt['season_dac'] = np.array([indicator_season(date_for_season) for date_for_season in acc_cret_dates])
    dataframe_tt['season_tfa'] = np.array([indicator_season(date_for_season) for date_for_season in first_acc_dates])

    # ref: https://docs.scipy.org/doc/numpy/reference/generated/numpy.where.html
    # Pre-processing 'age' feature
    age_value = dataframe_tt.age.values
    age_value = np.where((age_value<2000)&(age_value>1900), 2014-age_value, age_value) # values in (1900, 2000) are treated as birth years
    age_value = np.where((age_value<14)&(age_value>0), 4, age_value) # ages in (0, 14) are collapsed to 4
    age_value = np.where((age_value<2016)&(age_value>2010), 9, age_value) # values in (2010, 2016) are collapsed to 9
    age_value = np.where(age_value>99, 110, age_value) # anything above 99 is collapsed to 110
    dataframe_tt['age'] = age_value
    print("..",end =" ")
    
    # ref: https://drive.google.com/file/d/1NIq_IuTgmPEWUKYBv_nfU4rwIUoykYLE/view
    # Age-binning with boundaries 0, 5, ..., 100 
    age_interval =[i for i in range(0,101,5)]
    def get_interv_value(age):
        
        """
        Return the index of the first boundary in age_interval that is greater
        than age, or 20 when age is not below any boundary.
        input type:  integer(age)
        return type: integer(interval_value)
        """
        
        interval_value = 20
        for i in range(len(age_interval)):
            if age < age_interval[i]:
                interval_value = i 
                break
        return interval_value
    dataframe_tt['age_interv'] = dataframe_tt.age.apply(lambda x: get_interv_value(x))

    # one-hot-encoding binned age features
    dataFrame_age_interval = pd.get_dummies(dataframe_tt.age_interv, prefix='age_interv')
    dataframe_tt = dataframe_tt.drop(['age_interv'], axis=1)
    dataframe_tt = pd.concat((dataframe_tt, dataFrame_age_interval), axis=1)
    print("..",end =" ")
    
    # ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
    # Creating dummy variables (one-hot-encoding) for the categorical user features
    one_hot_features = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 
                        'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
    for feature in one_hot_features:
        dataFrame_tt_dummy = pd.get_dummies(dataframe_tt[feature], prefix=feature)
        dataframe_tt = dataframe_tt.drop([feature], axis=1)
        dataframe_tt = pd.concat((dataframe_tt, dataFrame_tt_dummy), axis=1)    
    # drop the id index on both frames so that 'id' is only a column when merging
    dataframe_tt.reset_index(drop=True,inplace=True)
    session_matrix_dataframe.reset_index(drop=True,inplace=True)
    print(">>> |Done| <4/5>")
    
    print("5.Returning Final Data ..",end =" ")
    
    # Left-merging session features onto the user rows on 'id' to obtain final dataframe
    Final_DataFrame = pd.merge(dataframe_tt, session_matrix_dataframe, on="id",how='left')
    print("..",end =" ")

    # replace all nan (users without session rows) by -2
    Final_DataFrame = Final_DataFrame.fillna(-2) 
    print("..",end =" ")
    
    # Count of negative entries (the -1 / -2 missing markers) for each row
    Final_DataFrame['all_null'] = np.array([sum(r<0) for r in Final_DataFrame.drop(['id'], axis=1).values]) 
    print(">>>",end =" ")

    # formating data to avoid data_leakage: start from the stored column list, zero filled,
    # then copy in every column computed above
    # NOTE(review): columns computed here that are not in fin_cols are appended as well -- confirm this is intended
    tr_cols, ts_cols=in_dict["features"]["fin_cols"], list(Final_DataFrame.columns)
    Final_Formated=pd.DataFrame(data=np.zeros((Final_DataFrame.shape[0],len(tr_cols))),columns=tr_cols)

    for i in ts_cols:
        Final_Formated[i]=Final_DataFrame[i]
    
    print("|Done| <5/5>")
    
    # returning prepared data
    return Final_Formated, Target_Labels

## Feature Selection by using Feature Importance

In [0]:
def Feature_Selection(Final_DataFrame, Target_Labels, Keep_percent=0.7, kag_sub=False, Rerun=True):
    
    """
    -----------------------------------------------------------------------------------------
    Perform Feature Selection with Data for preserving specified percent of important features 
        1.Performing Train Validation split on Data 
        2.Training Xgboost for Feature Importance
        3.Format Data by using Importance Score
        4.Returning Data and Selected Features               
    ------------------------------------------------------------------------------------------
        Parameters
        ----------
        Final_DataFrame <DataFrame> : Data that is Preprocessed and Formated. The first
                                      len(Target_Labels) rows are used as training data, the
                                      remaining rows as test data. The first column is split
                                      off and not used as a feature.
        Target_Labels <pd.series>   : Target variables
        Keep_percent <float>        : (default=0.7) 
                                      value between 0-1 which specify percent of 
                                      Features to preserve. Features are kept when their
                                      score is strictly greater than the score found at that
                                      rank; 1.0 keeps every feature.
        kag_sub <Boolean>           : (default=False) 
                                       True : Feature_Selection for kaggle submission 
                                       False: Normal Feature_Selection 
        Rerun <Boolean>             : (default=True)
                                       True : Rerun Model for Feature selection 
                                       False: Load saved Model for Feature_Selection                                 
                                       
        returns 
        --------
        Data                        : kag_sub=False -> numpy array of the training rows with
                                      only the Selected Features.
                                      kag_sub=True  -> list
                                      [Train_x, CV_x, Train_y, CV_y, Test_x, test_id, label_en]
        Imp_Features <numpy array>  : Boolean mask of Selected Features 
    -------------------------------------------------------------------------------------------

    """
    
    
    if kag_sub:print("Feature Selection")
    print("1.Performing Train Validation split on Data ..",end =" ")
    
    # Data inintialization: train rows first, test rows after
    len_tr = len(Target_Labels)
    values = Final_DataFrame.values
    data = values[:len_tr]
    Test_x = values[len_tr:]
    print("..",end =" ")
    
    # lable encoding
    label_en = LabelEncoder()
    labels = label_en.fit_transform(Target_Labels.values)
    print("..",end =" ")

    # train val split
    # ref: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    random_state=0
    Train_x, CV_x, Train_y, CV_y = train_test_split(data, labels, test_size=0.2, random_state=random_state,stratify=labels)
    # column 0 is split off (kept as test_id for the test rows) and dropped from the features
    # NOTE(review): presumably column 0 is the user id -- confirm against the column order of the input frame
    test_id, Train_x, CV_x, Test_x=Test_x[:,0:1], Train_x[:,1:], CV_x[:,1:], Test_x[:,1:]
    data=data[:,1:]
    print(">>> |Done| <1/3>")
    
    print("2.Training Xgboost for Feature Importance   .. ..",end =" ")
    
    # ref: https://xgboost.readthedocs.io/en/latest/parameter.htm
    # Feature selection by Xg-boost
    if Rerun:
        
        # FIX: the key was misspelled 'max_depthresold', i.e. not a parameter XGBoost knows.
        # 6 is also the library default, so the trained model is unchanged.
        params = {'eta': 0.09,'max_depth': 6,'subsample': 0.5,'colsample_bytree': 0.5,'objective': 'multi:softprob',
                              'eval_metric': 'mlogloss','num_class': 12,"n_jobs":6, "silent": 1}
        num_rounds = 900
        xg_tr = xgboost.DMatrix(Train_x, label=Train_y)  
        xg_cv = xgboost.DMatrix(CV_x, label=CV_y)  
        watchlist = [(xg_tr,'TRAIN'), (xg_cv, 'CV')]
        xgb = xgboost.train(params, xg_tr, num_rounds, watchlist, early_stopping_rounds=10,verbose_eval=0)
        
        print("..",end =" ")
    
    # load saved model
    else:
        
        xgb=in_dict["Best_Model"]["Feature_Sel_Model"]
        print("..",end =" ")
        
    print(">>> |Done| <2/3>")
    
    
    print("3.Returning Selected Features               .. ..",end =" ")
    
    # getting feature importance for each feature that the model reports
    imp_scores = xgb.get_fscore()

    # mapping important feature value with features; keys look like 'f<column index>',
    # features missing from the dict keep a score of 0
    Imp_Features = np.zeros(Train_x.shape[1])
    for k,v in imp_scores.items():
        Imp_Features[int(k[1:])] = v

    # normalization of feature importance    
    Imp_Features = Imp_Features/float(np.max(Imp_Features))

    # finding thresold to select important feature
    score=Imp_Features
    ranked = np.sort(score)[::-1]
    cut = int(len(score)*Keep_percent)
    if cut >= len(ranked):
        # FIX: Keep_percent=1.0 used to index one past the end (IndexError); keep everything
        Imp_Features = np.ones(len(score), dtype=bool)
    else:
        thresold = ranked[cut] # score at the Keep_percent rank
        Imp_Features = score > thresold
    print("..",end =" ")
    
    # Re-initilaizing data to keep only important feature
    data = data[:, Imp_Features]
    Train_x,CV_x,Test_x = Train_x[:, Imp_Features],CV_x[:, Imp_Features],Test_x[:, Imp_Features]
    print(">>> |Done| <3/3>")

    print('Selected %s Important Features from %s Features' %(Train_x.shape[1], Imp_Features.shape[0]))
    
    # return data after feature selection
    if kag_sub:
        
        data = [Train_x, CV_x, Train_y, CV_y, Test_x, test_id, label_en]
    
    
    return data, Imp_Features

In [0]:
def Get_Imp_Features(Model, Top, Want_to_load=False):
    
    """
    -----------------------------------------------------------------------------------------
    Return the names of the most important features and draw them as a horizontal bar chart
        1.Resolve the model and read its feature_importances_
        2.Rank the features by importance and keep the best `Top`
        3.Plot a barh chart of the kept scores
    -----------------------------------------------------------------------------------------
        Parameters
        ----------
        Model <str> or <ML Model>  : key into in_dict["Best_Model"] (Want_to_load=True) or
                                     an object exposing feature_importances_
        Top <int>                  : number of leading features to return
        Want_to_load <Boolean>     : (default=False)
                                      True : look the model up in in_dict["Best_Model"]
                                      False: use `Model` directly
        
        returns 
        --------        
        Top_Features               : names of the `Top` features, most important first,
                                     taken from in_dict["features"]["selected_features_names"]
    
    -----------------------------------------------------------------------------------------
    
    """
    
    # pick the estimator: stored one or the object handed in
    estimator = in_dict["Best_Model"][Model] if Want_to_load else Model

    # indices of the features ordered from highest to lowest importance
    importances = estimator.feature_importances_
    ranking = importances.argsort()[::-1]

    # names and scores of the leading `Top` features
    all_names = in_dict["features"]["selected_features_names"]
    Top_Features = np.take(all_names, ranking)[:Top]
    leading_scores = np.take(importances, ranking)[:Top]

    # ref: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.barh.html
    # bars are drawn bottom-up, so both sequences are reversed to put the best feature on top
    bar_positions = range(len(Top_Features))
    plt.figure(figsize=(8,6))
    plt.title('Feature Importances')
    plt.barh(bar_positions, leading_scores[::-1], color="green", label="Feature Importance")
    plt.yticks(bar_positions, Top_Features[::-1])
    plt.xlabel('Relative Importance')
    plt.legend(loc=0)
    plt.show()
    
    return Top_Features[:Top]

In [0]:
# Quick reference: signatures of the functions defined in this notebook.
# This is a bare string expression; it does not call anything.
"""
Get_Imp_Features(Model, Top, Want_to_load=False):
Feature_Selection(Final_DataFrame, Target_Labels, Keep_percent=0.7, kag_sub=False, Rerun=True):
Data_Preparation(Data_Path,Invoke_by_module=False):"""