In [None]:
def get_df_info(path):
    """Load a line-delimited JSON file and return it with quick summaries.

    input: path - location of a JSON file holding one record per line
    output: tuple (df, info, head, describe)
            df       - the loaded pandas dataframe
            info     - result of df.info(); that call prints its report and
                       evaluates to None, so this slot is always None
            head     - first rows of df as sample data
            describe - summary statistics of df
    """
    frame = pd.read_json(path, lines=True, orient='records')
    # evaluated in the same order the tuple is laid out: info, head, describe
    info_report = frame.info()
    sample_rows = frame.head()
    summary_stats = frame.describe()
    return frame, info_report, sample_rows, summary_stats

In [None]:
def flatten_multiindex_colnames(df):
    """Collapse multi-level column labels into single strings.

    Each column label is joined with '_' and surrounding whitespace is
    stripped. The columns are replaced on the dataframe that was passed in
    (in place) and that same dataframe is returned.
    """
    flat_names = []
    for level_values in df.columns.values:
        flat_names.append('_'.join(level_values).strip())
    df.columns = flat_names
    return df

In [None]:
def get_success_flag_cust_offer_disc_bogo(offers_master):
    """Flag the (customer, offer) pairs whose BOGO/discount offer succeeded.

    Within each customer/offer history ordered by time, an 'offer completed'
    event counts as a success when either
      * the event right before it is 'offer viewed' and the one before that is
        'offer received' (the offer was viewed before it was completed), or
      * the event right before it is another 'offer completed' (the customer
        consumed two offers in one go and is assumed to have known about them).
    Informational offers are excluded.

    input: offers_master - pandas dataframe with at least the columns
           cust_id, offer_id, event, time and offer_type; its index must be
           unique because lagged values are aligned back on the index
    output: pandas dataframe with cust_id, offer_id and
            cust_offer_success_bogo_disc (always 1) - successful pairs only

    The function works on a private copy, so offers_master is not modified and
    calling it repeatedly gives the same answer.
    """
    pair_keys = ["cust_id", "offer_id"]

    # copy only the columns the rule needs; nothing is written to the caller's frame
    cust_offers = offers_master[pair_keys + ["event", "time", "offer_type"]].copy()

    # sort once and reuse the grouping for every lag; mergesort is stable, so
    # events sharing a timestamp keep the order they have in the input
    events_by_pair = (cust_offers
                      .sort_values(by="time", ascending=True, kind="mergesort")
                      .groupby(pair_keys)["event"])
    cust_offers["lagged_event"] = events_by_pair.shift(1)
    cust_offers["lagged_event_2"] = events_by_pair.shift(2)

    # keep completions of non-informational offers; copy so the flag column
    # is added to an independent frame rather than a slice
    is_completed = cust_offers["event"] == "offer completed"
    not_informational = cust_offers["offer_type"] != "informational"
    completed = cust_offers[is_completed & not_informational].copy()

    viewed_after_receipt = ((completed["lagged_event"] == "offer viewed")
                            & (completed["lagged_event_2"] == "offer received"))
    repeat_completion = completed["lagged_event"] == "offer completed"
    completed["cust_offer_success_bogo_disc"] = np.where(
        viewed_after_receipt | repeat_completion, 1, 0)

    # select the flag before aggregating so max() never touches the text columns
    cust_success_bogo_disc = (completed
                              .groupby(pair_keys)["cust_offer_success_bogo_disc"]
                              .max()
                              .reset_index())

    return cust_success_bogo_disc[
        cust_success_bogo_disc["cust_offer_success_bogo_disc"] == 1]




In [None]:
def calculate_informational_success(offers, transcript):
    """Find the (customer, informational offer) pairs that count as successful.

    Only customers who appear in an informational row of `offers` are
    considered, and bogo/discount events are removed from their history.
    In what remains, ordered by time per customer, an 'offer received' event is
    a success when the next event is 'offer viewed', the one after that is
    'transaction', and the gap between the receipt and that transaction is at
    most the offer's `duration`.

    input: offers - pandas dataframe with cust_id, offer_id, offer_type and
                    duration columns
           transcript - pandas dataframe with cust_id, event, time and a
                    `value` column of dicts; the first value of each dict is
                    used as the offer id
    output: pandas dataframe with cust_id, offer_id and
            cust_offer_success_info (always 1) - successful pairs only

    Relies on the module-level `portfolio` dataframe (offer_id, offer_type),
    which is read as a global rather than passed in.
    """
    # attach an offer id and offer type to every transcript row
    transcript_info = transcript.copy(deep=True)
    transcript_info['offer_id'] = transcript_info['value'].apply(lambda x: list(x.values())[0])
    transcript_info = transcript_info.merge(portfolio[['offer_id', 'offer_type']],
                                            on='offer_id', how='left')

    # customers who received at least one informational offer, plus offer durations
    info_offers = offers[offers['offer_type'] == 'informational']
    info_offers_cust = info_offers[['cust_id']].drop_duplicates()
    duration = info_offers[['offer_id', 'duration']].drop_duplicates()

    trx_info = (transcript_info
                .merge(info_offers_cust, on='cust_id')
                .merge(duration, on='offer_id', how='left')
                .fillna(0))

    # drop bogo/discount events; copy so the new columns land on an
    # independent frame rather than a slice
    trx_info = trx_info[~trx_info['offer_type'].isin(['bogo', 'discount'])].copy()

    # sort once and reuse the grouping for every look-ahead; mergesort is
    # stable, so events sharing a timestamp keep their input order
    by_customer = (trx_info
                   .sort_values(by='time', ascending=True, kind='mergesort')
                   .groupby('cust_id'))
    trx_info['next_event'] = by_customer['event'].shift(-1)
    trx_info['next_event_2'] = by_customer['event'].shift(-2)
    trx_info['next_event_time_2'] = by_customer['time'].shift(-2)

    # NOTE(review): time_diff is in the units of transcript['time'] while
    # duration comes from `offers` - confirm both use the same unit
    trx_info['time_diff'] = trx_info['next_event_time_2'] - trx_info['time']

    received_viewed_bought = ((trx_info['event'] == 'offer received')
                              & (trx_info['next_event'] == 'offer viewed')
                              & (trx_info['next_event_2'] == 'transaction')
                              & (trx_info['time_diff'] <= trx_info['duration']))
    trx_info['cust_offer_success_info'] = np.where(received_viewed_bought, 1, 0)

    # select the flag before aggregating: max() over the whole frame would also
    # try to compare the dict-valued `value` column
    trx_info_success = (trx_info
                        .groupby(['cust_id', 'offer_id'])['cust_offer_success_info']
                        .max()
                        .reset_index())

    return trx_info_success[trx_info_success['cust_offer_success_info'] == 1]






In [None]:
def plot_box_plots_bi_level_multiple(df, y_var, x_var, hue_var, query_var, axes, title_var):
    """Draw grouped boxplots of one variable on a filtered dataframe.

    input: df: pandas dataframe
           y_var: string, column plotted on the y axis
           x_var: string, column used to group the boxes on the x axis
           hue_var: string, column shown in the legend
           query_var: string passed to df.query to filter rows, ex: ('colname>0')
           axes: matplotlib axes to draw on, ex: ax[0], ax[1]
           title_var: title as a string
    output: nothing is returned; the boxplot (outliers hidden) is drawn on `axes`
    """
    filtered = df.query(query_var)
    boxplot_args = dict(data=filtered,
                        x=x_var,
                        y=y_var,
                        hue=hue_var,
                        showfliers=False,
                        ax=axes)
    ax = sns.boxplot(**boxplot_args)
    ax.set(title=title_var)

In [None]:
def get_corrleation_plot(features, ant):
    """Plot a heatmap of the pairwise correlations between features.

    input: features - pandas dataframe with the features to correlate
           ant - True to annotate each cell with its value (2 decimals)
    output: nothing is returned; a 20x10 heatmap titled
            'Correlation between features' is drawn
    """
    plt.figure(figsize=(20, 10))
    # note: this changes the seaborn font scale globally, not just for this plot
    sns.set(font_scale=1.5)

    correlations = features.corr()
    heatmap_style = dict(annot=ant, fmt='.2f', linewidths=.8, cmap='YlGnBu')
    sns.heatmap(correlations, **heatmap_style)

    plt.title('Correlation between features')

In [None]:
def get_train_test_split(x,y):
    '''
    Split features and target 80/20 (random_state=42) and standardise the features.

    input: x : pandas dataframe with set of features
           y : pandas dataframe with the target
    output: X_train : scaled x train values as a numpy ndarray
            X_test : scaled x test values as a numpy ndarray
            y_train : train part of y, unscaled
            y_test : test part of y, unscaled

    The scaler is fitted on the training features only and then applied to
    both splits, so no test-set statistics leak into the scaling.
    '''
    split_parts = train_test_split(x, y, test_size=0.20, random_state=42)
    X_train, X_test, y_train, y_test = split_parts

    # learn mean/variance from the training features only
    scaler = StandardScaler().fit(X_train)

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

In [None]:
def get_feature_importance(model, features, plot= True):
    """Return the non-zero feature importances of a fitted model.

    input: model - fitted estimator exposing feature_importances_
           features - pandas dataframe whose columns name the features
           plot - if True also show the importances as a bar chart
    output: pandas dataframe indexed by feature name with one 'importance'
            column, sorted from most to least important; features whose
            importance is not above 0 are left out
    """
    importance_table = pd.DataFrame({'importance': model.feature_importances_},
                                    index=features.columns)
    ranked = importance_table.sort_values('importance', ascending=False)
    nonzero = ranked[ranked['importance'] > 0]

    if plot:
        nonzero.plot.bar()
        plt.show()

    return nonzero

In [None]:
def run_models_v2(clf_list, X_train, y_train, X_test, y_test, master_name, print_res = True):
    '''
    Fit and evaluate every classifier in a list on the same split.

    inputs:
    - clf_list: list of classifiers
    - X_train, y_train, X_test, y_test: train test splitted datasets
    - master_name: name of the dataset, stored with each result
    - print_res: if True each evaluation prints its report
    outputs:
    - Dataframe of results from model training and prediction, one column per
      classifier class name (a later classifier of the same class replaces an
      earlier one)
    '''
    results_by_model = {
        clf.__class__.__name__: predict_evaluate_model_v2(
            clf, X_train, y_train, X_test, y_test, master_name, True, print_res)
        for clf in clf_list
    }
    return pd.DataFrame(results_by_model)

In [None]:
def predict_evaluate_model_v2(model, X_train, y_train, X_test, y_test, dataframe_name, as_list = True, print_res = True): 
    '''
    Fit a classifier, predict on both splits and collect evaluation metrics.

    input: model: classifier (it is fitted here on X_train / y_train)
          X_train, y_train, X_test, y_test: master splitted to x and y and train, test
          dataframe_name : master name as a string
          as_list: if True return the results dictionary, else a pandas
                   dataframe built from it
          print_res: if True print MSE, accuracy and the classification report
                   and show the test confusion matrix as a heatmap
    output: results with the keys dataframe_name, training_score,
            testing_score and the train/test f-score of class 0 and class 1
    '''
    # fit the learner to the training data
    model = model.fit(X_train, y_train)

    # predictions on both splits
    predictions_test = model.predict(X_test)
    predictions_train = model.predict(X_train)

    # per-class precision / recall / f-score / support; only the f-scores are
    # reported. Indexing [0] and [1] needs both classes to be present.
    _, _, fscore_te, _ = score(y_test, predictions_test)
    _, _, fscore_tr, _ = score(y_train, predictions_train)

    results = {
        # wrapped in a list so pd.DataFrame(results) gets one row
        'dataframe_name': [dataframe_name],
        'training_score': model.score(X_train, y_train),
        'testing_score': model.score(X_test, y_test),
        'training_fscore_class_0': fscore_tr[0],
        'testing_fscore_class_0': fscore_te[0],
        'training_fscore_class_1': fscore_tr[1],
        'testing_fscore_class_1': fscore_te[1],
    }

    if print_res:
        print("{} trained on {} samples.".format(model.__class__.__name__, len(y_train)))
        print("MSE_train: %.4f" % mean_squared_error(y_train,predictions_train))
        print("MSE_test: %.4f" % mean_squared_error(y_test,predictions_test))
        print("Training accuracy: %.4f" % results['training_score'])
        print("Test accuracy: %.4f" % results['testing_score'])
        print(classification_report(y_test, predictions_test,digits=4))

        # give every confusion matrix its own figure and show it right away;
        # otherwise consecutive calls draw onto the same axes and the
        # heatmaps and colorbars pile up on top of each other
        plt.figure()
        cm = confusion_matrix(y_test, predictions_test)
        sns.heatmap(cm, annot=True, fmt='g')
        plt.show()

    if as_list:
        return results
    return pd.DataFrame(results)

In [None]:
def get_x_y(master, x_col_list, y_col_name):
    """Separate a master dataframe into features and target.

    input: master - pandas dataframe
           x_col_list - list of feature column names
           y_col_name - name of the target column
    output: (x, y) where x holds the feature columns and y is a
            single-column dataframe with the target
    """
    target = master[[y_col_name]]  # double brackets keep it a dataframe
    features = master[x_col_list]
    return features, target
    

In [None]:
def compare_imbalance_methods(master, model, x_var, y_var):
    """Evaluate one classifier under several class-imbalance treatments.

    input: master: pandas dataframe; must contain a 'tot_success' column
                   (0 is treated as the minority class, 1 as the majority)
           model: classifier, refitted for every treatment
           x_var: list of feature column names
           y_var: name of the target column

    output: pandas dataframe with one row of scores per treatment, labelled
            'master', 'df_oversampled', 'df_undersampled', 'RUS', 'ROS',
            'cc' and 'SMOTE' in the dataframe_name column
    """
    df_minority = master[master['tot_success']==0]
    df_majority = master[master['tot_success']==1]

    # oversampling: draw the minority class with replacement up to a fixed size
    # NOTE(review): 10016 / 4809 are hard-coded sample sizes - presumably the
    # class counts of the original data; confirm before reusing on other data
    df_minority_oversampled = resample(df_minority, replace=True, n_samples=10016, random_state=0)
    df_oversampled = pd.concat([df_majority, df_minority_oversampled])

    # undersampling: draw the majority class down to a fixed size
    # NOTE(review): replace=True allows duplicate rows in the undersample - confirm intended
    df_majority_undersampled = resample(df_majority, replace=True, n_samples=4809, random_state=0)
    df_undersampled = pd.concat([df_majority_undersampled, df_minority])

    rus = RandomUnderSampler()
    ros = RandomOverSampler()
    cc = ClusterCentroids(sampling_strategy={0: 10})
    smote = SMOTE(sampling_strategy='minority')

    # manually resampled frames: resampling happens before the train/test split
    x, y = get_x_y(df_oversampled, x_var, y_var)
    X_train, X_test, y_train, y_test = get_train_test_split(x,y)
    df2 = predict_evaluate_model_v2(model, X_train, y_train, X_test, y_test, 'df_oversampled', as_list = False, print_res = False)

    x, y = get_x_y(df_undersampled, x_var, y_var)
    X_train,X_test,y_train, y_test = get_train_test_split(x,y)
    df3 = predict_evaluate_model_v2(model, X_train, y_train, X_test, y_test, 'df_undersampled', as_list = False, print_res = False)

    # baseline on the untouched master; the samplers below reuse this split
    # and only resample its training part
    x, y = get_x_y(master, x_var, y_var)
    X_train, X_test, y_train, y_test = get_train_test_split(x,y)
    df1 = predict_evaluate_model_v2(model, X_train, y_train, X_test, y_test, 'master', as_list = False, print_res = False)

    X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
    df_rus = predict_evaluate_model_v2(model, X_train_rus, y_train_rus, X_test, y_test, 'RUS', as_list = False, print_res = False)

    X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
    df_ros = predict_evaluate_model_v2(model, X_train_ros, y_train_ros, X_test, y_test, 'ROS', as_list = False, print_res = False)

    X_train_cc, y_train_cc= cc.fit_resample(X_train, y_train)
    df_cc = predict_evaluate_model_v2(model, X_train_cc, y_train_cc, X_test, y_test, 'cc', as_list = False, print_res = False)

    # resample with the SMOTE sampler itself so the 'SMOTE' row really
    # reports SMOTE rather than a second random-oversampling run
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    df_smote = predict_evaluate_model_v2(model, X_train_smote, y_train_smote , X_test, y_test, 'SMOTE', as_list = False, print_res = False)

    concat_results = pd.concat([df1, df2, df3, df_rus, df_ros, df_cc, df_smote])
    return concat_results





In [None]:
def get_barplots_for_gender_success(df, i, title_str):
    """Draw one row of gender-vs-success bar plots, one panel per offer type.

    df : pandas dataframe with gender plus, for each offer type
         (bogo, discount, informational), the columns
         sucess_flag_<type> and event_count_<type>
    i : row position in the axes grid
    title_str : string appended to each panel title
    return: nothing; bars show the mean success flag per gender (M, F, O),
            using only rows where the event count of that offer type is above 0

    Draws on the module-level `axs` grid (row i, columns 0-2), which is read
    as a global rather than passed in.
    """
    panels = [
        ('bogo', 'BOGO: '),
        ('discount', 'Discount: '),
        ('informational', 'Informational: '),
    ]
    for col_idx, (offer_kind, title_prefix) in enumerate(panels):
        # x is passed by keyword: newer seaborn accepts only `data` positionally
        ax = sns.barplot(x='gender',
                         y=f'sucess_flag_{offer_kind}',
                         data=df.query(f'event_count_{offer_kind} >0'),
                         ci=None,
                         order=["M", "F", "O"],
                         ax=axs[i, col_idx])
        ax.set(title=title_prefix + title_str)

In [None]:
def display(results):
    """Print the best parameters of a finished search and every tried combination.

    results is expected to expose best_params_ and cv_results_ (with the keys
    mean_test_score, std_test_score and params); one line with the rounded
    mean and standard deviation of the test score is printed per combination.
    """
    print(f'Best parameters are: {results.best_params_}')
    print("\n")

    cv = results.cv_results_
    score_rows = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg, spread, combo in score_rows:
        print(f'{round(avg,3)} + or -{round(spread,3)} for the {combo}')