### Requirements

In [None]:
!pip install catboost
from sklearn import metrics
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np 

### Helper functions for data-preprocessing

In [115]:
# Trained CatBoost classifier, loaded once at import time and shared as a
# module-level global by final_fun_1 and final_fun_2.
model = CatBoostClassifier()
model.load_model("/content/drive/MyDrive/Datasets/Case-Study-1/Final_cb")


def custom_where(ele):
    '''
    Collapse the three driving-distance flags into one categorical label.

    `ele` must hold exactly three values (the >=5min, >=15min and >=25min
    flags, in that order); each one is compared through its string form, so
    1 and '1' are treated alike.

    Returns 'within 5mins', 'within 15mins' or 'within 25mins', or None when
    the first flag is not '1'.
    '''
    geq_5min, geq_15min, geq_25min = (str(flag) for flag in ele)

    # No label is defined when the first flag is not set.
    if geq_5min != '1':
        return None
    if geq_15min != '1':
        return 'within 5mins'
    return 'within 25mins' if geq_25min == '1' else 'within 15mins'


def mis_val_imputer(cols_to_impute,df):
    ''' Impute missing values with the most frequent category of the train set.

    cols_to_impute : iterable of column names; each one must be a key of the
                     hard-coded mode table below, otherwise KeyError is raised.
    df             : dataframe to impute. It is modified in place and also
                     returned, so both calling styles keep working.

    The fill values are expressed in the raw (un-remapped) vocabulary of the
    dataset, e.g. 'less1' and '1~3'.
    '''
    mis_val_imputewith_trainMode = {'Bar': 'never',
                    'CarryAway': '1~3',
                    'CoffeeHouse': 'less1',
                    'Restaurant20To50': 'less1',
                    'RestaurantLessThan20': '1~3'
                    }
    for col in cols_to_impute:
        most_frequent_val = mis_val_imputewith_trainMode[col]
        # Assign the filled column back explicitly. An inplace fillna on
        # df.loc[:, col] works on a temporary Series and is not guaranteed to
        # reach the frame (it never does under pandas Copy-on-Write).
        df[col] = df[col].fillna(most_frequent_val)

    return df

def full_df_preprocessing(test_data):
    '''
    Preprocess a whole chunk of raw test data for the model.

    test_data : dataframe holding the raw columns used below ('time',
                'temperature', 'expiration', the five visit-frequency
                columns, the three 'toCoupon_GEQ*' flags, 'direction_same'
                and 'car').

    Returns a new dataframe (the input is left untouched) in which
      * missing visit-frequency values are imputed with the train mode,
      * time / temperature / visit-frequency values are remapped to labels,
      * expiration '1d' is rewritten as '24h',
      * the three distance flags are merged into 'driving_distance',
      * 'direction_same' and 'car' are dropped,
      * every column is cast to str.
    '''
    df = test_data.copy()

    possible_null_cols = ['Bar','CoffeeHouse','CarryAway','RestaurantLessThan20','Restaurant20To50']

    # Impute BEFORE remapping: the imputer fills with raw labels ('less1',
    # '1~3', ...), which must then go through the same remapping as every
    # other value. final_fun_1 uses this same order. Doing it the other way
    # round leaves raw labels mixed in with the remapped ones.
    df = mis_val_imputer(possible_null_cols,df)

    d_time = {
        '7AM':'Morning',
        '10AM':'Morning',
        '2PM':'Evening',
        '6PM':'Evening',
        '10PM':'Night'
    }
    # NOTE(review): 55 -> 'Low' and 30 -> 'Medium' look swapped, but they are
    # kept as-is because final_fun_1 uses the same labels; confirm against the
    # training notebook before changing.
    d_temp = {
        55:'Low',
        80:'High',
        30:'Medium'
    }
    d_frequency = {
        'less1':'Atmost 1',
        '1~3':'1 to 3',
        'gt8':'Greater than 8' ,
        '4~8':'4 to 8',
        'never':'never'
    }

    # One value-mapping per column that needs wrangling.
    column_mappings = {'time': d_time, 'temperature': d_temp}
    column_mappings.update(dict.fromkeys(possible_null_cols, d_frequency))

    # Series.map turns any value absent from the mapping into NaN.
    for column, value_map in column_mappings.items():
        df[column] = df[column].map(value_map)

    # Assign back instead of an inplace replace on an attribute access, which
    # operates on a temporary Series under pandas Copy-on-Write.
    df['expiration'] = df['expiration'].replace({'1d':'24h'})

    distance_cols = ['toCoupon_GEQ5min','toCoupon_GEQ15min','toCoupon_GEQ25min']
    df['driving_distance'] = df[distance_cols].apply(custom_where,axis=1,raw=True)
    df = df.drop(distance_cols + ['direction_same','car'],axis=1)

    return df.astype(str)

### Main Functions

In [116]:


def _is_missing(value):
    ''' Return True when `value` marks a missing entry: None, NaN or the string 'nan'. '''
    if isinstance(value, str):
        return value == 'nan'
    return bool(pd.isna(value))


def _preprocess_data_point(data_point):
    ''' Turn one raw data point into the list of strings that is sent to the model.

    All positions below are fixed offsets into the point, so the raw column
    order of the original dataset (without the target) is assumed -- verify
    against the caller.
    '''
    point = list(data_point)

    # drop car,direc_same cols
    point.pop(-2)
    point.pop(-10)

    # Train-set modes for the five visit-frequency columns, keyed by their
    # position from the back of the point (after the two drops above).
    mode_from_train = {
        -5:'less1',
        -6:'1~3',
        -7:'1~3',
        -8:'less1',
        -9:'never'
    }
    coupon_freq_mapper = {
        'less1':'Atmost 1',
        '1~3':'1 to 3',
        'gt8':'Greater than 8' ,
        '4~8':'4 to 8',
        'never':'never'
    }
    for idx, train_mode in mode_from_train.items():
        value = point[idx]
        # Impute first, then remap, so imputed values end up in the same
        # vocabulary as observed ones.
        if _is_missing(value):
            value = train_mode
        point[idx] = coupon_freq_mapper[value]

    d_time = {
        '7AM':'Morning',
        '10AM':'Morning',
        '2PM':'Evening',
        '6PM':'Evening',
        '10PM':'Night'
    }
    d_temp = {
        55:'Low',
        80:'High',
        30:'Medium'
    }
    d_expiration = {'1d':'24h',
                    '2h':'2h'}

    point[3] = d_temp[point[3]]
    point[4] = d_time[point[4]]
    point[6] = d_expiration[point[6]]

    # Merge the 5/15/25-min distance flags into one value, remove the three
    # originals and append the merged value at the end of the point.
    driving_distance_colvalue = custom_where(point[-4:-1])
    del point[-4:-1]
    point.append(driving_distance_colvalue)

    # Finally convert point to str type
    return list(map(str,point))


def final_fun_1(data_point,already_preprocessed=False):

    ''' Receive a single data-point and make prediction.

    data_point           : array, list or other iterable holding one point.
                           When `already_preprocessed` is False it must be a
                           raw point (target column excluded) in the raw
                           column order; it is copied, never modified.
    already_preprocessed : pass True to send `data_point` to the model as is.

    Returns whatever `model.predict` returns for the point.
    Raises KeyError when a raw value is not one of the known categories.
    '''
    if not already_preprocessed:
        data_point = _preprocess_data_point(data_point)

    # send the data point to model
    return model.predict(data_point)

def final_fun_2(test_data,y_test,already_preprocessed=False):

    '''
    Score the model on several data points at once.

    test_data            : dataframe of data points (raw, unless
                           `already_preprocessed` is True).
    y_test               : true labels; cast to int before scoring.
    already_preprocessed : pass True to skip preprocessing and the column
                           reordering.

    Returns the tuple (f1, auc).
    '''
    features = test_data
    if not already_preprocessed:
        # Preprocess, then put the columns in the order the model was trained on.
        features = full_df_preprocessing(test_data)[model.feature_names_]

    predicted_labels = model.predict(features)
    # Probability of the positive class, needed for ROC-AUC.
    positive_class_proba = model.predict_proba(features)[:,1]

    true_labels = y_test.astype(int)
    return (
        metrics.f1_score(true_labels,predicted_labels),
        metrics.roc_auc_score(true_labels,positive_class_proba),
    )



### For personal testing

In [None]:
# # Raw (un-preprocessed) data. NOTE: these metrics are leaky, because the model was trained on some of these points.
# data = pd.read_csv('/content/drive/MyDrive/Datasets/Case-Study-1/in-vehicle-coupon-recommendation.csv')
# Y = data.Y
# data.drop("Y",axis=1,inplace=True)
# print("Prediction:",final_fun_1(data.iloc[56,:]),"Observed value:",Y[56])
# print("Metrics on all datapoints",final_fun_2(data,Y))

# # preprocessed data / unseen data
# X_train = pd.read_csv('/content/drive/MyDrive/temporary_datasets/X_train_catboost.csv')
# y_train = pd.read_csv('/content/drive/MyDrive/temporary_datasets/y_train.csv')

# X_test = pd.read_csv('/content/drive/MyDrive/temporary_datasets/X_test_catboost.csv')
# y_test = pd.read_csv('/content/drive/MyDrive/temporary_datasets/y_test.csv')

# print("Prediction:",final_fun_1(X_test.iloc[56,:],already_preprocessed=True),"Observed value:",y_test.iloc[56].values)
# print("Metrics on unseen datapoints",final_fun_2(X_test,y_test,already_preprocessed=True))

## END