In [37]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotnine as p9

%matplotlib inline 


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.model_selection import train_test_split,GroupShuffleSplit

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from xgboost import XGBClassifier



from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score

import warnings
warnings.filterwarnings("ignore")


In [19]:
#constants 
SEED = 42
TARGET_COLUMN = 'matched'
CORRELATED_DROP_COLUMNS = ['DifferentPredictedTime','DifferentPredictedDate']
REMOVE_COLUMNS = ['receipt_id','company_id','matched_transaction_id','feature_transaction_id']


In [6]:
#reading the data
df = pd.read_pickle('./data/processed_data.pkl')
df.head()

Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,matched
0,10000,10000,10468,10000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,10000,10000,10468,10001,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,10000,10000,10468,10003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,10000,10000,10468,10004,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,10000,10000,10468,10005,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [54]:
num_cols = df.select_dtypes(['integer','float']).columns
cat_cols = df.select_dtypes(['category']).columns
print(f'Categorical Columns : {str(cat_cols)}')
print(f'Numerical Columns : {str(num_cols)}')

Categorical Columns : Index(['DifferentPredictedTime', 'TimeMappingMatch', 'ShortNameMatch',
       'DifferentPredictedDate', 'PredictedTimeCloseMatch'],
      dtype='object')
Numerical Columns : Index(['DateMappingMatch', 'AmountMappingMatch', 'DescriptionMatch',
       'PredictedNameMatch', 'PredictedAmountMatch', 'matched'],
      dtype='object')


In [55]:
# Ensure that all transactions for a given receipt are either in the train or test set
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(df, groups=df['receipt_id']))

train_df = df.iloc[train_idx,:]
test_df =df.iloc[test_idx,:]

X_train, y_train = train_df.loc[:,train_df.columns !=TARGET_COLUMN], train_df[TARGET_COLUMN]
X_test, y_test = test_df.loc[:,test_df.columns != TARGET_COLUMN], test_df[TARGET_COLUMN]

X_train = X_train.drop(columns=REMOVE_COLUMNS)
X_train = X_train.drop(columns = CORRELATED_DROP_COLUMNS)
X_train.head()

X_test = X_test.drop(columns=REMOVE_COLUMNS)
X_test = X_test.drop(columns = CORRELATED_DROP_COLUMNS)
X_test.head()


Unnamed: 0,DateMappingMatch,AmountMappingMatch,DescriptionMatch,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,PredictedAmountMatch,PredictedTimeCloseMatch
34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
38,0.85,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [56]:
# Creating Column Transformer 
#numeric_transformer = StandardScaler()
#oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [  
    ],
    remainder='passthrough'
)

X_train = preprocessor.fit_transform(X_train)
X_train.shape

(9445, 8)

In [33]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1score = f1_score(true,predicted)
    auc = roc_auc_score(true, predicted)
    return accuracy,precision,recall,f1score,auc


In [42]:
def model_executor(models,X_train,X_test,y_train,y_test,params):
    model_list = []
    auc_list =[]
    f1_list = []
    model_obj = []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        para=params[list(models.keys())[i]]

        gs = GridSearchCV(model,para,cv=3)
        gs.fit(X_train,y_train)

        model.set_params(**gs.best_params_)
        model.fit(X_train,y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Evaluate Train and Test dataset
        model_train_accuracy , model_train_precision, model_train_recall,model_train_f1score,model_train_auc= evaluate_model(y_train, y_train_pred)
        model_test_accuracy , model_test_precision, model_test_recall,model_test_f1score,model_test_auc = evaluate_model(y_test, y_test_pred)


        print(list(models.keys())[i])
        model_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print("- Precision: {:.4f}".format(model_train_precision))
        print("- Recall: {:.4f}".format(model_train_recall))
        print("- F1 Score: {:.4f}".format(model_train_f1score))
        print("- AUC: {:.4f}".format(model_train_auc))

        print('----------------------------------')

        print('Model performance for Test set')
        print("- Accuracy: {:.4f}".format(model_test_accuracy))
        print("- Precision: {:.4f}".format(model_test_precision))
        print("- Recall: {:.4f}".format(model_test_recall))
        print("- F1 Score: {:.4f}".format(model_test_f1score))
        print("- AUC: {:.4f}".format(model_test_auc))
        auc_list.append(model_test_auc)
        f1_list.append(model_test_f1score)
        model_obj.append(model)

        print('='*35)
        print('\n')
    return model_list,auc_list,f1_list,model_obj

In [44]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Linear SVC": LinearSVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Ada Boost Classifier": AdaBoostClassifier(),
    "XGBoost Classifier": XGBClassifier()
    }

params = {
    "Logistic Regression":{},
    "Linear SVC":{},
    "Decision Tree Classifier":{},
    "Random Forest Classifier":{'max_depth':[3,5,10,None],
                                'n_estimators':[5,10,15],
                                'max_features':[1,3,5,7],
                                'min_samples_leaf':[10,20,30],
                                'min_samples_split':[10,20,30]
                              },
    "Ada Boost Classifier":{},
    "XGBoost Classifier":{
                               'min_child_weight': [1, 5, 10],
                               'gamma': [0.5, 1, 1.5, 2, 5],
                               'subsample': [0.6, 0.8, 1.0],
                               'colsample_bytree': [0.6, 0.8, 1.0],
                               'max_depth': [3, 4, 5]
                         }
        }   
model_list,auc_list,f1_list,model_obj = model_executor(models,X_train,X_test,y_train,y_test,params)

Logistic Regression
Model performance for Training set
- Accuracy: 0.9571
- Precision: 0.8441
- Recall: 0.5087
- F1 Score: 0.6348
- AUC: 0.7506
----------------------------------
Model performance for Test set
- Accuracy: 0.9618
- Precision: 0.8367
- Recall: 0.4970
- F1 Score: 0.6236
- AUC: 0.7452


Linear SVC
Model performance for Training set
- Accuracy: 0.9565
- Precision: 0.8088
- Recall: 0.5318
- F1 Score: 0.6417
- AUC: 0.7609
----------------------------------
Model performance for Test set
- Accuracy: 0.9614
- Precision: 0.7928
- Recall: 0.5333
- F1 Score: 0.6377
- AUC: 0.7619


Decision Tree Classifier
Model performance for Training set
- Accuracy: 0.9640
- Precision: 0.8964
- Recall: 0.5751
- F1 Score: 0.7007
- AUC: 0.7849
----------------------------------
Model performance for Test set
- Accuracy: 0.9668
- Precision: 0.8762
- Recall: 0.5576
- F1 Score: 0.6815
- AUC: 0.7761


Random Forest Classifier
Model performance for Training set
- Accuracy: 0.9606
- Precision: 0.8077
- 

In [45]:
perfDf = pd.DataFrame(list(zip(model_list, auc_list,f1_list)), columns=['Model Name', 'AUC','F1Score']).sort_values(by=["AUC","F1Score"],ascending=[False,False])
perfDf

Unnamed: 0,Model Name,AUC,F1Score
5,XGBoost Classifier,0.812406,0.704698
3,Random Forest Classifier,0.8122,0.702341
4,Ada Boost Classifier,0.77866,0.650519
2,Decision Tree Classifier,0.776106,0.681481
1,Linear SVC,0.761922,0.637681
0,Logistic Regression,0.745185,0.623574


In [50]:
xgb = model_obj[-1]
xgb.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 1.0,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 2,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 5,
 'max_leaves': None,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 1.0,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [57]:
train_rs = xgb.predict_proba(X_train)[:, 1]
test_rs = xgb.predict_proba(X_test)[:, 1]


train_df = df.iloc[train_idx,:]
test_df =df.iloc[test_idx,:]


train_df.loc[:,'relevance_score'] = train_rs
test_df.loc[:,'relevance_score'] = test_rs

train_df.sort_values(by=['receipt_id', 'relevance_score'], ascending=[True, False], inplace=True)
# Use groupby on 'receipt_id' and rank by 'relevance_score', assigning rank to a new 'rank' column
train_df['relevance_rank'] = train_df.groupby('receipt_id')['relevance_score'].rank(method='dense', ascending=False)
train_df['relevance_rank'] = train_df['relevance_rank'].astype(int)

In [58]:
# Relevance label for test
test_df.sort_values(by=['receipt_id', 'relevance_score'], ascending=[True, False], inplace=True)
# Use groupby on 'receipt_id' and rank by 'relevance_score', assigning rank to a new 'rank' column
test_df['relevance_rank'] = test_df.groupby('receipt_id')['relevance_score'].rank(method='dense', ascending=False)
test_df['relevance_rank'] = test_df['relevance_rank'].astype(int)
test_df.head()

Unnamed: 0,receipt_id,company_id,matched_transaction_id,feature_transaction_id,DateMappingMatch,AmountMappingMatch,DescriptionMatch,DifferentPredictedTime,TimeMappingMatch,PredictedNameMatch,ShortNameMatch,DifferentPredictedDate,PredictedAmountMatch,PredictedTimeCloseMatch,matched,relevance_score,relevance_rank
38,10003,10000,10412,10412,0.85,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.5558,1
39,10003,10000,10412,10413,0.85,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.5558,1
40,10003,10000,10412,10414,0.85,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.164758,2
41,10003,10000,10412,10415,0.85,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.164758,2
34,10003,10000,10412,10140,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.001045,3


In [59]:
train_df.to_pickle('./data/train_df_with_rank.pkl')
test_df.to_pickle('./data/test_df_with_rank.pkl')