In [9]:


import pandas as pd
import numpy as np
import seaborn as sns
import time
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')
import pickle as pkl
from feature_engine.encoding import OneHotEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.transformation import YeoJohnsonTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [1]:
# Columns removed from the raw transaction frame before any feature is derived.
_UNUSED_COLUMNS = ['street', 'merchant', 'zip', 'first', 'last', 'trans_num', 'job']

# Raw columns removed once the derived features have been computed from them.
_RAW_COLUMNS = ['trans_date_trans_time', 'cc_num', 'unix_time', 'lat', 'long',
                'merch_lat', 'merch_long', 'dob', 'city']

# Fitted transformers, applied in this order via their ``transform`` method.
_TRANSFORMER_PICKLES = ['capper_iqr.pkl', 'onehod_encod.pkl', 'mean_encod-1.pkl',
                        'yeojohnson_transformer.pkl']
_SCALER_PICKLE = 'scaler.pkl'
_MODEL_PICKLE = 'logreg (1).pkl'

_SECONDS_PER_DAY = 86400
# 365.2425 days: the average-year length, so whole-year ages match what the
# previous ``astype('timedelta64[Y]')`` conversion produced.
_SECONDS_PER_YEAR = 31556952


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only point this at artifacts produced by a trusted training run.
    with open(path, "rb") as handle:
        return pkl.load(handle)


def _engineer_features(data):
    """Build the model's input features in a new frame, leaving ``data`` unmodified."""
    # Work on a private copy so the caller's frame keeps all of its columns
    # and the function can be called repeatedly on the same input.
    frame = data.drop(columns=_UNUSED_COLUMNS).copy()
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    frame['dob'] = pd.to_datetime(frame['dob'])

    # Calendar components of the transaction timestamp.
    stamp = frame['trans_date_trans_time']
    frame['trans_hour'] = stamp.dt.hour
    frame['trans_month'] = stamp.dt.month
    frame['trans_dayofweek'] = stamp.dt.day_name()

    # Order each card's transactions chronologically so shift(1) yields the
    # previous transaction of the same card.
    frame = (
        frame.sort_values(by=['cc_num', 'unix_time'], ascending=True)
        .reset_index(drop=True)
    )
    per_card = frame.groupby(by=['cc_num'])

    # Minutes since the card's previous transaction. A card's first transaction
    # has no predecessor, so pretend one happened exactly one day earlier.
    prev_time = per_card['unix_time'].shift(1)
    prev_time = prev_time.fillna(frame['unix_time'] - _SECONDS_PER_DAY)
    frame['timedelta_last_trans'] = (frame['unix_time'] - prev_time) // 60

    # Customer age in whole years at the time of the transaction (float dtype).
    age = frame['trans_date_trans_time'] - frame['dob']
    frame['cust_age'] = age.dt.total_seconds() // _SECONDS_PER_YEAR

    # Absolute coordinate differences between the customer and the current merchant.
    frame['lat_dist_cust_merch'] = (frame['lat'] - frame['merch_lat']).abs()
    frame['long_dist_cust_merch'] = (frame['long'] - frame['merch_long']).abs()

    # Absolute coordinate differences between the current and the previous merchant.
    # For a card's first transaction the current merchant stands in for the
    # missing previous one, which makes both differences zero.
    prev_lat = per_card['merch_lat'].shift(1).fillna(frame['merch_lat'])
    prev_long = per_card['merch_long'].shift(1).fillna(frame['merch_long'])
    frame['lat_dist_prev_merch'] = (frame['merch_lat'] - prev_lat).abs()
    frame['long_dist_prev_merch'] = (frame['merch_long'] - prev_long).abs()

    return frame.drop(columns=_RAW_COLUMNS)


def predict(data, pickle_dir="pickles"):
    """Run raw transactions through the fitted pipeline and return the model's predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw transactions. The feature engineering reads ``trans_date_trans_time``,
        ``dob``, ``cc_num``, ``unix_time``, ``lat``, ``long``, ``merch_lat`` and
        ``merch_long``, and drops ``street``, ``merchant``, ``zip``, ``first``,
        ``last``, ``trans_num``, ``job`` and ``city``; a missing column raises
        ``KeyError``. The frame is not modified.
    pickle_dir : str, optional
        Directory holding the pickled transformers, scaler and model.
        Defaults to ``"pickles"``, relative to the working directory.

    Returns
    -------
    Whatever the pickled model's ``predict`` returns, one entry per input row.

    Notes
    -----
    Rows are sorted by ``cc_num`` then ``unix_time`` before scoring, so the
    predictions follow that order rather than the order of ``data``.
    """
    features = _engineer_features(data)

    for filename in _TRANSFORMER_PICKLES:
        transformer = _load_pickle(f"{pickle_dir}/{filename}")
        features = transformer.transform(features)

    # Re-wrap the scaler output in a DataFrame so the column names are kept.
    scaler = _load_pickle(f"{pickle_dir}/{_SCALER_PICKLE}")
    features = pd.DataFrame(data=scaler.transform(features), columns=features.columns)

    model = _load_pickle(f"{pickle_dir}/{_MODEL_PICKLE}")
    return model.predict(features)



In [3]:
# One raw transaction as a positional record; each value is paired by position
# with the matching entry of `columns`.
test_data = [
    # trans_date_trans_time, cc_num, merchant, category, amt
    '2020-06-21 12:14:25', 2291163933867244, 'fraud_Kirlin and Sons', 'personal_care', 2.86,
    # first, last, gender
    'Jeff', 'Elliott', 'M',
    # street, city, state, zip
    '351 Darlene Green', 'Columbia', 'SC', 29209,
    # lat, long, city_pop
    33.9659, -80.9355, 333497,
    # job, dob, trans_num, unix_time
    'Mechanical engineer', '1968-03-19', '2da90c7d74bd46a0caf3777415b3ebd3', 1371816865,
    # merch_lat, merch_long
    33.986391, -81.200714,
]

In [4]:
# Column names of a raw transaction record, in positional order.
columns = [
    'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
    'first', 'last', 'gender',
    'street', 'city', 'state', 'zip',
    'lat', 'long', 'city_pop',
    'job', 'dob', 'trans_num', 'unix_time',
    'merch_lat', 'merch_long',
]

In [6]:
import pandas as pd

In [11]:
# Pair every positional value with its column name, wrapping each value in a
# one-element list so the mapping describes a single-row DataFrame.
dict_data = {columns[position]: [value] for position, value in enumerate(test_data)}
newDF = pd.DataFrame(dict_data)
newDF

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714


In [70]:
# list of columns to be dropped
drop_cols = [
    'street',
    'merchant',
    'zip',
    'first',
    'last',
    'trans_num',
    'job',
]

In [12]:
# Run the prediction pipeline on the single-row frame; the returned array is the cell's output.
predict(newDF)

array([0])