## TRAINING DATA - PRE PROCESSING STEPS 

In [199]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#!pip install ipaddress
import ipaddress

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [200]:
order = pd.read_csv('train_order_data.csv')
merch = pd.read_csv('train_merchant_data.csv')
target = pd.read_csv('train.csv')
ip_country = pd.read_csv('ip_boundaries_countries.csv')

In [201]:
#Sanity Check for Train Dataset (Orders)
order.head()
order.count() #54213
order['Order_ID'].nunique() #54213 -- No duplicates found on order ID 

54213

In [202]:
#Sanity Check for Train Dataset (Merchants)
merch.count() #54213
merch['Merchant_ID'].nunique() #54213 -- No duplicates found on Merchant_ID

54213

In [203]:
merch.head()

Unnamed: 0,Ecommerce_Provider_ID,Merchant_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address
0,1746213,50448,2018-05-01 21:15:11,VATQMMZTVOZUT,F,39,48.151.136.76
1,1746213,338754,2018-04-14 10:13:00,LJCILLBRQZNKS,M,35,94.9.145.169
2,1746213,291127,2018-06-20 07:44:22,JFVHSUGKDAYZV,F,40,58.94.157.121
3,1746213,319919,2018-06-27 01:41:39,WFRXMPLQYXRMY,M,37,193.187.41.186
4,1746213,195911,2018-01-05 00:55:41,GGHKWMSWHCMID,F,27,125.96.20.172


In [204]:
#Customer_id and Order ID are almost varying by record. Can be dropped 

order['Customer_ID'].nunique() #34081
order['Order_ID'].nunique() #54213 - No predictive power

54213

In [205]:
order.head()

Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID
0,126221,37cea9512f8d,4/29/2018 16:39,148,Direct,Credit Card,124231
1,115471,09f12e6efde2,6/16/2018 17:05,145,SEO,Credit Card,136178
2,151786,4e69e956e159,10/26/2018 18:00,62,Ads,Internet Banking,198611
3,140456,663443aaeb82,12/12/2018 5:41,28,SEO,Debit Card,127993
4,114721,99258810c121,9/20/2018 11:06,70,Ads,Credit Card,250146


In [206]:
#join orders and merchants data on Merchant ID 
merged_data = pd.merge(order, merch, how='inner', on = 'Merchant_ID') 
merged_data.head()

Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID,Ecommerce_Provider_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address
0,126221,37cea9512f8d,4/29/2018 16:39,148,Direct,Credit Card,124231,1746213,2018-04-19 00:48:20,BEJVAJMFDUVOQ,M,30,37.113.112.143
1,115471,09f12e6efde2,6/16/2018 17:05,145,SEO,Credit Card,136178,1746213,2018-05-15 23:30:37,GFHLOGZGFHNYH,F,20,190.255.172.133
2,151786,4e69e956e159,10/26/2018 18:00,62,Ads,Internet Banking,198611,1746213,2018-08-11 04:12:15,MTELXLHIBWHGI,F,46,130.243.149.190
3,140456,663443aaeb82,12/12/2018 5:41,28,SEO,Debit Card,127993,1746213,2018-08-17 01:52:14,ZPJCMOXVRXVJF,M,37,48.250.147.203
4,114721,99258810c121,9/20/2018 11:06,70,Ads,Credit Card,250146,1746213,2018-05-26 11:58:33,NVDDVXILKZTVO,F,26,212.167.41.153


In [207]:
del(order,merch)

In [208]:
target.head()
target['Merchant_ID'].nunique() #54213 -- No duplicates on Merchant ID. OK to join 


54213

In [209]:
#join train data with y_train data to get it all together
merged_data = pd.merge(merged_data, target, how='inner', on = 'Merchant_ID') 

In [210]:
print(merged_data.count()) # 54213 -- Row count not affected, joins worked fine.
merged_data.head()

Customer_ID                   54213
Order_ID                      54213
Date_of_Order                 54213
Order_Value_USD               54213
Order_Source                  54213
Order_Payment_Method          54213
Merchant_ID                   54213
Ecommerce_Provider_ID         54213
Merchant_Registration_Date    54213
Registered_Device_ID          54213
Gender                        54213
Age                           54213
IP_Address                    54213
Fraudster                     54213
dtype: int64


Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID,Ecommerce_Provider_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address,Fraudster
0,126221,37cea9512f8d,4/29/2018 16:39,148,Direct,Credit Card,124231,1746213,2018-04-19 00:48:20,BEJVAJMFDUVOQ,M,30,37.113.112.143,0
1,115471,09f12e6efde2,6/16/2018 17:05,145,SEO,Credit Card,136178,1746213,2018-05-15 23:30:37,GFHLOGZGFHNYH,F,20,190.255.172.133,0
2,151786,4e69e956e159,10/26/2018 18:00,62,Ads,Internet Banking,198611,1746213,2018-08-11 04:12:15,MTELXLHIBWHGI,F,46,130.243.149.190,1
3,140456,663443aaeb82,12/12/2018 5:41,28,SEO,Debit Card,127993,1746213,2018-08-17 01:52:14,ZPJCMOXVRXVJF,M,37,48.250.147.203,0
4,114721,99258810c121,9/20/2018 11:06,70,Ads,Credit Card,250146,1746213,2018-05-26 11:58:33,NVDDVXILKZTVO,F,26,212.167.41.153,0


In [211]:
del(target)

### SPLITTING TRAIN & VALIDATION DATASET 

In [212]:
X = merged_data.copy().drop("Fraudster",axis=1)
y = merged_data["Fraudster"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,stratify=y) 

In [213]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('y_train',y_train.shape)
print('y_test',y_test.shape)

X_train (43370, 13)
X_test (10843, 13)
y_train (43370,)
y_test (10843,)


In [214]:
y_train.head()

51991    0
21112    0
53254    1
18448    1
41112    0
Name: Fraudster, dtype: int64

In [215]:
X_train.head()

Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID,Ecommerce_Provider_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address
51991,146782,49f7c4ea1adb,5/28/2018 0:35,70,Direct,Credit Card,22531,1746213,2018-05-22 05:02:24,RDBBSYOZOTNNI,F,34,49.38.152.230
21112,143435,982010b97be1,7/2/2018 1:29,205,SEO,E-wallet,212149,1746213,2018-05-28 13:22:13,HLRZPHDJYOTEI,M,32,224.255.145.206
53254,128804,5dbd4945e855,7/9/2018 8:23,42,Direct,Debit Card,215246,1746213,2018-04-05 08:36:39,DHZBQWFMRWXRC,M,40,40.88.97.124
18448,116427,1c436a716085,9/18/2018 13:42,42,Ads,E-wallet,233609,1746213,2018-08-10 23:32:40,JAYGFJLAWREMI,F,37,152.58.223.10
41112,131271,d182930c0879,8/24/2018 21:38,242,Ads,Credit Card,89251,1746213,2018-06-04 09:40:52,FIHLNBGCNAIRA,F,40,110.3.131.79


#### Building a dataframe at a device ID level. The assumption here being for a given device ID, there should be a single merchant found. In scenarios where there are multiples, it could be suspected as fraudulent activity 


In [216]:
device_id = X_train.groupby(['Registered_Device_ID']).agg({'Merchant_ID':'nunique' }).reset_index()
device_id = device_id[(device_id['Merchant_ID'] > 1)].reset_index(drop=True)
device_id.head()

Unnamed: 0,Registered_Device_ID,Merchant_ID
0,ACMSFDOYRXYGA,4
1,AENAOEHJIECBG,4
2,AFFOYLIDDMMNT,3
3,AFTKHRRPSLIER,5
4,AGGAZBZOEKRKF,2


### Creating a new feature which will indicate if a given device type ID has multiple registrations tagged against it

In [217]:
device_id['Multiple_Merchants'] = np.nan

for i in range(0, device_id.shape[0]):
    if device_id.loc[i,'Merchant_ID'] > 1:
        device_id.loc[i,'Multiple_Merchants']  = 1
    else: device_id.loc[i,'Multiple_Merchants'] = 0 

In [218]:
device_id['Multiple_Merchants'] = device_id['Multiple_Merchants'].astype('category')
device_id.head()

Unnamed: 0,Registered_Device_ID,Merchant_ID,Multiple_Merchants
0,ACMSFDOYRXYGA,4,1.0
1,AENAOEHJIECBG,4,1.0
2,AFFOYLIDDMMNT,3,1.0
3,AFTKHRRPSLIER,5,1.0
4,AGGAZBZOEKRKF,2,1.0


In [219]:
device_id.drop(['Merchant_ID'],axis=1,inplace=True)

In [220]:
print(device_id.dtypes)
device_id.head()

Registered_Device_ID      object
Multiple_Merchants      category
dtype: object


Unnamed: 0,Registered_Device_ID,Multiple_Merchants
0,ACMSFDOYRXYGA,1.0
1,AENAOEHJIECBG,1.0
2,AFFOYLIDDMMNT,1.0
3,AFTKHRRPSLIER,1.0
4,AGGAZBZOEKRKF,1.0


In [221]:
#Add the new feautures on to the train dataset 
X_train = pd.merge(X_train, device_id, how='left', on = 'Registered_Device_ID') #get the device-id level flags on train data

In [222]:
print(X_train.dtypes)
X_train.head()

Customer_ID                      int64
Order_ID                        object
Date_of_Order                   object
Order_Value_USD                  int64
Order_Source                    object
Order_Payment_Method            object
Merchant_ID                      int64
Ecommerce_Provider_ID            int64
Merchant_Registration_Date      object
Registered_Device_ID            object
Gender                          object
Age                              int64
IP_Address                      object
Multiple_Merchants            category
dtype: object


Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID,Ecommerce_Provider_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address,Multiple_Merchants
0,146782,49f7c4ea1adb,5/28/2018 0:35,70,Direct,Credit Card,22531,1746213,2018-05-22 05:02:24,RDBBSYOZOTNNI,F,34,49.38.152.230,
1,143435,982010b97be1,7/2/2018 1:29,205,SEO,E-wallet,212149,1746213,2018-05-28 13:22:13,HLRZPHDJYOTEI,M,32,224.255.145.206,
2,128804,5dbd4945e855,7/9/2018 8:23,42,Direct,Debit Card,215246,1746213,2018-04-05 08:36:39,DHZBQWFMRWXRC,M,40,40.88.97.124,
3,116427,1c436a716085,9/18/2018 13:42,42,Ads,E-wallet,233609,1746213,2018-08-10 23:32:40,JAYGFJLAWREMI,F,37,152.58.223.10,
4,131271,d182930c0879,8/24/2018 21:38,242,Ads,Credit Card,89251,1746213,2018-06-04 09:40:52,FIHLNBGCNAIRA,F,40,110.3.131.79,


In [223]:
X_train['target'] = y_train

In [224]:
ip_address = X_train.groupby(['IP_Address']).agg({'target':'sum','Merchant_ID':'nunique' }).reset_index()
ip_address[ip_address['target']>1].head()

Unnamed: 0,IP_Address,target,Merchant_ID
1538,107.75.45.250,3.0,3
2417,112.114.32.137,2.0,3
5395,128.207.105.21,3.0,5
9535,15.50.64.234,2.0,4
9702,150.81.230.244,2.0,3


In [225]:
ip_address = X_train.groupby(['IP_Address']).agg({'Merchant_ID':'nunique' }).reset_index()
ip_address = ip_address[ip_address['Merchant_ID'] > 1]
ip_address['Ip_Addr_Multi_Reg'] = 1
ip_address['Ip_Addr_Multi_Reg'] = ip_address['Ip_Addr_Multi_Reg'].astype('category')
ip_address.drop(['Merchant_ID'],axis=1,inplace=True)
ip_address.head()

Unnamed: 0,IP_Address,Ip_Addr_Multi_Reg
123,0.254.90.41,1
127,0.34.194.207,1
259,1.21.148.100,1
273,1.227.108.162,1
373,10.134.113.154,1


In [226]:
#Add the new feautures on to the train dataset 
X_train = pd.merge(X_train, ip_address, how='left', on = 'IP_Address') #get the ip-address flags w more than 1 registration

In [227]:
ord_day = X_train.groupby(['Date_of_Order','Registered_Device_ID']).agg({'target':'sum', 'Order_ID':'count' }).reset_index()
ord_day[ord_day['Order_ID']>1].head()

Unnamed: 0,Date_of_Order,Registered_Device_ID,target,Order_ID
2,1/10/2018 10:13,SSJNDVNOUTLWP,0.0,2
3,1/10/2018 10:28,WWTBMVCTTBKYK,0.0,4
4,1/10/2018 10:34,VTNKNINMSWLBO,0.0,2
5,1/10/2018 10:52,ZYBQQQLWHGQHQ,0.0,6
6,1/10/2018 10:58,GKEOSYTFRBUFZ,0.0,4


In [228]:
X_train[X_train['Date_of_Order'] == '1/10/2018 15:00']

Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID,Ecommerce_Provider_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address,Multiple_Merchants,target,Ip_Addr_Multi_Reg
10744,134909,aa8d2c9d106a,1/10/2018 15:00,60,Direct,Credit Card,195130,1746213,2018-01-08 15:00:40,BNMPEZLFOYNSM,M,51,72.134.37.247,1.0,0.0,1
11182,135200,220f999e5862,1/10/2018 15:00,60,Direct,Credit Card,191610,1746213,2018-01-08 15:00:36,BNMPEZLFOYNSM,M,51,72.134.37.247,1.0,,1
15786,161358,a3e16ffd9292,1/10/2018 15:00,60,Direct,Credit Card,56977,1746213,2018-01-08 15:00:41,BNMPEZLFOYNSM,M,51,72.134.37.247,1.0,0.0,1
19636,154778,9c7e1fbd28e9,1/10/2018 15:00,60,Direct,Credit Card,26408,1746213,2018-01-08 15:00:43,BNMPEZLFOYNSM,M,51,72.134.37.247,1.0,0.0,1
41017,147925,7feeb2ea1d24,1/10/2018 15:00,60,Direct,Credit Card,242745,1746213,2018-01-08 15:00:42,BNMPEZLFOYNSM,M,51,72.134.37.247,1.0,,1


In [229]:
ord_day = X_train.groupby(['Date_of_Order','Registered_Device_ID']).agg({'Order_ID':'count'}).reset_index()
ord_day = ord_day[ord_day['Order_ID'] > 1]
ord_day['Mult_orders_same_time'] = 1
ord_day['Mult_orders_same_time'] = ord_day['Mult_orders_same_time'].astype('category')
ord_day.drop(['Order_ID'],axis=1,inplace=True)
ord_day.head()

Unnamed: 0,Date_of_Order,Registered_Device_ID,Mult_orders_same_time
2,1/10/2018 10:13,SSJNDVNOUTLWP,1
3,1/10/2018 10:28,WWTBMVCTTBKYK,1
4,1/10/2018 10:34,VTNKNINMSWLBO,1
5,1/10/2018 10:52,ZYBQQQLWHGQHQ,1
6,1/10/2018 10:58,GKEOSYTFRBUFZ,1


In [230]:
X_train = pd.merge(X_train, ord_day, how='left', on = ['Date_of_Order','Registered_Device_ID'])

In [231]:
X_train.head()

Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID,Ecommerce_Provider_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address,Multiple_Merchants,target,Ip_Addr_Multi_Reg,Mult_orders_same_time
0,146782,49f7c4ea1adb,5/28/2018 0:35,70,Direct,Credit Card,22531,1746213,2018-05-22 05:02:24,RDBBSYOZOTNNI,F,34,49.38.152.230,,0.0,,
1,143435,982010b97be1,7/2/2018 1:29,205,SEO,E-wallet,212149,1746213,2018-05-28 13:22:13,HLRZPHDJYOTEI,M,32,224.255.145.206,,,,
2,128804,5dbd4945e855,7/9/2018 8:23,42,Direct,Debit Card,215246,1746213,2018-04-05 08:36:39,DHZBQWFMRWXRC,M,40,40.88.97.124,,1.0,,
3,116427,1c436a716085,9/18/2018 13:42,42,Ads,E-wallet,233609,1746213,2018-08-10 23:32:40,JAYGFJLAWREMI,F,37,152.58.223.10,,0.0,,
4,131271,d182930c0879,8/24/2018 21:38,242,Ads,Credit Card,89251,1746213,2018-06-04 09:40:52,FIHLNBGCNAIRA,F,40,110.3.131.79,,0.0,,


In [232]:
### DROPPING COLUMNS 
X_train.drop(['target','Customer_ID','Order_ID','Date_of_Order','Merchant_ID','Ecommerce_Provider_ID','Merchant_Registration_Date','Registered_Device_ID','IP_Address'],axis=1,inplace=True)

In [233]:
X_train.dtypes

Order_Value_USD             int64
Order_Source               object
Order_Payment_Method       object
Gender                     object
Age                         int64
Multiple_Merchants       category
Ip_Addr_Multi_Reg        category
Mult_orders_same_time    category
dtype: object

In [234]:
X_train.head()

Unnamed: 0,Order_Value_USD,Order_Source,Order_Payment_Method,Gender,Age,Multiple_Merchants,Ip_Addr_Multi_Reg,Mult_orders_same_time
0,70,Direct,Credit Card,F,34,,,
1,205,SEO,E-wallet,M,32,,,
2,42,Direct,Debit Card,M,40,,,
3,42,Ads,E-wallet,F,37,,,
4,242,Ads,Credit Card,F,40,,,


### SCALING 

In [235]:
from sklearn.preprocessing import StandardScaler
#X_train.loc[:,['Order_Value_USD','Age']]
scaler = StandardScaler()
scaler.fit(X_train.loc[:,['Order_Value_USD','Age']])

X_train.loc[:,['Order_Value_USD','Age']] = scaler.transform(X_train.loc[:,['Order_Value_USD','Age']])
#X_test.iloc[:,:5] = scaler.transform(X_test.loc[:,['Order_Value_USD','Age']])

  return self.partial_fit(X, y)
  


In [236]:
X_train.head()

Unnamed: 0,Order_Value_USD,Order_Source,Order_Payment_Method,Gender,Age,Multiple_Merchants,Ip_Addr_Multi_Reg,Mult_orders_same_time
0,-0.484556,Direct,Credit Card,F,0.100811,,,
1,2.475841,SEO,E-wallet,M,-0.131313,,,
2,-1.098565,Direct,Debit Card,M,0.797182,,,
3,-1.098565,Ads,E-wallet,F,0.448997,,,
4,3.287209,Ads,Credit Card,F,0.797182,,,


In [238]:
X_train['Multiple_Merchants'] = X_train['Multiple_Merchants'].cat.add_categories([0])
X_train['Ip_Addr_Multi_Reg'] = X_train['Ip_Addr_Multi_Reg'].cat.add_categories([0])
X_train['Mult_orders_same_time'] = X_train['Mult_orders_same_time'].cat.add_categories([0])

X_train['Multiple_Merchants'].fillna(0,inplace=True)
X_train['Ip_Addr_Multi_Reg'].fillna(0,inplace=True)
X_train['Mult_orders_same_time'].fillna(0,inplace=True)

In [239]:
X_train.dtypes

Order_Value_USD           float64
Order_Source               object
Order_Payment_Method       object
Gender                     object
Age                       float64
Multiple_Merchants       category
Ip_Addr_Multi_Reg        category
Mult_orders_same_time    category
dtype: object

### VISUALIZATION 

In [95]:
X_train['fraudster'] = y_train

In [97]:
visual = X_train.groupby(['Multiple_Merchants','Mult_orders_same_time','Ip_Addr_Multi_Reg','fraudster']).size().reset_index()
visual.to_csv("Visuals.csv")

#### Multiple_Merchants itself is explaining the fraudsters variable sufficiently; there isn't any new information found by Mult_orders_same_time,Ip_Addr_Multi_Reg. The rest two can be dropped 

In [240]:
X_train.drop(['Mult_orders_same_time','Ip_Addr_Multi_Reg'],axis=1,inplace=True)

### DUMMYFICATION

In [None]:
#cleansed_data = pd.read_csv('cleansed-train.csv')
cat_cols = ["Order_Source","Order_Payment_Method","Gender","Multiple_Merchants"]
X_train = pd.get_dummies(X_train,columns=cat_cols,drop_first=True,)

In [245]:
X_train.dtypes

Order_Value_USD                          float64
Age                                      float64
Order_Source_Direct                        uint8
Order_Source_SEO                           uint8
Order_Payment_Method_Credit Card           uint8
Order_Payment_Method_Debit Card            uint8
Order_Payment_Method_E-wallet              uint8
Order_Payment_Method_Internet Banking      uint8
Gender_M                                   uint8
Multiple_Merchants_0.0                     uint8
dtype: object

In [246]:
X_train.head()

Unnamed: 0,Order_Value_USD,Age,Order_Source_Direct,Order_Source_SEO,Order_Payment_Method_Credit Card,Order_Payment_Method_Debit Card,Order_Payment_Method_E-wallet,Order_Payment_Method_Internet Banking,Gender_M,Multiple_Merchants_0.0
0,-0.484556,0.100811,1,0,1,0,0,0,0,1
1,2.475841,-0.131313,0,1,0,0,1,0,1,1
2,-1.098565,0.797182,1,0,0,1,0,0,1,1
3,-1.098565,0.448997,0,0,0,0,1,0,0,1
4,3.287209,0.797182,0,0,1,0,0,0,0,1


### PRE PROCESSING FOR VALIDATION DATA 

In [247]:
X_test.head()

Unnamed: 0,Customer_ID,Order_ID,Date_of_Order,Order_Value_USD,Order_Source,Order_Payment_Method,Merchant_ID,Ecommerce_Provider_ID,Merchant_Registration_Date,Registered_Device_ID,Gender,Age,IP_Address
53491,152293,a3e5bf224ff8,3/14/2018 4:08,70,Ads,Internet Banking,272907,1746213,2018-02-06 09:49:05,GCQZBQVVIQNDJ,M,21,32.75.217.224
10587,145094,c81ecb04f8ce,7/8/2018 20:33,110,SEO,Credit Card,117336,1746213,2018-03-12 10:35:54,DGIOHTXRKTFMI,M,27,206.155.223.207
1086,161604,95cf2a8f46e7,5/5/2018 11:15,88,SEO,Internet Banking,73457,1746213,2018-04-24 12:55:58,YARRWPFDHEEMA,M,31,238.120.39.109
11422,143811,3b8d113c67e8,2/22/2018 5:21,122,SEO,Credit Card,201470,1746213,2018-01-25 19:22:41,AXONHDDPXULGP,F,26,41.161.53.4
45326,155266,d85c07f2683c,8/14/2018 15:42,35,SEO,Credit Card,305645,1746213,2018-07-15 04:14:37,PENJCCBHBLZUH,M,33,91.129.72.155


In [248]:
## Adding feauture 
X_test = pd.merge(X_test, device_id, how='left', on = 'Registered_Device_ID') #get the device-id level flags on train data

### DROPPING COLUMNS

X_test.drop(['Customer_ID','Order_ID','Date_of_Order','Merchant_ID','Ecommerce_Provider_ID','Merchant_Registration_Date','Registered_Device_ID','IP_Address'],axis=1,inplace=True)
X_test['Multiple_Merchants'] = X_test['Multiple_Merchants'].cat.add_categories([0])

# Fill NAs
X_test['Multiple_Merchants'].fillna(0,inplace=True)

#Scaling
X_test.loc[:,['Order_Value_USD','Age']] = scaler.transform(X_test.loc[:,['Order_Value_USD','Age']])

  del sys.path[0]


In [249]:
### DUMMYFICATION

#cleansed_data = pd.read_csv('cleansed-train.csv')
cat_cols = ["Order_Source","Order_Payment_Method","Gender","Multiple_Merchants"]
X_test = pd.get_dummies(X_test,columns=cat_cols,drop_first=True,)

In [250]:
X_test.head()

Unnamed: 0,Order_Value_USD,Age,Order_Source_Direct,Order_Source_SEO,Order_Payment_Method_Credit Card,Order_Payment_Method_Debit Card,Order_Payment_Method_E-wallet,Order_Payment_Method_Internet Banking,Gender_M,Multiple_Merchants_0.0
0,-0.484556,-1.407994,0,0,0,0,0,1,1,1
1,0.392599,-0.711622,0,1,1,0,0,0,1,1
2,-0.089837,-0.247375,0,1,0,0,0,1,1,1
3,0.655745,-0.827684,0,1,1,0,0,0,0,1
4,-1.252067,-0.015251,0,1,1,0,0,0,1,1


In [251]:
X_train.dtypes

Order_Value_USD                          float64
Age                                      float64
Order_Source_Direct                        uint8
Order_Source_SEO                           uint8
Order_Payment_Method_Credit Card           uint8
Order_Payment_Method_Debit Card            uint8
Order_Payment_Method_E-wallet              uint8
Order_Payment_Method_Internet Banking      uint8
Gender_M                                   uint8
Multiple_Merchants_0.0                     uint8
dtype: object

In [252]:
X_test.dtypes

Order_Value_USD                          float64
Age                                      float64
Order_Source_Direct                        uint8
Order_Source_SEO                           uint8
Order_Payment_Method_Credit Card           uint8
Order_Payment_Method_Debit Card            uint8
Order_Payment_Method_E-wallet              uint8
Order_Payment_Method_Internet Banking      uint8
Gender_M                                   uint8
Multiple_Merchants_0.0                     uint8
dtype: object

### MODEL IMPLEMENTATION 

In [253]:
X_MINI_TRAIN, X_MINI_TEST, Y_MINI_TRAIN, Y_MINI_TEST = train_test_split(X_train, y_train, test_size=0.95,stratify=y_train,random_state=123)
print('X_MINI_TRAIN',X_MINI_TRAIN.shape)
print('X_MINI_TEST',X_MINI_TEST.shape)
print('Y_MINI_TRAIN',Y_MINI_TRAIN.shape)
print('Y_MINI_TEST',Y_MINI_TEST.shape)

X_MINI_TRAIN (2168, 10)
X_MINI_TEST (41202, 10)
Y_MINI_TRAIN (2168,)
Y_MINI_TEST (41202,)


#### Below distribution shows this is a highly imbalanced class

In [254]:
#VERIFY Y POPULATION%%
print(Y_MINI_TRAIN.value_counts()/Y_MINI_TRAIN.count())
print(y_train.value_counts()/y_train.count())

0    0.907288
1    0.092712
Name: Fraudster, dtype: float64
0    0.907309
1    0.092691
Name: Fraudster, dtype: float64


In [255]:
del([X_MINI_TEST,Y_MINI_TEST])

#### model1 - XGBOOST 

In [258]:
from sklearn.metrics import f1_score

In [259]:
from xgboost import XGBClassifier
XGB_model1 = XGBClassifier(colsample_bytree = 0.9, max_depth=10, n_estimators=200,random_state=123,scale_pos_weight=9,n_jobs=-1)
XGB_model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.9, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=123,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=None, subsample=1, verbosity=1)

In [260]:
train_preds = XGB_model1.predict(X_train)
trainingscore_xgb1 = f1_score(y_train,train_preds)
print("XGBM Mode1 - Training Error - ",trainingscore_xgb1)

y_preds = XGB_model1.predict(X_test)
f1_score_xgb1 = f1_score(y_test,y_preds)
print("XGBM Mode1 - Validation Error - ",f1_score_xgb1)

XGBM Mode1 - Training Error -  0.7371173330428152
XGBM Mode1 - Validation Error -  0.5093292990418559


In [261]:
## Overfitting - reduce max depth & increased learning rate 

In [262]:
from xgboost import XGBClassifier
XGB_model1 = XGBClassifier(colsample_bytree = 0.9, max_depth=5, n_estimators=200,random_state=123,scale_pos_weight=9,n_jobs=-1,learning_rate=0.3)
XGB_model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.9, gamma=0,
       learning_rate=0.3, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=123,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=None, subsample=1, verbosity=1)

In [263]:
train_preds = XGB_model1.predict(X_train)
trainingscore_xgb1 = f1_score(y_train,train_preds)
print("XGBM Mode1 - Training Error - ",trainingscore_xgb1)

y_preds = XGB_model1.predict(X_test)
f1_score_xgb1 = f1_score(y_test,y_preds)
print("XGBM Mode1 - Validation Error - ",f1_score_xgb1)

XGBM Mode1 - Training Error -  0.6383081570996978
XGBM Mode1 - Validation Error -  0.5197112715158245


In [264]:
## Reduced colsample_bytree

In [265]:
from xgboost import XGBClassifier
XGB_model1 = XGBClassifier(colsample_bytree = 0.5, max_depth=5, n_estimators=200,random_state=123,scale_pos_weight=9,n_jobs=-1)
XGB_model1.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=123,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=None, subsample=1, verbosity=1)

In [266]:
train_preds = XGB_model1.predict(X_train)
trainingscore_xgb1 = f1_score(y_train,train_preds)
print("XGBM Mode1 - Training Error - ",trainingscore_xgb1)

y_preds = XGB_model1.predict(X_test)
f1_score_xgb1 = f1_score(y_test,y_preds)
print("XGBM Mode1 - Validation Error - ",f1_score_xgb1)

XGBM Mode1 - Training Error -  0.6331312017640573
XGBM Mode1 - Validation Error -  0.5754152823920266


In [267]:
#increased max_depth & reduced  n_estimators

In [268]:
from xgboost import XGBClassifier
XGB_model1 = XGBClassifier(colsample_bytree = 0.5, max_depth=7, n_estimators=150,random_state=123,scale_pos_weight=9,n_jobs=-1)
XGB_model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=150, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=123,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=None, subsample=1, verbosity=1)

In [269]:
train_preds = XGB_model1.predict(X_train)
trainingscore_xgb1 = f1_score(y_train,train_preds)
print("XGBM Mode1 - Training Error - ",trainingscore_xgb1)

y_preds = XGB_model1.predict(X_test)
f1_score_xgb1 = f1_score(y_test,y_preds)
print("XGBM Mode1 - Validation Error - ",f1_score_xgb1)

XGBM Mode1 - Training Error -  0.6357974546439209
XGBM Mode1 - Validation Error -  0.5675324675324674


In [270]:
#Increase n_estimators & colsample

In [271]:
from xgboost import XGBClassifier
XGB_model1 = XGBClassifier(colsample_bytree = 0.7, max_depth=7, n_estimators=220,random_state=123,scale_pos_weight=9,n_jobs=-1)
XGB_model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=220, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=123,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=None, subsample=1, verbosity=1)

In [272]:
train_preds = XGB_model1.predict(X_train)
trainingscore_xgb1 = f1_score(y_train,train_preds)
print("XGBM Mode1 - Training Error - ",trainingscore_xgb1)

y_preds = XGB_model1.predict(X_test)
f1_score_xgb1 = f1_score(y_test,y_preds)
print("XGBM Mode1 - Validation Error - ",f1_score_xgb1)

XGBM Mode1 - Training Error -  0.6508141638666322
XGBM Mode1 - Validation Error -  0.5436893203883496


In [273]:
from xgboost import XGBClassifier
XGB_model1 = XGBClassifier(colsample_bytree = 0.4, max_depth=5, n_estimators=150,random_state=123,scale_pos_weight=9,n_jobs=-1)
XGB_model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.4, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=150, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=123,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=None, subsample=1, verbosity=1)

In [274]:
train_preds = XGB_model1.predict(X_train)
trainingscore_xgb1 = f1_score(y_train,train_preds)
print("XGBM Mode1 - Training Error - ",trainingscore_xgb1)

y_preds = XGB_model1.predict(X_test)
f1_score_xgb1 = f1_score(y_test,y_preds)
print("XGBM Mode1 - Validation Error - ",f1_score_xgb1)

XGBM Mode1 - Training Error -  0.6307606885063853
XGBM Mode1 - Validation Error -  0.5781041388518023


In [None]:
## CHOSEN MODEL w LEAST VALIDATION ERROR 
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.4, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=150, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=123,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9, seed=None,
       silent=None, subsample=1, verbosity=1)

### SVM MODEL 

In [275]:
from sklearn.svm import SVC


In [276]:
svc = SVC(kernel='linear', class_weight='balanced', C=1.0, random_state=123)
svc.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=123,
  shrinking=True, tol=0.001, verbose=False)

In [277]:
train_preds = svc.predict(X_train)
trainingscore_svc = f1_score(y_train,train_preds)
print("SVM Mode1 - Training Error - ",trainingscore_svc)

y_preds = svc.predict(X_test)
f1_score_svc = f1_score(y_test,y_preds)
print("SVM Mode1 - Validation Error - ",f1_score_svc)

SVM Mode1 - Training Error -  0.6284838350055741
SVM Mode1 - Validation Error -  0.5815983881799865


### SVC Run 2 

In [278]:
svc = SVC(kernel='sigmoid', class_weight='balanced', C=1.0,gamma=0.1, random_state=123)
svc.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False)

In [279]:
train_preds = svc.predict(X_train)
trainingscore_svc = f1_score(y_train,train_preds)
print("SVM Mode1 - Training Error - ",trainingscore_svc)

y_preds = svc.predict(X_test)
f1_score_svc = f1_score(y_test,y_preds)
print("SVM Mode1 - Validation Error - ",f1_score_svc)

SVM Mode1 - Training Error -  0.2683318286688735
SVM Mode1 - Validation Error -  0.24142886470950523


### SVM 3 

In [281]:
svc = SVC(kernel='linear', class_weight='balanced', C=3.0, random_state=123)
svc.fit(X_train,y_train)

SVC(C=3.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=123,
  shrinking=True, tol=0.001, verbose=False)

In [282]:
train_preds = svc.predict(X_train)
trainingscore_svc = f1_score(y_train,train_preds)
print("SVM Mode1 - Training Error - ",trainingscore_svc)

y_preds = svc.predict(X_test)
f1_score_svc = f1_score(y_test,y_preds)
print("SVM Mode1 - Validation Error - ",f1_score_svc)

SVM Mode1 - Training Error -  0.6284838350055741
SVM Mode1 - Validation Error -  0.5815983881799865


### SVC MODEL 3

In [283]:
svc = SVC(kernel='linear', class_weight='balanced', C=6.0, random_state=123)
svc.fit(X_train,y_train)


SVC(C=6.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=123,
  shrinking=True, tol=0.001, verbose=False)

In [285]:
train_preds = svc.predict(X_train)
trainingscore_svc = f1_score(y_train,train_preds)
print("SVM Mode1 - Training Error - ",trainingscore_svc)

y_preds = svc.predict(X_test)
f1_score_svc = f1_score(y_test,y_preds)
print("SVM Mode1 - Validation Error - ",f1_score_svc)

SVM Mode1 - Training Error -  0.6284838350055741
SVM Mode1 - Validation Error -  0.5815983881799865
