In [1]:
import numpy as np
import pandas as pd

## Data Import

In [2]:
# Load the transaction records and the IP-range -> country lookup table,
# then preview the first three transactions.
fraud_data, ip_address = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/Fraud/Fraud_Data.csv',
                     './dataset/Fraud/IpAddress_to_Country.csv')
)
fraud_data.head(3)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1


### Feature engineering, step 1: map country to every transaction based on ip_address 

In [3]:
# Width of every IP range (upper bound minus lower bound), kept as an extra column.
range_upper = ip_address['upper_bound_ip_address']
range_lower = ip_address['lower_bound_ip_address']
ip_address['high_low_delta'] = range_upper.sub(range_lower)
ip_address.shape

(138846, 4)

Given the large number of ip_address intervals (138,846 of them) involved in this "mapping ip_address into country" problem, we need to come up with a smarter way to do this. First, sort the ip_address dataframe by the lower_bound_ip_address column in ascending order.

In [4]:
# Show the final row of the lookup table.
ip_address.iloc[-1]

lower_bound_ip_address    3.7581e+09
upper_bound_ip_address    3758096383
country                    Australia
high_low_delta                   255
Name: 138845, dtype: object

In [5]:
def find_interval(ip, df, lower, country, upper='upper_bound_ip_address'):
    """Return the country whose IP range contains ``ip``, or ``None``.

    Parameters
    ----------
    ip : number
        Numeric IP address to look up (e.g. one value taken from a row of
        fraud_data).
    df : pandas.DataFrame
        Lookup table with one row per IP range.
    lower : str
        Name of the column in ``df`` holding each range's lower bound.
    country : str
        Name of the column in ``df`` whose value is returned for the
        matching range.
    upper : str, optional
        Name of the column in ``df`` holding each range's inclusive upper
        bound. Defaults to ``'upper_bound_ip_address'``.

    Returns
    -------
    The ``country`` value of the range with the largest lower bound that is
    <= ``ip``, provided ``ip`` is not above that range's upper bound.
    ``None`` when ``ip`` is missing, lies below every lower bound, or falls
    above the upper bound of the candidate range.
    """
    # A missing ip can never fall inside a range.
    if pd.isna(ip):
        return None

    # The bisection below needs ascending lower bounds; only pay for a sort
    # when the table is not already ordered.
    if not df[lower].is_monotonic_increasing:
        df = df.sort_values(by=lower)

    # Position of the last range whose lower bound is <= ip
    # (-1 when ip is smaller than every lower bound).
    lower_bounds = df[lower].to_numpy()
    position = int(np.searchsorted(lower_bounds, ip, side='right')) - 1
    if position < 0:
        return None

    # ip sits above the candidate range: it is in a gap or past the last range.
    if ip > df[upper].iloc[position]:
        return None

    return df[country].iloc[position]

In [6]:
# test cases
# The first and last rows of ip_address are used as the smallest and largest
# ranges, so these checks assume the table is stored in ascending order.

# one below the first lower bound: too small -> None
ip1 = ip_address.iloc[0,]['lower_bound_ip_address'] - 1
print(find_interval(ip1, ip_address, 'lower_bound_ip_address', 'country'))

# exactly the first lower bound -> country of the first range
ip2 = ip_address.iloc[0,]['lower_bound_ip_address'] 
print(find_interval(ip2, ip_address, 'lower_bound_ip_address', 'country'))

# one above the last upper bound: too large -> None
ip3 = ip_address.iloc[-1,]['upper_bound_ip_address'] + 1
print(find_interval(ip3, ip_address, 'lower_bound_ip_address', 'country'))

# exactly the last upper bound -> country of the last range
ip4 = ip_address.iloc[-1,]['upper_bound_ip_address'] 
print(find_interval(ip4, ip_address, 'lower_bound_ip_address', 'country'))

# a real ip_address taken from the first transaction
ip5 = fraud_data.iloc[0,]['ip_address']
# apply the format with %; passing ip5 as a second print argument left the
# literal "%s" in the output
print('ip_address: %s' % ip5)
print(find_interval(ip5, ip_address, 'lower_bound_ip_address', 'country'))

None
Australia
None
Australia
ip_address: %s 732758368.79972
Japan


In [7]:
# too slow to run
#%timeit
#countries = fraud_data['ip_address'].apply(find_interval, args = (ip_address,'lower_bound_ip_address','country'))

In [8]:
# Attach a country to every transaction: a transaction gets the country of the
# single IP range that contains its ip_address, or the string 'NA' when zero
# or several ranges match.
range_lower = ip_address['lower_bound_ip_address']
range_upper = ip_address['upper_bound_ip_address']

countries = []
for row_label in range(len(fraud_data)):
    ip = fraud_data.loc[row_label, 'ip_address']
    matches = ip_address[(range_lower <= ip) & (range_upper >= ip)]
    countries.append(matches['country'].values[0] if len(matches) == 1 else 'NA')

fraud_data['country'] = countries

fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States


## Feature Engineering

### 1. Feature Engineering

In [9]:
# Several users signing in from one device is suspicious, so count the
# user_ids seen on each device_id, split by class.
device_class_counts = pd.pivot_table(
    fraud_data,
    index=['device_id'],
    columns=['class'],
    values='user_id',
    aggfunc=np.ma.count,
)
# Flatten the pivot into a plain frame (class labels become the string
# columns '0' and '1') and treat a missing device/class combination as 0.
users_per_device = pd.DataFrame(device_class_counts.to_records()).fillna(value=0)
# Devices with at least one class-1 row, ordered by their class-1 count.
has_class_1 = users_per_device['1'] > 0
users_per_device.loc[has_class_1].sort_values(by='1')
# Further feature ideas noted here but not computed in this cell:
# - many users signing in from the same ip_address is suspicious
#   (share ip_address with many people)
# - time elapsed between purchase_time and signup_time


Unnamed: 0,device_id,0,1
68781,NAWIYIEIBGPTM,0.0,1.0
88312,QRHXPQDSQKWBH,0.0,1.0
88302,QRHDETBSRRRJB,0.0,1.0
88301,QRHBXAQWBBCFH,1.0,1.0
88260,QRDDOSBKOMCBV,1.0,1.0
88249,QRBNPUXXRZMBQ,1.0,1.0
88152,QQPSOTLKHJVQW,1.0,1.0
88085,QQGMCXJXSYRYE,0.0,1.0
88072,QQEEWBVSJGYWO,0.0,1.0
88067,QQDMCJNMNSSOD,1.0,1.0


### 2. Check whether the positive and negative classes are balanced
The ratio of negative class to positive class (fraud) is about 10:1, so the response variable is highly imbalanced. We could reach about 90% prediction accuracy by predicting negative (0) for every instance, while our goal is actually to identify the positive class (1).

#### targeting metrics:
1. recall: among the positive instances, what proportion of them are detected by our classifier
2. precision: from all the instances that are predicted as positive, what proportion of them are actually positive
3. Kappa

#### effective methods:
1. sampling with SMOTE
2. anomaly detection: one-class SVM, isolation forests, see this post https://scikit-learn.org/stable/modules/outlier_detection.html

#### penalty
1. penalized SVM



In [10]:
# Number of rows per class label (0 = negative, 1 = fraud) to gauge the imbalance.
fraud_data.groupby(by='class').size()

class
0    136961
1     14151
dtype: int64