In [1]:
import numpy as np
import pandas as pd

## Data Import

In [2]:
# Read both CSVs: the transaction records and the IP-range -> country lookup table.
fraud_data, ip_address = map(
    pd.read_csv,
    ('./Fraud/Fraud_Data.csv', './Fraud/IpAddress_to_Country.csv'),
)
# Preview the first three transactions.
fraud_data.head(3)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1


### Feature engineering, step 1: map country to every transaction based on ip_address 

In [3]:
# Width of every interval: upper bound minus lower bound.
ip_address['high_low_delta'] = ip_address['upper_bound_ip_address'].sub(
    ip_address['lower_bound_ip_address']
)
# Show (rows, columns) of the lookup table, including the new column.
ip_address.shape

(138846, 4)

Given the large number of ip_address intervals (138,846 of them) involved in this "mapping ip_address to country" problem, we need to come up with a smarter way to do the lookup. First, sort the ip_address dataframe by the lower_bound_ip_address column in ascending order.

In [98]:
# Peek at the final row (by position) of the lookup table.
ip_address.iloc[-1]

lower_bound_ip_address    3.7581e+09
upper_bound_ip_address    3758096383
country                    Australia
high_low_delta                   255
Name: 138845, dtype: object

In [81]:
# Version_1
def find_interval_1(ip, df, index_by_col, return_by_col):
    """Return the ``return_by_col`` value of the interval starting at or just below ``ip``.

    The row chosen is the one with the largest ``index_by_col`` value that is
    less than or equal to ``ip``. Only lower bounds are consulted: this
    version does not verify that ``ip`` is also below the interval's upper
    bound.

    Parameters
    ----------
    ip : int or float
        Numeric IP address to look up.
    df : pandas.DataFrame
        Lookup table with one row per IP interval. It does not need to be
        pre-sorted; a sorted copy is made here.
    index_by_col : str
        Column of ``df`` holding each interval's lower bound.
    return_by_col : str
        Column of ``df`` whose value is returned for the matching row.

    Returns
    -------
    object or None
        The ``return_by_col`` value of the matching row, or ``None`` when
        ``ip`` is missing (NaN) or smaller than every lower bound.
    """
    # A missing ip cannot be ordered against the bounds.
    if pd.isna(ip):
        return None

    df = df.sort_values(by=index_by_col)
    lower_bounds = df[index_by_col].values

    # side='right' counts the lower bounds <= ip, so an ip that is exactly
    # equal to a lower bound still lands on that interval; subtracting one
    # turns the count into the position of the last such bound.
    position = int(np.searchsorted(lower_bounds, ip, side='right')) - 1

    # -1 means every lower bound is above ip; without this guard iloc[-1]
    # would silently wrap around to the last row.
    if position < 0:
        return None

    return df[return_by_col].iloc[position]

In [101]:
def find_interval(ip, df, lower, country, upper='upper_bound_ip_address'):
    """Locate the interval of ``df`` that contains ``ip``.

    Parameters
    ----------
    ip : int or float
        Numeric IP address taken from a row of the transaction data.
    df : pandas.DataFrame
        Lookup table with one row per IP interval. It does not need to be
        pre-sorted; a copy sorted by ``lower`` is made here.
    lower : str
        Column of ``df`` holding each interval's lower bound.
    country : str
        Name of the country column. Accepted for interface compatibility but
        not used: the function returns a row position, not the country value.
    upper : str, optional
        Column of ``df`` holding each interval's (inclusive) upper bound.

    Returns
    -------
    int or None
        Zero-based position of the matching row *in ``df`` sorted by
        ``lower``* (use it with ``df.sort_values(by=lower).iloc[...]``), or
        ``None`` when ``ip`` is missing (NaN), smaller than every lower
        bound, or larger than the upper bound of the candidate interval
        (which covers both "beyond the last interval" and "in a gap between
        two intervals").
    """
    # A missing ip can never fall inside an interval.
    if pd.isna(ip):
        return None

    df = df.sort_values(by=lower)
    lower_bounds = df[lower].values

    # side='right' counts the lower bounds <= ip; the last of those belongs
    # to the only interval that can contain ip.
    position = int(np.searchsorted(lower_bounds, ip, side='right')) - 1

    # ip is smaller than the smallest lower bound (or the table is empty).
    if position < 0:
        return None

    # The candidate starts at or below ip, but ip must also not exceed its
    # upper bound; otherwise ip lies past the last interval or in a gap.
    if ip > df[upper].iloc[position]:
        return None

    return position

In [102]:
# Smoke test on the second row of fraud_data (positional index 1).
ip_sample = fraud_data['ip_address'].iloc[1]
find_interval(ip_sample, ip_address, 'lower_bound_ip_address', 'country')

1017

In [120]:
# Boundary test cases for find_interval.
# find_interval returns a position in the table sorted by lower bound, so work
# on an explicitly sorted copy: positional rows 0 / -1 are then guaranteed to
# be the smallest / largest intervals, and returned positions index this copy.
ip_sorted = ip_address.sort_values(by='lower_bound_ip_address')

# too small -> None
ip1 = ip_sorted.iloc[0]['lower_bound_ip_address'] - 1
print(find_interval(ip1, ip_address, 'lower_bound_ip_address', 'country'))

# the smallest ip_address -> 0
ip2 = ip_sorted.iloc[0]['lower_bound_ip_address']
print(find_interval(ip2, ip_address, 'lower_bound_ip_address', 'country'))

# too large -> None
ip3 = ip_sorted.iloc[-1]['upper_bound_ip_address'] + 1
print(find_interval(ip3, ip_address, 'lower_bound_ip_address', 'country'))

# the largest ip_address -> ip_address.shape[0] - 1
ip4 = ip_sorted.iloc[-1]['upper_bound_ip_address']
print(find_interval(ip4, ip_address, 'lower_bound_ip_address', 'country'))

# a real transaction: show the interval row it maps to
ip5 = fraud_data.iloc[0]['ip_address']
# Apply the format with %, otherwise the literal "%s" is printed.
print('ip_address: %s' % ip5)
match_position = find_interval(ip5, ip_address, 'lower_bound_ip_address', 'country')
# Guard against "no interval found" so the lookup does not fail on None.
print(ip_sorted.iloc[match_position] if match_position is not None else None)

None
0
None
138845
ip_address: %s 732758368.79972
lower_bound_ip_address    7.29809e+08
upper_bound_ip_address      734003199
country                         Japan
high_low_delta             4.1943e+06
Name: 4554, dtype: object


lower_bound_ip_address      3.52322e+08
upper_bound_ip_address        369098751
country                   United States
high_low_delta              1.67772e+07
Name: 1018, dtype: object