In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [25]:

def drop_cols():
    
    data = pd.read_json('data/data.zip')
    
    #Creates a Fraud column of True or False for each event.
    data['Fraud'] = (data['acct_type'] == 'fraudster_event') | (data['acct_type'] == 'fraudster') | (data['acct_type'] == 'fraudster_att')
            
    #Drops all columns that we cannot use due to API
    data = data.drop(['acct_type','approx_payout_date', 'body_length', 'gts', 'num_order', 'num_payouts', 'sale_duration2'], axis = 1)
    
    #drop columns we cannot use at the time of prediction
    data = data.drop(['payout_type', 'sale_duration', ], axis = 1)
    
    #drop cols I cannot categorize
    data = data.drop(['description', 'name', 'org_desc', 'org_name', 'payee_name', 'previous_payouts', 'ticket_types', 'venue_address', 'venue_name'], axis=1)
    
    #drop cols too cumbersome to categorize
    data = data.drop(['email_domain'], axis=1)
    
    return data

In [26]:
data = drop_cols()
data.head()

Unnamed: 0,channels,country,currency,delivery_method,event_created,event_end,event_published,event_start,fb_published,has_analytics,has_header,has_logo,listed,name_length,object_id,org_facebook,org_twitter,show_map,user_age,user_created,user_type,venue_country,venue_latitude,venue_longitude,venue_state,Fraud
0,5,US,USD,0.0,1262739706,1265630400,1263110000.0,1265594400,0,0,1.0,0,y,60,527017,0.0,0.0,1,36,1259613950,1,US,25.777471,-80.133433,FL,True
1,0,US,USD,1.0,1293832670,1296288000,1293833000.0,1296255600,0,0,0.0,1,n,27,786878,0.0,12.0,0,149,1280942776,3,US,32.776566,-79.930922,SC,False
2,8,US,USD,1.0,1291090956,1295740800,1291092000.0,1295713800,0,0,,0,y,28,787337,0.0,0.0,0,214,1272559388,3,US,33.944201,-118.080419,CA,False
3,6,IE,EUR,1.0,1360681570,1388534400,1360683000.0,1360702800,0,0,0.0,1,y,21,885645,0.0,0.0,0,889,1283870102,3,,,,,False
4,11,US,USD,0.0,1291994666,1297468800,1291995000.0,1297440000,1,0,0.0,0,y,66,1114349,0.0,0.0,0,35,1288984065,3,US,42.353848,-71.044276,MA,False


In [6]:
data.columns

Index(['channels', 'country', 'currency', 'delivery_method', 'email_domain', 'event_created', 'event_end', 'event_published', 'event_start', 'fb_published', 'has_analytics', 'has_header', 'has_logo', 'listed', 'name_length', 'object_id', 'org_facebook', 'org_twitter', 'show_map', 'user_age', 'user_created', 'user_type', 'venue_country', 'venue_latitude', 'venue_longitude', 'venue_state', 'Fraud'], dtype='object')

In [7]:
data.shape

(14337, 27)

In [8]:
# def drop_nan(data):
#     pd.get_dummies(columns=)
#     data = data.dropna(how='any')
#     return data

In [9]:
# drop_nan(data).shape

In [10]:
def one_hot(input_df, columns):
    '''
    One-hot encode the provided list of columns and return a new copy of the data frame
    '''
    df = input_df.copy()

    for col in columns:
        dummies = pd.get_dummies(df[col], dummy_na=True)
        dummies.drop(dummies.columns[-1], axis=1, inplace=True)
        df = df.drop(col, axis=1).merge(dummies, left_index=True, right_index=True)
    
    return df

In [13]:
data_dummies = one_hot(data,['country', 'venue_state', 'venue_country', 'user_type', 'channels', 'email_domain'])

In [14]:
data_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Columns: 6393 entries, currency to zwia.org
dtypes: bool(1), float64(7), int64(11), object(2), uint8(6372)
memory usage: 89.3+ MB


In [34]:
data.country.unique()

array(['US', 'IE', 'FR', 'CA', 'GB', '', 'AU', 'ES', 'NL', 'DE', 'VN',
       'MY', 'NZ', 'PK', 'MA', 'AR', 'MX', 'CH', None, 'SG', 'TH', 'BE',
       'PH', 'A1', 'CI', 'AT', 'ID', 'PS', 'PT', 'TR', 'NI', 'KE', 'IT',
       'HU', 'RS', 'RO', 'NG', 'CZ', 'PR', 'AE', 'BS', 'KH', 'JM', 'IN',
       'NA', 'FI', 'HR', 'BG', 'VI', 'TJ', 'GH', 'PE', 'QA', 'SI', 'GR',
       'BB', 'CM', 'IS', 'SE', 'RU', 'DZ', 'VE', 'UY', 'ZA', 'IM', 'LB',
       'CR', 'IL', 'CN', 'DK', 'CO', 'EC', 'JE'], dtype=object)

In [15]:
data.venue_state.unique()

array(['FL', 'SC', 'CA', None, 'MA', 'KY', 'MD', 'NC', '', 'TX', 'NY',
       'ME', 'MI', 'AZ', 'MN', 'IL', 'British Columbia', 'OH', 'DC', 'IN',
       'AL', 'IA', 'GA', 'London', 'Ontario', 'PA', 'New South Wales',
       'VA', 'Suffolk', 'Bournemouth', 'MASSACHUSETTS', 'Alberta', 'WA',
       'QC', 'Manchester', 'Birmingham', 'WI', 'Leeds', 'NJ', 'NV',
       'England', 'OR', 'CT', 'Ile De France', 'CO', 'ONTARIO',
       'South York', 'Nova Scotia', 'UT', 'Gt Lon', 'NSW', 'KS', 'Surrey',
       'HI', 'TN', 'BC', 'Sheffield', 'Berlin', 'West Mids', 'SK',
       'Scotland', 'QLD', 'AB', 'Newcastle Upon Tyne', 'Derry', 'LA', 'ON',
       'Hertfordshire', 'Auckland', 'Gloucestershire', 'NE', 'OK', 'MO',
       'Haute Normandie', 'VIC', 'Glasgow City', 'Queensland', 'PACA',
       'Warks', 'Sor-Trondelag', 'SA', 'ID', 'NH', 'Cardiff',
       'Greater London', 'NJ 07102-4398', 'AR', 'Edinburgh, City Of',
       'PENNSYLVANIA', 'Quebec', 'AK', 'Cambs', 'New Jersey', 'NB',
       'Midlothi

In [16]:
data.venue_country.unique()

array(['US', None, '', 'CA', 'FR', 'GB', 'AU', 'ES', 'NL', 'DE', 'IE',
       'NZ', 'NO', 'AR', 'SG', 'BE', 'PH', 'IT', 'AT', 'MA', 'ID', 'NI',
       'VN', 'AE', 'DO', 'PR', 'CZ', 'DK', 'TR', 'BS', 'HT', 'KH', 'HU',
       'IN', 'NA', 'KE', 'PK', 'HK', 'SE', 'HR', 'VI', 'TH', 'JM', 'LU',
       'PL', 'CM', 'MX', 'QA', 'IS', 'CN', 'FI', 'DZ', 'ZA', 'UY', 'PT',
       'MC', 'SK', 'RU', 'BG', 'JP', 'TT', 'CO', 'RE', 'NG', 'OM', 'JE',
       'CY', 'IL', 'MY', 'BR'], dtype=object)

In [17]:
data.user_type.unique()

array([  1,   3,   4,   5, 103,   2])

In [22]:
data.email_domain.value_counts()

gmail.com                           3097
yahoo.com                            792
hotmail.com                          399
aol.com                              198
live.com                              80
me.com                                75
ymail.com                             68
comcast.net                           60
generalassemb.ly                      58
yahoo.co.uk                           52
kineticevents.com                     52
hotmail.co.uk                         48
improvboston.com                      46
sippingnpainting.com                  39
claytonislandtours.com                37
racetonowhere.com                     35
lidf.co.uk                            35
live.fr                               33
yahoo.ca                              31
greatworldadventures.com              31
shaw.ca                               28
sbcglobal.net                         26
mac.com                               25
msn.com                               25
live.co.uk      

In [47]:
8928/14337

0.6227244193345888