In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)

In [35]:

def drop_cols():
    
    data = pd.read_json('data/data.zip')
    
    #Creates a Fraud column of True or False for each event.
    data['Fraud'] = (data['acct_type'] == 'fraudster_event') | (data['acct_type'] == 'fraudster') | (data['acct_type'] == 'fraudster_att')
            
    #Drops all columns that we cannot use due to API
    data = data.drop(['acct_type','approx_payout_date', 'body_length', 'gts', 'num_order', 'num_payouts', 'sale_duration2'], axis = 1)
    
    #drop columns we cannot use at the time of prediction
    data = data.drop(['payout_type', 'sale_duration', ], axis = 1)
    
    #drop cols I cannot categorize
    data = data.drop(['description', 'name', 'org_desc', 'org_name', 'payee_name', 'previous_payouts', 'ticket_types', 'venue_address', 'venue_name'], axis=1)
    
    #drop cols too cumbersome to categorize
    data = data.drop(['email_domain'], axis=1)
    
    #drop cols with too much missing data
    data = data.drop(['has_header'], axis=1)
    
    return data

In [36]:
data = drop_cols()
data.head()

Unnamed: 0,channels,country,currency,delivery_method,event_created,event_end,event_published,event_start,fb_published,has_analytics,has_logo,listed,name_length,object_id,org_facebook,org_twitter,show_map,user_age,user_created,user_type,venue_country,venue_latitude,venue_longitude,venue_state,Fraud
0,5,US,USD,0.0,1262739706,1265630400,1263110000.0,1265594400,0,0,0,y,60,527017,0.0,0.0,1,36,1259613950,1,US,25.777471,-80.133433,FL,True
1,0,US,USD,1.0,1293832670,1296288000,1293833000.0,1296255600,0,0,1,n,27,786878,0.0,12.0,0,149,1280942776,3,US,32.776566,-79.930922,SC,False
2,8,US,USD,1.0,1291090956,1295740800,1291092000.0,1295713800,0,0,0,y,28,787337,0.0,0.0,0,214,1272559388,3,US,33.944201,-118.080419,CA,False
3,6,IE,EUR,1.0,1360681570,1388534400,1360683000.0,1360702800,0,0,1,y,21,885645,0.0,0.0,0,889,1283870102,3,,,,,False
4,11,US,USD,0.0,1291994666,1297468800,1291995000.0,1297440000,1,0,0,y,66,1114349,0.0,0.0,0,35,1288984065,3,US,42.353848,-71.044276,MA,False


In [37]:
data.columns

Index(['channels', 'country', 'currency', 'delivery_method', 'event_created', 'event_end', 'event_published', 'event_start', 'fb_published', 'has_analytics', 'has_logo', 'listed', 'name_length', 'object_id', 'org_facebook', 'org_twitter', 'show_map', 'user_age', 'user_created', 'user_type', 'venue_country', 'venue_latitude', 'venue_longitude', 'venue_state', 'Fraud'], dtype='object')

In [38]:
data.shape

(14337, 25)

In [8]:
def drop_nan(data):
    pd.get_dummies(columns=)
    data = data.dropna(how='any')
    return data

In [9]:
drop_nan(data).shape

In [29]:
def one_hot(input_df, columns):
    '''
    One-hot encode the provided list of columns and return a new copy of the data frame
    '''
    df = input_df.copy()

    for col in columns:
        dummies = pd.get_dummies(df[col], dummy_na=True, drop_first=True)
        #dummies.drop(dummies[col], axis=1, inplace=True)
        df = df.drop(col, axis=1).merge(dummies, left_index=True, right_index=True)
    
    return df

In [30]:
data_dummies = one_hot(data,['country', 'venue_state', 'venue_country', 'user_type', 'channels'])

In [31]:
data_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Columns: 621 entries, currency to 13.0
dtypes: bool(1), float64(7), int64(11), object(2), uint8(600)
memory usage: 10.4+ MB


In [32]:
data_dummies.columns

Index([       'currency', 'delivery_method',   'event_created',       'event_end', 'event_published',     'event_start',    'fb_published',   'has_analytics',      'has_header',        'has_logo',
       ...
                 '4.0_y',           '5.0_y',               6.0,               7.0,               8.0,               9.0,              10.0,              11.0,              12.0,              13.0], dtype='object', length=621)

In [33]:
data_dummies.head(10)

Unnamed: 0,currency,delivery_method,event_created,event_end,event_published,event_start,fb_published,has_analytics,has_header,has_logo,listed,name_length,object_id,org_facebook,org_twitter,show_map,user_age,user_created,venue_latitude,venue_longitude,Fraud,_x,A1,AE_x,AR_x,AT_x,AU_x,BB,BE_x,BG_x,BS_x,CA_x,CH,CI,CM_x,CN_x,CO_x,CR,CZ_x,DE_x,DK_x,DZ_x,EC,ES_x,FI_x,FR_x,GB_x,GH,GR,HR_x,HU_x,ID_x,IE_x,IL_x,IM,IN_x,IS_x,IT_x,JE_x,JM_x,KE_x,KH_x,LB,MA_x,MX_x,MY_x,NA_x,NG_x,NI_x,NL_x,NZ_x,PE_x,PH_x,PK_x,PR_x,PS,PT_x,QA_x,RO,RS,RU_x,SE_x,SG_x,SI,TH_x,TJ,TR_x,US_x,UY_x,VE,VI_x,VN_x,ZA_x,_y,AB,ACT,AK,AL,ALAKSA,AR_y,AUSTRALIA,AZ,Aberdeen City,Abu Dhabi,Ad Dawhah,Aguada,Alberta,Algiers,Alsace,Antrim,Antwerpen,Auckland,Australian Capital Territory,Auvergne,BA,BC,BW,BY,Bali,Bath And North East Somerset,Bath and North East Somerset,Bay Of Plenty,Bayern,Bedfordshire,Beds,Beijing,Belfast,Belgravia,Berkshire,Berlin,Birmingham,Blackpool,Bournemouth,Bracknell Forest,Bradford,Brighton And Hove,"Bristol, City Of",British Columbia,Brussels,Brussels Hoofdstedelijk Gewest,Bruxelles,Buckinghamshire,Budapest,Burgandy,CA_y,CA 92590,CA 94607,CA 94704,CANADA,CAR,CO_y,CT,Caerphilly,California,Cambridgeshire,Cambs,Canterbury,Capital Region of Denmark,Cardiff,Catalonia,Central Bedfordshire,Central Java,Centre,Ceredigion,Cheshire,Cheshire West and Chester,Christchurch,City of Bristol,City of Edinburgh,City of London,City of Westminster,City of Zagreb,Clackmannanshire,Clare,Cleveland,Cnwll,Co Antrim,Co. Kildare,Co. Waterford,Community of Madrid,Comunidad de Madrid,Cordillera Administrative Region,Cork,Cornwall,County Dublin,County Waterford,Coventry,Cumb,Cumbria,Cundinamarca,D.C.,D.F.,DC,DE_y,DKI Jakarta,Derby,Derbyshire,Derry,Devon,District of Columbia,Distrito Federal,Dorset,Doukkala-Abda,Dubai,Dublin,Dudley,Dundee City,Dur,Durham,East Dunbartonshire,East Renfrewshire,East Riding of Yorkshire,East Sussex,"Edinburgh, City Of",England,Erongo,Essex,Esteli,Etelä-Suomi,FL,FL 3220,FLORIDA (DOWNTOWN,Falkirk,Famagusta,Faro,Fife,Flevoland,Florida,GA,GP,Galway,Glasgow City,Glos,Gloucestershire,Grand Casablanca,Greater London,Gt Lon,Gt Man,Guanajuato,Gullbringusysla,Gwynd,HAWAII 96815,HE,HI,Hamburg,Hamilton,Hampshire,Hanoi,Haute Normandie,Hertford,...,North Somerset,North York,North Yorkshire,Northamptonshire,Northern Ireland,Northern Territory,Northumberland,Nottingham,Nottinghamshire,Notts,Nova Scotia,OH,OHIO,OK,ON,ON K7H 3C6,ON.,ONTARIO,OR,Ontario,Otago,Ouest,Oxfordshire,Oxon,PA,PACA,PCh,PE_y,PENNSYLVANIA,PENNSYLVANIA 19149,PM,Pays De La Loire,Pays de la Loire,Pembrokeshire,Perth And Kinross,Peterborough,Phnom Penh,Phuket,Plymouth,Poole,Portsmouth,Provence alpes cote D&amp;amp;amp;amp;amp;amp;amp;,Puerto Rico,Punjab,QC,QLD,QUEBEC,Quang Binh province,Quebec,Queensland,RA,RI,Ranong,Reading,Redcar and Cleveland,Rhone Alpes,Richmond Upon Thames,Royal Borough of Kensington and Chelsea,SA,SC,SD,SK_x,Saint James,Saint-Paul,Salford,Salta,San Juan,Saskatchewan,Scotland,Scottish Borders,"Scottish Borders, The",Shanghai,Sheffield,Shropshire,Sindh,Solihull,Somerset,Sor-Trondelag,Souss-Massa-Draa,South Australia,South Ayrshire,South Carolina,South York,Southampton,Southend On Sea,Southend-on-Sea,St Helier,St Thomas,Staffordshire,Stirling,Stockholm County,Stockholms län,Stoke On Trent,Stoke-on-Trent,Suffk,Suffolk,Suffolk County,Sunderland,Surrey,Swansea,Swindon,T & W,TAS,TEXAS,THAILAND,TIBURON,TN,TX,Takaka,Tangier-Tetouan,Tasmania,Tel Aviv,The City of Brighton and Hove,Thurrock,Tien Giang,Tokyo,UK,UT,Utrecht,VA,VIC,VIRGINA,VT,Vermont,Victoria,Vienna,Virginia,Vlaams Gewest,Voronezhskaya oblast,WA,WESTERN AUSTRALIA,WI,WV,WY,Waikato,Waitakere,Wales,Walloon Region,Warks,Warrington,Warszawa,Warwickshire,Waterford,Wellington,West Berkshire,West Java,West Midlands,West Mids,West Sussex,West York,West Yorkshire,Western Australia,Western Cape,Whangarei,Wicklow,Wien,Wiltshire,Wirral,Wokingham,Wolverhampton,Worcestershire,York,ZH,Zuid Holland,Unnamed: 416,AE_y,AR,AT_y,AU_y,BE_y,BG_y,BR,BS_y,CA,CM_y,CN_y,CO,CY,CZ_y,DE,DK_y,DO,DZ_y,ES_y,FI_y,FR_y,GB_y,HK,HR_y,HT,HU_y,ID,IE_y,IL,IN,IS_y,IT_y,JE_y,JM_y,JP,KE_y,KH_y,LU,MA,MC,MX_y,MY_y,NA_y,NG_y,NI_y,NL,NO,NZ_y,OM,PH_y,PK_y,PL,PR_y,PT_y,QA_y,RE,RU_y,SE_y,SG_y,SK_y,TH_y,TR_y,TT,US_y,UY_y,VI_y,VN_y,ZA_y,1.0,2.0,3.0,4.0_x,5.0_x,103.0,0.0,4.0_y,5.0_y,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0
0,USD,0.0,1262739706,1265630400,1263110000.0,1265594400,0,0,1.0,0,y,60,527017,0.0,0.0,1,36,1259613950,25.777471,-80.133433,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,USD,1.0,1293832670,1296288000,1293833000.0,1296255600,0,0,0.0,1,n,27,786878,0.0,12.0,0,149,1280942776,32.776566,-79.930922,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,USD,1.0,1291090956,1295740800,1291092000.0,1295713800,0,0,,0,y,28,787337,0.0,0.0,0,214,1272559388,33.944201,-118.080419,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,EUR,1.0,1360681570,1388534400,1360683000.0,1360702800,0,0,0.0,1,y,21,885645,0.0,0.0,0,889,1283870102,,,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,USD,0.0,1291994666,1297468800,1291995000.0,1297440000,1,0,0.0,0,y,66,1114349,0.0,0.0,0,35,1288984065,42.353848,-71.044276,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5,USD,0.0,1294421810,1300064400,1294422000.0,1300053600,0,0,0.0,1,y,44,1179983,0.0,0.0,1,299,1268579110,38.209797,-84.558831,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
6,USD,1.0,1294425018,1297477800,1294425000.0,1297468800,0,0,1.0,1,y,36,1180179,0.0,0.0,1,706,1233437951,39.41427,-77.405089,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,USD,0.0,1294427837,1296277200,1294428000.0,1296271800,0,0,0.0,1,y,28,1180391,18.0,0.0,1,71,1288276103,35.580468,-82.563855,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
8,USD,0.0,1294428122,1297753200,1294880000.0,1297737000,1,0,0.0,1,y,19,1180423,0.0,0.0,0,0,1294428121,37.792847,-122.402082,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
9,USD,0.0,1294428286,1296882000,1294428000.0,1296876600,0,0,0.0,1,y,28,1180435,18.0,0.0,1,71,1288276103,35.580468,-82.563855,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [34]:
data.country.unique()

array(['US', 'IE', 'FR', 'CA', 'GB', '', 'AU', 'ES', 'NL', 'DE', 'VN',
       'MY', 'NZ', 'PK', 'MA', 'AR', 'MX', 'CH', None, 'SG', 'TH', 'BE',
       'PH', 'A1', 'CI', 'AT', 'ID', 'PS', 'PT', 'TR', 'NI', 'KE', 'IT',
       'HU', 'RS', 'RO', 'NG', 'CZ', 'PR', 'AE', 'BS', 'KH', 'JM', 'IN',
       'NA', 'FI', 'HR', 'BG', 'VI', 'TJ', 'GH', 'PE', 'QA', 'SI', 'GR',
       'BB', 'CM', 'IS', 'SE', 'RU', 'DZ', 'VE', 'UY', 'ZA', 'IM', 'LB',
       'CR', 'IL', 'CN', 'DK', 'CO', 'EC', 'JE'], dtype=object)

In [15]:
data.venue_state.unique()

array(['FL', 'SC', 'CA', None, 'MA', 'KY', 'MD', 'NC', '', 'TX', 'NY',
       'ME', 'MI', 'AZ', 'MN', 'IL', 'British Columbia', 'OH', 'DC', 'IN',
       'AL', 'IA', 'GA', 'London', 'Ontario', 'PA', 'New South Wales',
       'VA', 'Suffolk', 'Bournemouth', 'MASSACHUSETTS', 'Alberta', 'WA',
       'QC', 'Manchester', 'Birmingham', 'WI', 'Leeds', 'NJ', 'NV',
       'England', 'OR', 'CT', 'Ile De France', 'CO', 'ONTARIO',
       'South York', 'Nova Scotia', 'UT', 'Gt Lon', 'NSW', 'KS', 'Surrey',
       'HI', 'TN', 'BC', 'Sheffield', 'Berlin', 'West Mids', 'SK',
       'Scotland', 'QLD', 'AB', 'Newcastle Upon Tyne', 'Derry', 'LA', 'ON',
       'Hertfordshire', 'Auckland', 'Gloucestershire', 'NE', 'OK', 'MO',
       'Haute Normandie', 'VIC', 'Glasgow City', 'Queensland', 'PACA',
       'Warks', 'Sor-Trondelag', 'SA', 'ID', 'NH', 'Cardiff',
       'Greater London', 'NJ 07102-4398', 'AR', 'Edinburgh, City Of',
       'PENNSYLVANIA', 'Quebec', 'AK', 'Cambs', 'New Jersey', 'NB',
       'Midlothi

In [16]:
data.venue_country.unique()

array(['US', None, '', 'CA', 'FR', 'GB', 'AU', 'ES', 'NL', 'DE', 'IE',
       'NZ', 'NO', 'AR', 'SG', 'BE', 'PH', 'IT', 'AT', 'MA', 'ID', 'NI',
       'VN', 'AE', 'DO', 'PR', 'CZ', 'DK', 'TR', 'BS', 'HT', 'KH', 'HU',
       'IN', 'NA', 'KE', 'PK', 'HK', 'SE', 'HR', 'VI', 'TH', 'JM', 'LU',
       'PL', 'CM', 'MX', 'QA', 'IS', 'CN', 'FI', 'DZ', 'ZA', 'UY', 'PT',
       'MC', 'SK', 'RU', 'BG', 'JP', 'TT', 'CO', 'RE', 'NG', 'OM', 'JE',
       'CY', 'IL', 'MY', 'BR'], dtype=object)

In [17]:
data.user_type.unique()

array([  1,   3,   4,   5, 103,   2])

In [22]:
data.email_domain.value_counts()

gmail.com                           3097
yahoo.com                            792
hotmail.com                          399
aol.com                              198
live.com                              80
me.com                                75
ymail.com                             68
comcast.net                           60
generalassemb.ly                      58
yahoo.co.uk                           52
kineticevents.com                     52
hotmail.co.uk                         48
improvboston.com                      46
sippingnpainting.com                  39
claytonislandtours.com                37
racetonowhere.com                     35
lidf.co.uk                            35
live.fr                               33
yahoo.ca                              31
greatworldadventures.com              31
shaw.ca                               28
sbcglobal.net                         26
mac.com                               25
msn.com                               25
live.co.uk      

In [47]:
8928/14337

0.6227244193345888