In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt

In [3]:
data =  pd.read_csv('data/fraud_train_SMOTE.csv', sep=",")
data = data.drop(columns=['city','zip', 'lat', 'long', 'merch_lat', 'merch_long'])
job_category_mapping = {
    # Healthcare
    'Psychologist, counselling': 'Healthcare',
    'Special educational needs teacher': 'Education',
    'Physiotherapist': 'Healthcare',
    'Psychologist, forensic': 'Healthcare',
    'Therapist, occupational': 'Healthcare',
    'Counsellor': 'Healthcare',
    'Ambulance person': 'Healthcare',
    'Nurse, mental health': 'Healthcare',
    'Nurse, children\'s': 'Healthcare',
    'Doctor, hospital': 'Healthcare',
    'Doctor, general practice': 'Healthcare',
    'Paediatric nurse': 'Healthcare',
    'Psychiatric nurse': 'Healthcare',
    'Clinical biochemist': 'Healthcare',
    'Pharmacist, community': 'Healthcare',
    'Physiological scientist': 'Healthcare',
    'Psychotherapist': 'Healthcare',
    'Clinical psychologist': 'Healthcare',
    'Health visitor': 'Healthcare',
    
    # Education
    'Special educational needs teacher': 'Education',
    'Primary school teacher': 'Education',
    'Secondary school teacher': 'Education',
    'Teacher, English as a foreign language': 'Education',
    'Further education lecturer': 'Education',
    'Higher education careers adviser': 'Education',
    'Educational psychologist': 'Education',
    'Teaching laboratory technician': 'Education',
    'Teacher, early years/pre': 'Education',
    'Teacher, special educational needs': 'Education',
    'Teacher, adult education': 'Education',
    'Education officer, community': 'Education',
    'Librarian, academic': 'Education',
    'Careers information officer': 'Education',
    'Museum/gallery exhibitions officer': 'Education',
    
    # Engineering
    'Engineer, land': 'Engineering',
    'Engineer, biomedical': 'Engineering',
    'Engineer, mining': 'Engineering',
    'Engineer, technical sales': 'Engineering',
    'Engineer, electronics': 'Engineering',
    'Engineer, communications': 'Engineering',
    'Engineer, structural': 'Engineering',
    'Engineer, building services': 'Engineering',
    'Engineer, civil (consulting)': 'Engineering',
    'Engineer, civil (contracting)': 'Engineering',
    'Mechanical engineer': 'Engineering',
    'Electrical engineer': 'Engineering',
    'Chemical engineer': 'Engineering',
    'Civil engineer': 'Engineering',
    'Engineer, technical': 'Engineering',
    'Mining engineer': 'Engineering',
    'Engineer, materials': 'Engineering',
    'Water engineer': 'Engineering',
    'Environmental consultant': 'Engineering',
    'Control and instrumentation engineer': 'Engineering',
    'Software engineer': 'Engineering',
    
    # Information Technology
    'IT trainer': 'Information Technology',
    'Systems developer': 'Information Technology',
    'Applications developer': 'Information Technology',
    'Systems analyst': 'Information Technology',
    'Network engineer': 'Information Technology',
    'Database administrator': 'Information Technology',
    'Web designer': 'Information Technology',
    'Programmer, multimedia': 'Information Technology',
    'Multimedia programmer': 'Information Technology',
    'Telecommunications researcher': 'Information Technology',
    'Database designer': 'Information Technology',
    'Data scientist': 'Information Technology',
    'IT consultant': 'Information Technology',
    'Data analyst': 'Information Technology',
    'Software developer': 'Information Technology',
    
    # Finance
    'Corporate investment banker': 'Finance',
    'Financial adviser': 'Finance',
    'Financial trader': 'Finance',
    'Investment analyst': 'Finance',
    'Tax inspector': 'Finance',
    'Chartered accountant': 'Finance',
    'Risk analyst': 'Finance',
    'Pensions consultant': 'Finance',
    'Chartered loss adjuster': 'Finance',
    'Loss adjuster, chartered': 'Finance',
    'Purchasing manager': 'Finance',
    'Operations geologist': 'Finance',
    'Accounting technician': 'Finance',
    'Banker': 'Finance',
    'Finance manager': 'Finance',
    'Tax adviser': 'Finance',
    
    # Media
    'Designer, multimedia': 'Media',
    'Designer, furniture': 'Media',
    'Designer, jewellery': 'Media',
    'Designer, ceramics/pottery': 'Media',
    'Designer, industrial/product': 'Media',
    'Designer, interior/spatial': 'Media',
    'Animator': 'Media',
    'Fine artist': 'Media',
    'Illustrator': 'Media',
    'Photographer': 'Media',
    'Film/video editor': 'Media',
    'Video editor': 'Media',
    'Producer, radio': 'Media',
    'Producer, television/film/video': 'Media',
    'Journalist, newspaper': 'Media',
    'Editor, magazine features': 'Media',
    'Editor, film/video': 'Media',
    'Advertising account executive': 'Media',
    'Copywriter, advertising': 'Media',
    'Public relations account executive': 'Media',
    
    # Legal
    'Patent attorney': 'Legal',
    'Solicitor, Scotland': 'Legal',
    'Solicitor': 'Legal',
    'Barrister': 'Legal',
    'Chartered legal executive (England and Wales)': 'Legal',
    'Legal secretary': 'Legal',
    'Claims inspector/assessor': 'Legal',
    'Chartered loss adjuster': 'Legal',
    'Trade mark attorney': 'Legal',
    
    # Social Services 
    'Probation officer': 'Social Services ',
    'Youth worker': 'Social Services ',
    'Social researcher': 'Social Services ',
    'Volunteer coordinator': 'Social Services ',
    'Social work assistant': 'Social Services ',
    'Counselling psychologist': 'Social Services ',
    'Mental health nurse': 'Social Services ',
    
    # Science
    'Scientist, research (maths)': 'Science',
    'Research officer, trade union': 'Science',
    'Research officer, political party': 'Science',
    'Research scientist (physical sciences)': 'Science',
    'Geochemist': 'Science',
    'Geologist, engineering': 'Science',
    'Biochemist, clinical': 'Science',
    'Immunologist': 'Science',
    'Phytotherapist': 'Science',
    'Meteorologist': 'Science',
    
    # Construction
    'Surveyor, land': 'Construction',
    'Surveyor, minerals': 'Construction',
    'Surveyor, commercial/residential': 'Construction',
    'Surveyor, rural practice': 'Construction',
    'Architect': 'Construction',
    'Landscape architect': 'Construction',
    'Town planner': 'Construction',
    'Building surveyor': 'Construction',
    'Building control surveyor': 'Construction',
    
    # Retail
    'Retail merchandiser': 'Retail',
    'Retail buyer': 'Retail',
    'Sales professional, IT': 'Retail',
    'Sales executive': 'Retail',
    'Sales manager': 'Retail',
    'Retail manager': 'Retail',
    'Visual merchandiser': 'Retail',
    
    # Public Services
    'Police officer': 'Public Services',
    'Civil Service fast streamer': 'Public Services',
    'Immigration officer': 'Public Services',
    'Regulatory affairs officer': 'Public Services',
    'Social services officer': 'Public Services',
    'Community development worker': 'Public Services',
    'Exhibitions officer, museum/gallery': 'Public Services',
    'Museum/gallery conservator': 'Public Services',
    'Public relations officer': 'Public Services',
    'Public affairs consultant': 'Public Services',
    
    # Transport
    'Transport planner': 'Transport',
    'Pilot, airline': 'Transport',
    'Air traffic controller': 'Transport',
    'Shipping broker': 'Transport',
    'Freight forwarder': 'Transport',
    'Logistics and distribution manager': 'Transport',
    'Supply chain manager': 'Transport',
}



# Apply the categorization
data['job_category'] = data['job'].apply(lambda x: job_category_mapping.get(x, 'Other'))
data = data.drop(columns=['job'])

data['category'] = data['category'].replace({'misc_net': 'misc_online', 
                                                         'grocery_pos': 'grocery_pt_of_sale',
                                                        'gas_transport': 'gas',
                                                        'misc_pos' : 'misc_pt_of_sale',
                                                        'grocery_net': 'grocery_online',
                                                        'shopping_net': 'shopping_online',
                                                         'shopping_pos': 'shopping_pt_of_sale',})



In [4]:
state_stats = data.groupby('state')['is_fraud'].agg(['sum', 'count'])
state_stats['state_fraud_prob'] = state_stats['sum'] / state_stats['count']
data = data.merge(state_stats[['state_fraud_prob']], on='state', how='left')
data = data.drop(columns=['state'])

data.head()

Unnamed: 0,merchant,category,amt,gender,city_pop,transaction_year,transaction_month,transaction_day,transaction_hour,age,distance_in_miles,is_fraud,job_category,state_fraud_prob
0,"fraud_Rippin, Kub and Mann",misc_online,4.97,F,3495,2019,1,1,0,30,48.838809,0,Healthcare,0.14731
1,"fraud_Heller, Gutmann and Zieme",grocery_pt_of_sale,107.23,F,149,2019,1,1,0,40,18.773185,0,Education,0.154406
2,fraud_Lind-Buckridge,entertainment,220.11,M,4154,2019,1,1,0,56,67.236892,0,Other,0.059643
3,"fraud_Kutch, Hermiston and Farrell",gas,45.0,M,1939,2019,1,1,0,51,59.449252,0,Legal,0.086858
4,fraud_Keeling-Crist,misc_pt_of_sale,41.96,M,99,2019,1,1,0,32,48.192064,0,Other,0.191225


In [5]:
merchant_stats = data.groupby('merchant')['is_fraud'].agg(['sum', 'count'])
merchant_stats['merchant_fraud_prob'] = merchant_stats['sum'] / merchant_stats['count']
data = data.merge(merchant_stats[['merchant_fraud_prob']], on='merchant', how='left')
data = data.drop(columns=['merchant'])
data["gender"] = data["gender"].apply(lambda x: int(x == "M"))
data.head()

Unnamed: 0,category,amt,gender,city_pop,transaction_year,transaction_month,transaction_day,transaction_hour,age,distance_in_miles,is_fraud,job_category,state_fraud_prob,merchant_fraud_prob
0,misc_online,4.97,0,3495,2019,1,1,0,30,48.838809,0,Healthcare,0.14731,0.36308
1,grocery_pt_of_sale,107.23,0,149,2019,1,1,0,40,18.773185,0,Education,0.154406,0.260896
2,entertainment,220.11,1,4154,2019,1,1,0,56,67.236892,0,Other,0.059643,0.096512
3,gas,45.0,1,1939,2019,1,1,0,51,59.449252,0,Legal,0.086858,0.135458
4,misc_pt_of_sale,41.96,1,99,2019,1,1,0,32,48.192064,0,Other,0.191225,0.115942


In [6]:
data.to_csv('data/model_data-full1.csv', index=False) 

### Steps to convert test data

In [7]:
t_data =  pd.read_csv('data/df_test_cleaned.csv', sep=",")

t_data = t_data.drop(columns=['city','zip', 'lat', 'long', 'merch_lat', 'merch_long'])

job_category_mapping = {
    # Healthcare
    'Psychologist, counselling': 'Healthcare',
    'Special educational needs teacher': 'Education',
    'Physiotherapist': 'Healthcare',
    'Psychologist, forensic': 'Healthcare',
    'Therapist, occupational': 'Healthcare',
    'Counsellor': 'Healthcare',
    'Ambulance person': 'Healthcare',
    'Nurse, mental health': 'Healthcare',
    'Nurse, children\'s': 'Healthcare',
    'Doctor, hospital': 'Healthcare',
    'Doctor, general practice': 'Healthcare',
    'Paediatric nurse': 'Healthcare',
    'Psychiatric nurse': 'Healthcare',
    'Clinical biochemist': 'Healthcare',
    'Pharmacist, community': 'Healthcare',
    'Physiological scientist': 'Healthcare',
    'Psychotherapist': 'Healthcare',
    'Clinical psychologist': 'Healthcare',
    'Health visitor': 'Healthcare',
    
    # Education
    'Special educational needs teacher': 'Education',
    'Primary school teacher': 'Education',
    'Secondary school teacher': 'Education',
    'Teacher, English as a foreign language': 'Education',
    'Further education lecturer': 'Education',
    'Higher education careers adviser': 'Education',
    'Educational psychologist': 'Education',
    'Teaching laboratory technician': 'Education',
    'Teacher, early years/pre': 'Education',
    'Teacher, special educational needs': 'Education',
    'Teacher, adult education': 'Education',
    'Education officer, community': 'Education',
    'Librarian, academic': 'Education',
    'Careers information officer': 'Education',
    'Museum/gallery exhibitions officer': 'Education',
    
    # Engineering
    'Engineer, land': 'Engineering',
    'Engineer, biomedical': 'Engineering',
    'Engineer, mining': 'Engineering',
    'Engineer, technical sales': 'Engineering',
    'Engineer, electronics': 'Engineering',
    'Engineer, communications': 'Engineering',
    'Engineer, structural': 'Engineering',
    'Engineer, building services': 'Engineering',
    'Engineer, civil (consulting)': 'Engineering',
    'Engineer, civil (contracting)': 'Engineering',
    'Mechanical engineer': 'Engineering',
    'Electrical engineer': 'Engineering',
    'Chemical engineer': 'Engineering',
    'Civil engineer': 'Engineering',
    'Engineer, technical': 'Engineering',
    'Mining engineer': 'Engineering',
    'Engineer, materials': 'Engineering',
    'Water engineer': 'Engineering',
    'Environmental consultant': 'Engineering',
    'Control and instrumentation engineer': 'Engineering',
    'Software engineer': 'Engineering',
    
    # Information Technology
    'IT trainer': 'Information Technology',
    'Systems developer': 'Information Technology',
    'Applications developer': 'Information Technology',
    'Systems analyst': 'Information Technology',
    'Network engineer': 'Information Technology',
    'Database administrator': 'Information Technology',
    'Web designer': 'Information Technology',
    'Programmer, multimedia': 'Information Technology',
    'Multimedia programmer': 'Information Technology',
    'Telecommunications researcher': 'Information Technology',
    'Database designer': 'Information Technology',
    'Data scientist': 'Information Technology',
    'IT consultant': 'Information Technology',
    'Data analyst': 'Information Technology',
    'Software developer': 'Information Technology',
    
    # Finance
    'Corporate investment banker': 'Finance',
    'Financial adviser': 'Finance',
    'Financial trader': 'Finance',
    'Investment analyst': 'Finance',
    'Tax inspector': 'Finance',
    'Chartered accountant': 'Finance',
    'Risk analyst': 'Finance',
    'Pensions consultant': 'Finance',
    'Chartered loss adjuster': 'Finance',
    'Loss adjuster, chartered': 'Finance',
    'Purchasing manager': 'Finance',
    'Operations geologist': 'Finance',
    'Accounting technician': 'Finance',
    'Banker': 'Finance',
    'Finance manager': 'Finance',
    'Tax adviser': 'Finance',
    
    # Media
    'Designer, multimedia': 'Media',
    'Designer, furniture': 'Media',
    'Designer, jewellery': 'Media',
    'Designer, ceramics/pottery': 'Media',
    'Designer, industrial/product': 'Media',
    'Designer, interior/spatial': 'Media',
    'Animator': 'Media',
    'Fine artist': 'Media',
    'Illustrator': 'Media',
    'Photographer': 'Media',
    'Film/video editor': 'Media',
    'Video editor': 'Media',
    'Producer, radio': 'Media',
    'Producer, television/film/video': 'Media',
    'Journalist, newspaper': 'Media',
    'Editor, magazine features': 'Media',
    'Editor, film/video': 'Media',
    'Advertising account executive': 'Media',
    'Copywriter, advertising': 'Media',
    'Public relations account executive': 'Media',
    
    # Legal
    'Patent attorney': 'Legal',
    'Solicitor, Scotland': 'Legal',
    'Solicitor': 'Legal',
    'Barrister': 'Legal',
    'Chartered legal executive (England and Wales)': 'Legal',
    'Legal secretary': 'Legal',
    'Claims inspector/assessor': 'Legal',
    'Chartered loss adjuster': 'Legal',
    'Trade mark attorney': 'Legal',
    
    # Social Services 
    'Probation officer': 'Social Services ',
    'Youth worker': 'Social Services ',
    'Social researcher': 'Social Services ',
    'Volunteer coordinator': 'Social Services ',
    'Social work assistant': 'Social Services ',
    'Counselling psychologist': 'Social Services ',
    'Mental health nurse': 'Social Services ',
    
    # Science
    'Scientist, research (maths)': 'Science',
    'Research officer, trade union': 'Science',
    'Research officer, political party': 'Science',
    'Research scientist (physical sciences)': 'Science',
    'Geochemist': 'Science',
    'Geologist, engineering': 'Science',
    'Biochemist, clinical': 'Science',
    'Immunologist': 'Science',
    'Phytotherapist': 'Science',
    'Meteorologist': 'Science',
    
    # Construction
    'Surveyor, land': 'Construction',
    'Surveyor, minerals': 'Construction',
    'Surveyor, commercial/residential': 'Construction',
    'Surveyor, rural practice': 'Construction',
    'Architect': 'Construction',
    'Landscape architect': 'Construction',
    'Town planner': 'Construction',
    'Building surveyor': 'Construction',
    'Building control surveyor': 'Construction',
    
    # Retail
    'Retail merchandiser': 'Retail',
    'Retail buyer': 'Retail',
    'Sales professional, IT': 'Retail',
    'Sales executive': 'Retail',
    'Sales manager': 'Retail',
    'Retail manager': 'Retail',
    'Visual merchandiser': 'Retail',
    
    # Public Services
    'Police officer': 'Public Services',
    'Civil Service fast streamer': 'Public Services',
    'Immigration officer': 'Public Services',
    'Regulatory affairs officer': 'Public Services',
    'Social services officer': 'Public Services',
    'Community development worker': 'Public Services',
    'Exhibitions officer, museum/gallery': 'Public Services',
    'Museum/gallery conservator': 'Public Services',
    'Public relations officer': 'Public Services',
    'Public affairs consultant': 'Public Services',
    
    # Transport
    'Transport planner': 'Transport',
    'Pilot, airline': 'Transport',
    'Air traffic controller': 'Transport',
    'Shipping broker': 'Transport',
    'Freight forwarder': 'Transport',
    'Logistics and distribution manager': 'Transport',
    'Supply chain manager': 'Transport',
}



# Apply the categorization
t_data['job_category'] = t_data['job'].apply(lambda x: job_category_mapping.get(x, 'Other'))
t_data = t_data.drop(columns=['job'])

t_data['category'] = t_data['category'].replace({'misc_net': 'misc_online', 
                                                         'grocery_pos': 'grocery_pt_of_sale',
                                                        'gas_transport': 'gas',
                                                        'misc_pos' : 'misc_pt_of_sale',
                                                        'grocery_net': 'grocery_online',
                                                        'shopping_net': 'shopping_online',
                                                         'shopping_pos': 'shopping_pt_of_sale',})


state_stats = t_data.groupby('state')['is_fraud'].agg(['sum', 'count'])
state_stats['state_fraud_prob'] = state_stats['sum'] / state_stats['count']
t_data = t_data.merge(state_stats[['state_fraud_prob']], on='state', how='left')
t_data = t_data.drop(columns=['state'])


merchant_stats = t_data.groupby('merchant')['is_fraud'].agg(['sum', 'count'])
merchant_stats['merchant_fraud_prob'] = merchant_stats['sum'] / merchant_stats['count']
t_data = t_data.merge(merchant_stats[['merchant_fraud_prob']], on='merchant', how='left')
t_data = t_data.drop(columns=['merchant'])
t_data["gender"] = t_data["gender"].apply(lambda x: int(x == "M"))
t_data.head()


Unnamed: 0,category,amt,gender,city_pop,is_fraud,transaction_year,transaction_month,transaction_day,transaction_hour,age,distance_in_miles,job_category,state_fraud_prob,merchant_fraud_prob
0,personal_care,2.86,1,333497,0,2020,6,21,12,52,15.261955,Engineering,0.003429,0.004076
1,personal_care,29.84,0,302,0,2020,6,21,12,30,65.198157,Retail,0.0,0.001274
2,health_fitness,41.28,0,34496,0,2020,6,21,12,49,36.711068,Other,0.004872,0.001325
3,misc_pt_of_sale,60.05,1,54767,0,2020,6,21,12,32,17.211284,Other,0.002928,0.004219
4,travel,3.19,1,1126,0,2020,6,21,12,64,64.831552,Other,0.003101,0.002703


In [8]:
t_data.to_csv('data/model_data-test.csv', index=False) 

In [9]:
data.head()

Unnamed: 0,category,amt,gender,city_pop,transaction_year,transaction_month,transaction_day,transaction_hour,age,distance_in_miles,is_fraud,job_category,state_fraud_prob,merchant_fraud_prob
0,misc_online,4.97,0,3495,2019,1,1,0,30,48.838809,0,Healthcare,0.14731,0.36308
1,grocery_pt_of_sale,107.23,0,149,2019,1,1,0,40,18.773185,0,Education,0.154406,0.260896
2,entertainment,220.11,1,4154,2019,1,1,0,56,67.236892,0,Other,0.059643,0.096512
3,gas,45.0,1,1939,2019,1,1,0,51,59.449252,0,Legal,0.086858,0.135458
4,misc_pt_of_sale,41.96,1,99,2019,1,1,0,32,48.192064,0,Other,0.191225,0.115942


### Converting test and train categorical data to numbers

In [10]:
# Run it if needed

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

data['job_category'] = label_encoder.fit_transform(data['job_category'])
data['category'] = label_encoder.fit_transform(data['category'])

t_data['job_category'] = label_encoder.fit_transform(t_data['job_category'])
t_data['category'] = label_encoder.fit_transform(t_data['category'])


In [11]:
t_data.head()

Unnamed: 0,category,amt,gender,city_pop,is_fraud,transaction_year,transaction_month,transaction_day,transaction_hour,age,distance_in_miles,job_category,state_fraud_prob,merchant_fraud_prob
0,10,2.86,1,333497,0,2020,6,21,12,52,15.261955,2,0.003429,0.004076
1,10,29.84,0,302,0,2020,6,21,12,30,65.198157,10,0.0,0.001274
2,5,41.28,0,34496,0,2020,6,21,12,49,36.711068,8,0.004872,0.001325
3,9,60.05,1,54767,0,2020,6,21,12,32,17.211284,8,0.002928,0.004219
4,13,3.19,1,1126,0,2020,6,21,12,64,64.831552,8,0.003101,0.002703


In [12]:
data.head()

Unnamed: 0,category,amt,gender,city_pop,transaction_year,transaction_month,transaction_day,transaction_hour,age,distance_in_miles,is_fraud,job_category,state_fraud_prob,merchant_fraud_prob
0,8,4.97,0,3495,2019,1,1,0,30,48.838809,0,4,0.14731,0.36308
1,4,107.23,0,149,2019,1,1,0,40,18.773185,0,1,0.154406,0.260896
2,0,220.11,1,4154,2019,1,1,0,56,67.236892,0,8,0.059643,0.096512
3,2,45.0,1,1939,2019,1,1,0,51,59.449252,0,6,0.086858,0.135458
4,9,41.96,1,99,2019,1,1,0,32,48.192064,0,8,0.191225,0.115942
