In [48]:
from google.cloud import bigquery

In [53]:
# No project or credentials are passed explicitly, so the client relies on
# whatever the notebook's runtime environment provides.
client = bigquery.Client()
# Reuse the client's project so the dataset and table IDs built in later
# cells are fully qualified.
project_id = client.project

In [54]:
# Fully-qualified ID of the workshop dataset inside the client's project.
dataset_id = f"{project_id}.data_to_ai_workshop"

# Describe the dataset locally first (ID + location) ...
dataset_spec = bigquery.Dataset(dataset_id)
dataset_spec.location = "US"

# ... then create it. exists_ok=True keeps this cell re-runnable when the
# dataset is already there.
dataset = client.create_dataset(dataset_spec, exists_ok=True)

In [55]:
# Destination: the raw table inside the workshop dataset.
table_id = f"{project_id}.data_to_ai_workshop.fraud_raw_data"
# Source: the raw CSV file in Cloud Storage.
uri = "gs://labs.roitraining.com/data-to-ai-workshop/fraud_data_raw.csv"

In [56]:
# Load settings for the raw CSV:
#   - skip the single header row,
#   - let BigQuery infer the schema from the file contents,
#   - overwrite the destination table on every run.
# Load jobs append by default, so without WRITE_TRUNCATE each re-run of this
# notebook would add another full copy of the file to the raw table and
# silently inflate every count computed below.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

In [57]:
# Start the load from Cloud Storage into the raw table.
load_job = client.load_table_from_uri(
    uri,
    table_id,
    job_config=job_config,
)
# Wait for the job to complete; as the cell's last expression, the result is
# also displayed.
load_job.result()

LoadJob<project=qwiklabs-gcp-01-a82571ac193f, location=US, id=ee775926-143b-4139-9162-0824ad978ad0>

In [60]:
%%bigquery
-- Peek at a few rows of the raw table (no ORDER BY, so which rows come back
-- is not guaranteed).
SELECT
  *
FROM
  `data_to_ai_workshop.fraud_raw_data`
LIMIT 5


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Applicant_ID,Age,Employment_Status,Income,Number_of_Dependents,Amount_Requested,Previous_Assistance_Received,Previous_Assistance_Date,Supporting_Doc_Verified,Application_Frequency_Last_Year,IP_Address,Device_Type,Application_Date,Fraudulent
0,217,65,Unemployed,28984,4,5872,False,NaT,False,1,156.133.45.45,Mobile,2024-08-18,0
1,226,54,Self-Employed,0,1,6631,False,NaT,False,1,245.13.80.245,Tablet,2024-05-11,0
2,240,26,Self-Employed,64477,5,8612,False,NaT,True,1,213.103.170.95,Mobile,2024-08-14,0
3,252,28,Unemployed,28576,4,2951,False,NaT,True,1,234.179.149.207,Desktop,2024-06-12,0
4,266,43,Employed,44930,5,2324,False,NaT,False,1,66.109.96.227,Mobile,2024-08-16,0


In [61]:
%%bigquery
-- Number of rows for each distinct Employment_Status value.
SELECT
  Employment_Status,
  COUNT(*)
FROM
  `data_to_ai_workshop.fraud_raw_data`
GROUP BY
  Employment_Status


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Employment_Status,f0_
0,Unemployed,16584
1,Self-Employed,16682
2,Employed,16734


In [62]:
%%bigquery
-- Number of rows for each distinct Device_Type value.
SELECT
  Device_Type,
  COUNT(*)
FROM
  `data_to_ai_workshop.fraud_raw_data`
GROUP BY
  Device_Type


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Device_Type,f0_
0,Mobile,16733
1,Tablet,16652
2,Desktop,16615


In [63]:
%%bigquery
-- Row counts for the ten highest ages present in the raw table.
SELECT
  age,
  COUNT(*)
FROM
  `data_to_ai_workshop.fraud_raw_data`
GROUP BY
  age
ORDER BY
  age DESC
LIMIT 10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,age,f0_
0,65,1021
1,64,1057
2,63,1036
3,62,1072
4,61,1016
5,60,1034
6,59,1049
7,58,1078
8,57,1054
9,56,1021


In [64]:
%%bigquery
-- Build the model-ready training table from the raw load.
-- All raw columns are kept except the two booleans, which are re-emitted
-- at the end as 0/1 integers under the same names.
CREATE OR REPLACE TABLE `data_to_ai_workshop.fraud_training_data` AS
SELECT
  * EXCEPT(previous_assistance_received, supporting_doc_verified),

  -- One-hot encode Employment_Status.
  -- String equality is case-sensitive, so each literal must match the stored
  -- value exactly. The raw data holds 'Self-Employed' (capital E); the
  -- previous literal 'Self-employed' never matched and left this flag at 0
  -- for every row.
  CASE WHEN Employment_Status = 'Employed' THEN 1 ELSE 0 END AS Employment_Employed,
  CASE WHEN Employment_Status = 'Unemployed' THEN 1 ELSE 0 END AS Employment_Unemployed,
  CASE WHEN Employment_Status = 'Self-Employed' THEN 1 ELSE 0 END AS Employment_SelfEmployed,

  -- One-hot encode Device_Type.
  CASE WHEN Device_Type = 'Mobile' THEN 1 ELSE 0 END AS Device_Mobile,
  CASE WHEN Device_Type = 'Desktop' THEN 1 ELSE 0 END AS Device_Desktop,
  CASE WHEN Device_Type = 'Tablet' THEN 1 ELSE 0 END AS Device_Tablet,

  -- Age binning (one-hot encoded). Ages below 18 or NULL get 0 in every bin.
  CASE WHEN age BETWEEN 18 AND 24 THEN 1 ELSE 0 END AS age_18_24,
  CASE WHEN age BETWEEN 25 AND 34 THEN 1 ELSE 0 END AS age_25_34,
  CASE WHEN age BETWEEN 35 AND 44 THEN 1 ELSE 0 END AS age_35_44,
  CASE WHEN age BETWEEN 45 AND 54 THEN 1 ELSE 0 END AS age_45_54,
  CASE WHEN age >= 55 THEN 1 ELSE 0 END AS age_55_plus,

  -- Income relative to the amount requested. SAFE_DIVIDE yields NULL instead
  -- of an error when amount_requested is 0.
  SAFE_DIVIDE(income, amount_requested) AS Income_to_Amount_Requested,

  -- Days between the application and the previous assistance.
  -- NOTE: a missing date falls back to 1970-01-01, so applicants with no
  -- previous assistance date get "days since 1970-01-01" (roughly 19,000+
  -- for 2024 applications) rather than NULL.
  DATE_DIFF(
    COALESCE(application_date, DATE('1970-01-01')),
    COALESCE(previous_assistance_date, DATE('1970-01-01')),
    DAY
  ) AS Time_Since_Previous_Assistance_Days,

  -- Convert BOOL TRUE/FALSE to 1/0.
  CAST(previous_assistance_received AS INT64) AS Previous_Assistance_Received,
  CAST(supporting_doc_verified AS INT64) AS Supporting_Doc_Verified

FROM
  `data_to_ai_workshop.fraud_raw_data`


Query is running:   0%|          |

In [65]:
%%bigquery
-- Inspect a few rows of the engineered training table.
SELECT
  *
FROM
  `data_to_ai_workshop.fraud_training_data`
LIMIT 10



Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,Applicant_ID,Age,Employment_Status,Income,Number_of_Dependents,Amount_Requested,Previous_Assistance_Date,Application_Frequency_Last_Year,IP_Address,Device_Type,...,Device_Tablet,age_18_24,age_25_34,age_35_44,age_45_54,age_55_plus,Income_to_Amount_Requested,Time_Since_Previous_Assistance_Days,Previous_Assistance_Received,Supporting_Doc_Verified
0,4477,45,Employed,10497,3,5314,NaT,1,212.42.106.103,Desktop,...,0,0,0,0,1,0,1.975348,19778,0,1
1,5919,41,Employed,0,1,7156,NaT,1,184.225.104.59,Desktop,...,0,0,0,1,0,0,0.0,19725,0,1
2,10004,19,Employed,0,2,6293,NaT,1,236.8.2.118,Desktop,...,0,1,0,0,0,0,0.0,19846,0,0
3,24589,32,Employed,0,3,9598,NaT,1,26.131.192.97,Desktop,...,0,0,1,0,0,0,0.0,20044,0,0
4,48680,64,Employed,0,2,8213,NaT,1,72.207.154.79,Desktop,...,0,0,0,0,0,1,0.0,20058,0,1
5,18284,26,Employed,42961,3,1587,2023-05-08,1,175.39.81.172,Desktop,...,0,0,1,0,0,0,27.070573,397,1,1
6,22649,18,Employed,0,4,7763,2023-05-25,1,184.225.97.235,Desktop,...,0,1,0,0,0,0,0.0,472,1,0
7,19161,29,Employed,62567,4,5206,2023-08-04,1,245.13.111.44,Desktop,...,0,0,1,0,0,0,12.018248,470,1,1
8,23441,48,Employed,67435,2,8265,2023-12-13,1,202.18.110.45,Desktop,...,0,0,0,0,1,0,8.159105,97,1,0
9,34923,30,Employed,0,3,4288,2024-01-02,1,42.196.104.219,Desktop,...,0,0,1,0,0,0,0.0,187,1,0
