In [46]:
!pip3 install xgboost google-cloud-aiplatform scikit-learn pandas-gbq joblib



In [62]:
import re
import os
import uuid

import numpy as np
from google.cloud import aiplatform
from google.cloud import bigquery
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import tqdm
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
import joblib
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler

In [55]:
# --- Notebook configuration -------------------------------------------------
# Fill in the empty values before running the cells below.

# Project passed to both the Vertex AI SDK and the BigQuery client.
PROJECT_ID = ""
# Location passed to aiplatform.init().
LOCATION = ""
# BigQuery table to train on; a leading "bq://" is accepted and stripped by
# download_table().
DATASET_URL = ""
# Local directory where the trained model is written as model.joblib.
MODEL_DIR = "./model"
# Experiment name passed to aiplatform.init(); runs are logged against it.
EXPERIMENT_NAME = "local-dev"

In [None]:
# Point the Vertex AI SDK at the configured project, location and experiment.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    experiment=EXPERIMENT_NAME,
)

# BigQuery client bound to the same project; download_table() reads it as a
# module-level name.
bq_client = bigquery.Client(project=PROJECT_ID)

In [57]:
def download_table(bq_table_uri: str):
    """Fetch every row of a BigQuery table through the notebook's client.

    Args:
        bq_table_uri: Table identifier, optionally prefixed with ``bq://``.
            The prefix is removed before the lookup; any other value is
            passed to the client unchanged.

    Returns:
        The result of ``to_dataframe()`` on the table's row iterator.

    Note:
        Relies on the module-level ``bq_client`` created in an earlier cell.
    """
    scheme = "bq://"
    table_id = (
        bq_table_uri[len(scheme):]
        if bq_table_uri.startswith(scheme)
        else bq_table_uri
    )
    table_ref = bq_client.get_table(table_id)
    row_iterator = bq_client.list_rows(table_ref)
    return row_iterator.to_dataframe()

# Load the configured table; the result is the raw input for the cells below.
df = download_table(bq_table_uri=DATASET_URL)

In [58]:
# Separate the target column from the remaining feature columns.
y = df["is_scam"]
X = df.drop(columns=["is_scam"])

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a class-weighted logistic-regression model and record its parameters,
# metrics and artifact as a single Vertex AI experiment run.
aiplatform.start_run(f"run{uuid.uuid1()}")
try:
    params = {'class_weight': 'balanced', 'max_iter': 1000}

    model = LogisticRegression(**params)
    model.fit(X_train_scaled, y_train)

    aiplatform.log_params(params)

    # Hard class labels feed the threshold-dependent metrics (F1, recall).
    y_pred = model.predict(X_test_scaled)
    # ROC AUC is a ranking metric and needs a continuous score. Passing the
    # hard labels collapses the curve to a single threshold point. Column 1 of
    # predict_proba is the probability of model.classes_[1], the greater label.
    y_score = model.predict_proba(X_test_scaled)[:, 1]

    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    metrics = {'f1': f1, 'recall': recall, 'roc_auc': roc_auc}
    print(metrics)
    aiplatform.log_metrics(metrics)

    os.makedirs(MODEL_DIR, exist_ok=True)
    model_path = f'{MODEL_DIR}/model.joblib'
    with open(model_path, 'wb') as f:
        joblib.dump(model, f)

    # The model was fitted on scaled features, so the fitted scaler is needed
    # to prepare inputs at inference time; keep it next to the model.
    scaler_path = f'{MODEL_DIR}/scaler.joblib'
    with open(scaler_path, 'wb') as f:
        joblib.dump(scaler, f)

    aiplatform.save_model(model=model)
finally:
    # Always close the run, even when a step above raised, so it is not left
    # open in the experiment.
    aiplatform.end_run()