In [3]:
import pandas as pd
import requests
from datetime import datetime
# Externas
from evidently import ColumnMapping
from evidently.metrics import (
    RegressionQualityMetric,
    RegressionPredictedVsActualScatter,
    RegressionPredictedVsActualPlot,
    RegressionErrorPlot,
    RegressionAbsPercentageErrorPlot,
    RegressionErrorDistribution,
    RegressionErrorNormality,
    RegressionTopErrorMetric
)
from evidently.metric_preset import TargetDriftPreset
from evidently.report import Report
from typing import List, Text
from google.cloud import bigquery
import logging
from evidently import ColumnMapping

#----------------

from fastapi import FastAPI, UploadFile, File, BackgroundTasks
from fastapi.responses import(
    HTMLResponse,
    JSONResponse,
    Response,
    FileResponse
)
import numpy as np
import json
#from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List, Dict, Any, Callable, Text
import pandas as pd
import uvicorn
from joblib import load
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from mangum import Mangum
import os
import logging



ModuleNotFoundError: No module named 'mangum'

# CARGAMOS CURRENT y REFERENCE DATA

### Current data

In [29]:
# Cargar el archivo CSV
credentials_path = 'src/connections/protean-fabric-386717-d6a21dd66382.json'
credentials_dir = open(credentials_path, 'rb')

In [43]:
# bigquery_connection.py
from google.cloud import bigquery
from google.oauth2 import service_account
import os

def connect_to_bigquery():

    # Load BigQuery credentials from the secret
    #credentials_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')

    # Load BigQuery credentials from the service_account.json file
    #credentials = service_account.Credentials.from_service_account_info(credentials_json)

    credentials = service_account.Credentials.from_service_account_file('src/connections/protean-fabric-386717-d6a21dd66382.json')

    # Connect to the BigQuery API using the credentials
    client = bigquery.Client(credentials=credentials)
    
    return client

In [44]:
def load_current_data(window_size: int = 2000) -> pd.DataFrame:
    try:
        client = connect_to_bigquery()
        project_id = 'protean-fabric-386717'
        dataset_extract_id = "ml_datasets"
        table_extract_id = "insurance_predictions_v2"

        query = f"SELECT * FROM `{project_id}.{dataset_extract_id}.{table_extract_id}`\
        ORDER BY  date  DESC\
        LIMIT {window_size};\
        """
        current_data= client.query(query).to_dataframe() 
        return current_data
   
    except Exception as e:
        logging.error(e)
        raise e

In [45]:
current_data = load_current_data()


In [46]:
current_data.drop(columns=['user_cod', 'date'], inplace=True)
#current_data.rename(columns={'prediction': 'target'}, inplace=True)

In [47]:
current_data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,prediction
0,40,female,29.29,0,no,northwest,10734.833008
1,26,female,32.03,1,no,northeast,6566.753906
2,34,female,28.59,2,yes,southeast,17528.738281
3,24,female,27.6,1,yes,northwest,15099.791016
4,47,female,33.43,0,yes,northwest,46186.519531


### Reference data

In [48]:
def load_reference_data() -> pd.DataFrame: #columns: List[Text]) -> pd.DataFrame:
    #train_file = "data/insurance.csv"
    train_file = "insurance.csv"
    reference_data = pd.read_csv(train_file)
    return reference_data

In [49]:
reference_data = load_reference_data()

In [50]:
reference_data.rename(columns={'charges': 'target'}, inplace=True)

In [51]:
reference_data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,target
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# DEFINIR ESQUEMA DE REPORTE

In [None]:
print("Prepare column_mapping object for Evidently reports")
    column_mapping = ColumnMapping()
    column_mapping.target = target_col
    column_mapping.prediction = prediction_col
    column_mapping.numerical_features = num_features
    column_mapping.categorical_features = cat_features

In [61]:
# config to column mapping parameter for Evidently
def get_column_mapping() -> ColumnMapping:
    column_mapping = ColumnMapping()
    column_mapping.target = 'target'
    column_mapping.prediction = 'prediction'
    column_mapping.numerical_features = ['age', 'bmi', 'children']
    column_mapping.categorical_features = ['sex', 'smoker', 'region']
    return column_mapping

In [65]:
print(column_mapping)

ColumnMapping(target='target', prediction='prediction', datetime='datetime', id=None, numerical_features=['age', 'bmi', 'children'], categorical_features=['sex', 'smoker', 'region'], datetime_features=None, target_names=None, task=None, pos_label=1, text_features=None)


In [67]:
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, DataQualityPreset

In [72]:
drift_report = Report(metrics=[DataDriftPreset(), TargetDriftPreset()])
 
drift_report.run(reference_data=reference_data, current_data=current_data, column_mapping=column_mapping)


ValueError: Column (target) is partially present in data

In [70]:
drift_report

### REPORTE

In [62]:
def build_model_performance_report(
        reference_data: pd.DataFrame,
        current_data: pd.DataFrame,
        column_mapping: ColumnMapping) -> Text:

    model_performance_report = Report(metrics =[
        RegressionQualityMetric(),
        RegressionPredictedVsActualScatter(),
        RegressionPredictedVsActualPlot(),
        RegressionErrorPlot(),
        RegressionAbsPercentageErrorPlot(),
        RegressionErrorDistribution(),
        RegressionErrorNormality(),
        RegressionTopErrorMetric()
    ])
    model_performance_report.run(
        reference_data = reference_data,
        current_data = current_data,
        column_mapping = column_mapping)

    report_path = "reports/model_performance.html"
    model_performance_report.save_html(report_path)

    return report_path

In [63]:
def monitor_model_performance(window_size: int = 3000) -> FileResponse:

    logging.info('Read current data')
    current_data: pd.DataFrame = load_current_data(window_size)
    current_data.drop(columns=['user_cod', 'date'], inplace=True)
    #current_data.rename(columns={'prediction': 'target'}, inplace=True)

    logging.info('Read reference data')
    reference_data = load_reference_data()#columns=DATA_COLUMNS['columns'])
    reference_data.rename(columns={'charges': 'target'}, inplace=True)
    
    logging.info('Build report')
    column_mapping: ColumnMapping = get_column_mapping()
    report_path: Text = build_model_performance_report(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=column_mapping)
    

    logging.info('Return report as html')
    return FileResponse(report_path)


In [64]:
RUN  = monitor_model_performance()

ValueError: Column (target) is partially present in data

In [73]:
import pandas as pd
from pathlib import Path
import joblib
import pandas as pd

In [74]:
path = "data/raw/green_tripdata_2021-01.parquet"
data = pd.read_parquet(path)
#data = data.sample(frac=0.3)
    

In [75]:
data = data.sample(frac=0.3)

In [77]:
data.dtypes

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int64
DOLocationID                      int64
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                        object
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
dtype: object