In [10]:
import pathlib
from datascience_core.modelling.azure_model_manager import AzureModelManager

In [11]:
model_manager = AzureModelManager(
    ml_studio_name="mlw-ml-dev",
    subscription_id="6cbe45a0-6565-4c4a-b1a9-0929f276bbcd",
    resource_group="rg-data-science-dev",
)

In [12]:
model_manager.initialise_workspace()

In [13]:
model_manager.get_model_files_from_registered_model(model_name="PP5_v1")

In [14]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# load model.pkl from the model folder into an XGBClassifier object
model = CatBoostClassifier()
model.load_model(str(pathlib.Path.cwd() / "pp5_v1.cb"))

<catboost.core.CatBoostClassifier at 0x261f6e29ba0>

In [16]:
from datascience_core.data_retrieval import ProjectDatasetManager

#dataset_manager = ProjectDatasetManager("affiliate_suppression")
#datasets = dataset_manager.load_datasets("20230516_DS_Export_6months_with_retro")
#affiliate_leads = datasets["20230516_DS_Export_6months_with_retro"]

pulling dataset 20230516_DS_Export_6months_with_retro


AttributeError: 'StorageStreamDownloader' object has no attribute 'read'

In [17]:
import pandas as pd

# read in 4016852c-d13f-4292-9fa8-6b86d7f0b9b8 from the dataset folder into a pandas dataframe
affiliate_leads2 = pd.read_csv(
    str(pathlib.Path.cwd() / "4016852c-d13f-4292-9fa8-6b86d7f0b9b8"))

  affiliate_leads2 = pd.read_csv(


In [22]:
from datascience_core.data_transformation import ColumnDotRenamer, CatTypeConverter
from datascience_core.data_transformation import ColumnFiller
from datascience_core.data_transformation import ValueReplacer
from datascience_core.data_transformation import DataFramePipe
from typing import List
import os


class PPModel:
    def __init__(self, fmt="flat"):
        self.fmt = fmt
        self.model = CatBoostClassifier()

        self.model.load_model(str(pathlib.Path.cwd() / "pp5_v1.cb"))
        self.make_categorical = list(
            pd.DataFrame(data=self.model.get_cat_feature_indices(), columns=["indices"])
            .merge(
                pd.DataFrame(
                    data=self.model.feature_names_, columns=["features"]
                ).reset_index(),
                how="left",
                left_on="indices",
                right_on="index",
            )["features"]
            .values
        )

        self.dfTop100Features = pd.DataFrame.from_dict(
            data={
                "features": self.model.feature_names_,
                "values": self.model.feature_importances_,
            }
        ).sort_values(by="values", ascending=False)[0:100]


        step1 = ColumnDotRenamer(fmt=fmt, from_name=" ", to_name=".")
        step3 = ColumnFiller(
            self.model.feature_names_,
            self.dfTop100Features["features"].values,
            fmt=fmt,
        )

        step4 = ValueReplacer()
        step5 = CatTypeConverter(self.make_categorical)

        self.pipe = DataFramePipe([step1, step3, step4, step5])

    def preprocess(self, df_in):
        return self.pipe.run(df_in)

    def make_prediction(self, df_in):
        return list(self.model.predict_proba(df_in[self.model.feature_names_])[:, 1])

    def postprocess(self, confidence: float, amount_to_finance: float) -> List[str]:
        df_out = pd.DataFrame.from_dict(
            {"Confidence": confidence, "AMF": amount_to_finance}
        )
        df_out["Prediction"] = "subprime"
        df_out.loc[df_out["Confidence"] > 0.65, "Prediction"] = "prime"
        return list(df_out["Prediction"].values)

In [23]:
pp = PPModel()

In [None]:
preprocess1 = pp.preprocess(affiliate_leads2[0:10000])

In [None]:
Results1 = pp.make_prediction(preprocess1)

In [None]:
affiliate_leads2[0:100000]

In [62]:
import gc
import numpy as np

def predict_dataset_in_chunks(data, chunk_size=10000):
    total_rows = len(data)
    num_chunks = int(np.ceil(total_rows / chunk_size))

    model = PPModel()
    predictions_list = []

    for i in range(num_chunks):
        start_index = i * chunk_size
        end_index = min((i + 1) * chunk_size, total_rows)
        chunk_data = data[start_index:end_index]
        print(f"Processing chunk {i+1}/{num_chunks}...")

        preprocessed_data = model.preprocess(chunk_data)
        predictions = model.make_prediction(preprocessed_data)
        predictions_list.append(predictions)

        # Free up memory by deleting unnecessary variables
        #del preprocessed_data, predictions
        #gc.collect()

    return np.array(predictions_list).flatten()



In [64]:
predictions = predict_dataset_in_chunks(affiliate_leads2,10000)

Processing chunk 1/59...
Processing chunk 2/59...
Processing chunk 3/59...
Processing chunk 4/59...
Processing chunk 5/59...
Processing chunk 6/59...
Processing chunk 7/59...
Processing chunk 8/59...
Processing chunk 9/59...
Processing chunk 10/59...
Processing chunk 11/59...
Processing chunk 12/59...
Processing chunk 13/59...
Processing chunk 14/59...
Processing chunk 15/59...
Processing chunk 16/59...
Processing chunk 17/59...
Processing chunk 18/59...
Processing chunk 19/59...
Processing chunk 20/59...
Processing chunk 21/59...
Processing chunk 22/59...
Processing chunk 23/59...
Processing chunk 24/59...
Processing chunk 25/59...
Processing chunk 26/59...
Processing chunk 27/59...
Processing chunk 28/59...
Processing chunk 29/59...
Processing chunk 30/59...
Processing chunk 31/59...
Processing chunk 32/59...
Processing chunk 33/59...
Processing chunk 34/59...
Processing chunk 35/59...
Processing chunk 36/59...
Processing chunk 37/59...
Processing chunk 38/59...
Processing chunk 39/5

  return np.array(predictions_list).flatten()


In [65]:
#save predictions to a new dataframe and save it to csv
df = pd.DataFrame(predictions)
df.to_csv("predictions.csv")


In [70]:
# flatten the numpy array of list (predictions) into one flat numpy array
predictions2 = np.array(predictions).concatenate()

AttributeError: 'numpy.ndarray' object has no attribute 'concatenate'

In [71]:
predictions2 = np.concatenate(predictions)

In [72]:
predictions2

array([0.42394842, 0.34127297, 0.05663861, ..., 0.04243624, 0.04505876,
       0.04243624])

In [73]:
# save the predictions2 to a csv file
np.savetxt("predictions2.csv", predictions2, delimiter=",")

In [74]:
affiliate_leads2["Prediction"] = predictions2

In [79]:
affiliate_leads2[["Prediction","ApplicationId"]].to_csv("predictions_with_id.csv")

In [76]:
affiliate_leads2

Unnamed: 0,ApplicationId,App.MainCustomerId,App.ApplicationDate,App.AmountToFinance,App.VehicleType,App.Title,App.Gender,App.AgeAtApplication,App.Maritalstatus,App.DrivingLicenceType,...,Make,GlassAdjustedPrice,Mileage,FuelType,Transmission,Colour,IsWrittenAgreement,IsDocOut,IsApproval,Prediction
0,12857471,12492273,2022-12-01 08:55:00,7995.0,Car,Mr,Male,57.0,Married,International Licence,...,,,,,,,False,False,True,0.423948
1,12858185,12492987,2022-12-01 12:13:00,12995.0,Car,Mr,Male,19.0,Single,Full UK Driving Licence,...,,,,,,,False,False,False,0.341273
2,12858412,12493214,2022-12-01 13:16:00,5700.0,Car,Mr,Male,54.0,Married,Full UK Driving Licence,...,,,,,,,False,False,False,0.056639
3,12858430,12493232,2022-12-01 13:21:00,17500.0,Car,Mr,Male,56.0,Married,Full UK Driving Licence,...,,,,,,,False,False,True,0.093321
4,12858682,12493484,2022-12-01 14:27:00,6500.0,Van,Mr,Male,41.0,Married,Full UK Driving Licence,...,,,,,,,False,False,True,0.457911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580101,13541028,13162527,2023-04-24 13:04:00,7900.0,Car,Mrs,Female,32.0,Cohabiting,Full UK Driving Licence,...,PEUGEOT,0,71182.0,PETROL,MANUAL 5 GEARS,WHITE,True,True,True,0.169504
580102,13541028,13162527,2023-04-24 13:04:00,7900.0,Car,Mrs,Female,32.0,Cohabiting,Full UK Driving Licence,...,PEUGEOT,0,71182.0,PETROL,MANUAL 5 GEARS,WHITE,True,True,True,0.208516
580103,13138667,12775547,2023-02-02 21:32:00,30000.0,Car,Mr,Male,50.0,Cohabiting,Full UK Driving Licence,...,MERCEDES-BENZ,0,76000.0,PETROL,AUTO 7 GEARS,GREY,True,True,True,0.042436
580104,12922661,12557904,2022-12-18 10:40:00,5000.0,Car,Mr,Male,36.0,Married,Full UK Driving Licence,...,NISSAN,0,98419.0,DIESEL,MANUAL 5 GEARS,BLACK,True,True,True,0.045059
