# Notebook with API code

In [28]:
# Library import
import pandas as pd
import numpy as np
import unidecode
from datetime import timedelta

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.compose import 

from sklearn.cluster import KMeans

import os

import pickle

In [4]:
# Function definition
def pretty_string(column):
    """Return a normalised, ASCII-only, lower-case version of a text value.

    Runs of whitespace are collapsed to a single space (leading and trailing
    whitespace is removed), the text is lower-cased, and accented characters
    are transliterated with ``unidecode``.

    Arguments:
        column: the string to normalise.

    Returns:
        The normalised string.
    """
    # str.split() with no argument splits on any whitespace run, so
    # re-joining with a single space collapses repeated blanks.
    collapsed = ' '.join(column.split())
    lowered = collapsed.lower()
    return unidecode.unidecode(lowered)

## Creating model / update model

In [34]:
def update_model(PATH):
    """Rebuild the customer clustering model from the Olist CSV exports.

    The function loads the raw tables, builds one row per delivered order,
    removes extreme price/freight outliers with an isolation forest,
    aggregates the orders into per-customer features, fits a 5-cluster
    KMeans on customers with at least two purchases, and pickles both the
    fitted scaler and the fitted model.

    Arguments:
        PATH: directory containing ``olist_orders_dataset.csv``,
            ``olist_order_items_dataset.csv``,
            ``olist_order_reviews_dataset.csv``,
            ``olist_customers_dataset.csv`` and ``BRAZIL_CITIES.csv``.
            An empty string (or None) means the current working directory.

    Returns:
        The fitted KMeans model.

    Side effects:
        Writes ``pickle_model`` and ``pickle_scaler`` in the current
        working directory.
    """
    # os.path.join('', name) == name, so an empty PATH keeps resolving the
    # files relative to the current working directory.
    base_dir = PATH or ''

    # Load dataset from Olist Db. The order payments table is not needed by
    # any feature below, so it is not loaded.
    orders = pd.read_csv(os.path.join(base_dir, 'olist_orders_dataset.csv'))
    order_items = pd.read_csv(
        os.path.join(base_dir, 'olist_order_items_dataset.csv'))
    order_reviews = pd.read_csv(
        os.path.join(base_dir, 'olist_order_reviews_dataset.csv'))
    customers = pd.read_csv(
        os.path.join(base_dir, 'olist_customers_dataset.csv'))
    cities = pd.read_csv(os.path.join(base_dir, 'BRAZIL_CITIES.csv'),
                         sep=';')

    #################
    ## Orders dataset
    #################

    # Keep only delivered orders and the variables of interest, then drop
    # rows with a missing value (e.g. no purchase timestamp).
    is_delivered = orders['order_status'] == 'delivered'
    orders = orders.loc[is_delivered, ['order_id',
                                       'customer_id',
                                       'order_purchase_timestamp']]
    orders = orders.dropna(axis=0)

    #################
    ## Order_items dataset
    #################

    # Group the items by order to get the total price and total freight
    order_infos = order_items.groupby('order_id')\
        .agg(tot_price=('price', 'sum'),
             tot_freight_value=('freight_value', 'sum')).reset_index()

    #################
    ## Order_reviews dataset
    #################

    # Only the score is used downstream; order_id is the merge key
    order_reviews = order_reviews.loc[:, ['order_id', 'review_score']]

    #################
    ## City size feature creation
    #################

    # Customer city case homogenisation
    customers['customer_city'] = customers['customer_city']\
        .apply(pretty_string)

    # Keep variables of interest (copy so the column assignment below does
    # not act on a slice of the loaded frame)
    cities = cities.loc[:, ['CITY',
                            'STATE',
                            'IBGE_RES_POP']].copy()

    # Same normalisation as the customer city so both sides can be joined
    cities['CITY'] = cities['CITY'].apply(pretty_string)

    #################
    ## Merging
    #################

    df_orders = orders.merge(order_infos,
                             how='inner',
                             on='order_id')

    df_orders = df_orders.merge(order_reviews,
                                how='inner',
                                on='order_id')

    df_orders = df_orders.merge(customers,
                                how='inner',
                                on='customer_id')
    df_orders = df_orders.drop('customer_id', axis=1)

    df_orders = df_orders.merge(cities,
                                how='inner',
                                left_on=['customer_city',
                                         'customer_state'],
                                right_on=['CITY', 'STATE'])
    col_drop = ['customer_zip_code_prefix',
                'customer_state',
                'customer_city',
                'CITY',
                'STATE']
    df_orders = df_orders.drop(col_drop, axis=1)

    # Drop rows with a missing value (e.g. missing city population)
    df_orders = df_orders.dropna(axis=0)

    df_orders['order_purchase_timestamp'] = \
        pd.to_datetime(df_orders['order_purchase_timestamp'])

    #################
    ## Automated outliers drop with isolation forest
    #################

    # Creating features dataframe
    X = df_orders.loc[:, ['tot_price',
                          'tot_freight_value']]

    # Scaling features using Robust Scaler
    outlier_scaler = RobustScaler()
    X_scaled = outlier_scaler.fit_transform(X)

    # contamination=0.00001 asks for 0.001 percent of the rows to be
    # flagged as outliers
    outlier_model = IsolationForest(contamination=0.00001)
    outlier_model.fit(X_scaled)

    # predict() returns -1 for outliers; keep every other row
    is_outlier = outlier_model.predict(X_scaled) == -1
    df_orders = df_orders.loc[~is_outlier]

    #################
    ## Features Creation
    #################

    # Snapshot taken one day after the last order timestamp, so the most
    # recent customer has a recency of at least one day
    snapshot_date = df_orders['order_purchase_timestamp'].max() \
        + timedelta(days=1)

    # Clustering model features, one row per unique customer
    df_client = df_orders.groupby(['customer_unique_id']).apply(lambda x: pd.Series({
        'recency': (snapshot_date - x['order_purchase_timestamp'].max()).days,
        'frequency': x['order_id'].count(),
        'monetary_value': (x['tot_price'] + x['tot_freight_value']).sum(),
        'avg_review_score': x['review_score'].mean(),
        'customer_city_size': x['IBGE_RES_POP'].mean()
    }))

    # We only keep clients with at least 2 purchases
    df_client = df_client[df_client['frequency'] > 1]

    #################
    ## Model fitting
    #################

    # Preprocessing for the clustering model; this scaler is the one that
    # gets pickled and must be applied to features at prediction time
    scaler = RobustScaler()
    client_scaled = scaler.fit_transform(df_client)

    # Five clusters model fitting
    best_model = KMeans(n_clusters=5,
                        max_iter=1000,
                        n_init=10,
                        random_state=0)
    best_model.fit(client_scaled)

    #################
    ## Save model as pickle file
    #################

    with open('pickle_model', 'wb') as file:
        pickle.dump(best_model, file)

    with open('pickle_scaler', 'wb') as file:
        pickle.dump(scaler, file)

    return best_model

In [35]:
# Train the clustering model with the CSV files located in the current
# working directory; this also writes `pickle_model` and `pickle_scaler`.
update_model(PATH="")

KMeans(max_iter=1000, n_clusters=5, random_state=0)

In [45]:
 # Open the saved model
# Reload the fitted KMeans model from the `pickle_model` file.
# pickle.load can execute arbitrary code: only load files you trust.
with open('pickle_model', 'rb') as file:
    best_model = pickle.load(file)

# Reload the scaler from the `pickle_scaler` file.
with open('pickle_scaler', 'rb') as file:
    scaler = pickle.load(file)

# NOTE(review): `customer_info` is not defined in the visible cells.
# It presumably holds one row of customer features in the column order
# used for training - confirm before running this cell.
df = scaler.transform(customer_info)
best_model.predict(df)


array([3], dtype=int32)

In [46]:
import numpy as np
import pickle

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

class Olist_cluster:
    '''Predict the Olist customer cluster from a pickled scaler and model.

    Arguments:
        scaler_path: Path of the pickled scaler
        model_path: Path of the pickled clustering model

    Attributes:
        scaler: scaler object loaded from scaler_path
        model: model object loaded from model_path
        olist_cluster: mapping from cluster index to cluster name
    '''

    # Feature order produced by the training code of this notebook
    # (`update_model`). When a features dict provides all of these keys they
    # are read by name, so the dict insertion order no longer matters.
    FEATURE_ORDER = ('recency',
                     'frequency',
                     'monetary_value',
                     'avg_review_score',
                     'customer_city_size')

    def __init__(self, scaler_path:str, model_path:str):
        self.scaler = self.get_scaler(scaler_path)
        self.model = self.get_model(model_path)
        self.olist_cluster = {
            0: 'Former average',
            1: 'Current average',
            2: 'Big city average',
            3: 'Unsatisfied',
            4: 'Champions'}

    @staticmethod
    def _load_pickle(path:str):
        '''Return the object stored in the pickle file at path.'''
        # SECURITY: pickle.load can execute arbitrary code while loading;
        # only open files produced by a trusted training run.
        with open(path, "rb") as f:
            return pickle.load(f)

    def get_model(self, model_path:str) -> 'KMeans':
        '''Open the pkl file which store the model.
        Arguments:
            model_path: Path model with pkl extension

        Returns:
            model: Model object
        '''
        return self._load_pickle(model_path)

    def get_scaler(self, scaler_path:str) -> 'RobustScaler':
        '''Open the pkl file which store the scaler.
        Arguments:
            scaler_path: Path scaler with pkl extension

        Returns:
            scaler: scaler object (the training code of this notebook
                pickles a RobustScaler)
        '''
        return self._load_pickle(scaler_path)

    def make_prediction(self, features:dict)->str:
        '''Predicts the cluster.
        Argument:
            features: dict of feature name -> value. When every name of
                FEATURE_ORDER is present, the values are read by name in
                that order; otherwise the values are used in the dict's
                insertion order.

        return:
            cluster: str, name of the predicted cluster

        Raises:
            KeyError: if the predicted index has no entry in olist_cluster
        '''
        if all(name in features for name in self.FEATURE_ORDER):
            values = [features[name] for name in self.FEATURE_ORDER]
        else:
            # Unknown key names: fall back to the caller's ordering
            values = list(features.values())

        # One sample with n features -> shape (1, n)
        row = np.array(values).reshape(1, -1)
        row_scaled = self.scaler.transform(row)
        pred = self.model.predict(row_scaled)[0]
        return self.olist_cluster[pred]

array([[ 0.02317497,  0.2       , -0.04763175, -0.125     ,  1.67201465]])