In [54]:
import pandas as pd
import numpy as np
from pycaret.classification import *
from helper_funcs import data_parser


In [55]:
# Load the raw dataset through the project helper.
# NOTE(review): add_geo_location=True presumably joins geolocation columns
# (the geolocation_*_x / geolocation_*_y columns excluded before modeling) — confirm in helper_funcs.data_parser.
df = data_parser.read_data(add_geo_location=True)


In [56]:
# Keep only fully distinct rows that carry a customer identifier.
# The explicit .copy() makes customer_df an independent frame, so adding the
# 'Churn' column below cannot raise SettingWithCopyWarning or write to a view of df.
customer_df = df[~df.duplicated()]
customer_df = customer_df[customer_df['customer_unique_id'].notnull()].copy()
total_customers = customer_df['customer_unique_id'].nunique()

# Label every row: Churn = 0 when the row's purchase falls within the last
# CHURN_WINDOW_DAYS days before the most recent purchase in the data, else 1.
# NOTE(review): an earlier comment here mentioned 180 days while the code used 90;
# the 90-day behavior is kept — confirm the intended window.
# NOTE(review): the label is assigned per row (order), not aggregated per
# customer_unique_id — confirm this is the intended churn definition.
CHURN_WINDOW_DAYS = 90
churn_cutoff = customer_df['order_purchase_timestamp'].max() - pd.Timedelta(days=CHURN_WINDOW_DAYS)
customer_df['Churn'] = np.where(customer_df['order_purchase_timestamp'] >= churn_cutoff, 0, 1)


In [57]:
# Columns left out of the modeling frame: identifiers, order/shipping date
# columns, and the geolocation city/state columns.
excluded_columns = [
    "order_id",
    "customer_id",
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "order_item_id",
    "product_id",
    "seller_id",
    "shipping_limit_date",
    "customer_unique_id",
    "geolocation_city_x",
    "geolocation_city_y",
    "geolocation_state_x",
    "geolocation_state_y",
]

# Boolean marker per column: True where the column is in the exclusion list.
mask = customer_df.columns.isin(excluded_columns)

# Keep the remaining columns and store both city columns as pandas categoricals.
city_dtypes = {"customer_city": "category", "seller_city": "category"}
df_model = customer_df.loc[:, ~mask].astype(city_dtypes)


In [58]:
# Columns that PyCaret should treat as categorical features.
categorical_columns = [
    "customer_city",
    "seller_city",
    "order_status",
    "seller_state",
    "payment_type",
    "customer_state",
    "product_category_name",
]

# Initialize the PyCaret classification experiment with 'Churn' as the target;
# session_id fixes the random seed for the experiment.
s = setup(
    data=df_model,
    target="Churn",
    categorical_features=categorical_columns,
    session_id=153,
    use_gpu=True,
)

In [59]:
# Restrict the comparison to this shortlist of PyCaret model IDs and keep the
# model that compare_models returns.
candidate_models = ['lr', 'dt', 'knn', 'rbfsvm', 'ridge', 'gbc', 'et']
best = compare_models(include=candidate_models)

In [None]:
# Persist the selected model under the name "best_model".
save_model(model=best, model_name="best_model")

In [None]:
# Feature-importance plot for the selected model.
# NOTE(review): display_format="streamlit" targets a Streamlit app rather than
# inline notebook rendering — confirm this is intended when run as a notebook.
fig = plot_model(
    estimator=best,
    plot="feature",
    display_format="streamlit",
)