<center><h1>Scripts del Proyecto</h1></center>

# 1. Preparación de Datos

In [75]:
import pandas as pd
import numpy as np
import lifetimes

# Read the original (raw) transactions file into `df`

df = pd.read_csv('../data/raw/Online_Retail.csv')

In [76]:
# Keep only rows whose Quantity and UnitPrice are both greater than zero and
# whose InvoiceNo does not contain "C". All three conditions are combined
# into a single row mask; rows with a missing InvoiceNo are kept (na=False).

df = df[
    (df['Quantity'] > 0)
    & (df['UnitPrice'] > 0)
    & ~df['InvoiceNo'].str.contains("C", na=False)  # drop returned items
]

In [77]:
# Remove rows that contain any missing value.
#
# `df` at this point is the result of boolean filtering, so mutating it with
# inplace=True can raise SettingWithCopyWarning. Rebinding the name to an
# independent copy yields the same rows and lets later cells add or overwrite
# columns without touching (or warning about) the parent frame.

df = df.dropna().copy()

In [78]:
# Manejo de Outliers

def find_boundaries(df, variable, q1=0.05, q2=0.95):
    """Return the quantile-based capping limits of one column.

    Parameters
    ----------
    df : DataFrame holding the column.
    variable : name of the column to inspect.
    q1 : quantile used as the lower limit (default 0.05).
    q2 : quantile used as the upper limit (default 0.95).

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- note the upper limit comes
        first.
    """
    column = df[variable]
    low_cut, high_cut = (column.quantile(q) for q in (q1, q2))
    return high_cut, low_cut

def capping_outliers(df,variable):
    """Cap one column of ``df`` in place at its quantile boundaries.

    Values above the upper boundary are replaced by the upper boundary and
    values below the lower boundary by the lower boundary; everything else is
    left unchanged. The boundaries come from ``find_boundaries`` with its
    default quantiles. The column is overwritten on ``df`` itself and nothing
    is returned.
    """
    upper_boundary, lower_boundary = find_boundaries(df, variable)
    values = df[variable]
    # Apply the floor first, then the ceiling on top of the floored values.
    floored = np.where(values < lower_boundary, lower_boundary, values)
    df[variable] = np.where(values > upper_boundary, upper_boundary, floored)

# Cap UnitPrice and Quantity in place, using the default quantile boundaries
# of find_boundaries (0.05 and 0.95).
capping_outliers(df,'UnitPrice')
capping_outliers(df,'Quantity')

In [79]:
# Keep only the transactions whose Country is 'United Kingdom'

df = df.loc[df['Country'].eq('United Kingdom')]

In [80]:
# Create the 'Total Price' column: unit price multiplied by quantity.
#
# `df` is a boolean-filtered slice here, so assigning a column onto it
# directly raises SettingWithCopyWarning. assign() builds a new frame with
# the extra column instead; the resulting `df` has the same rows and columns
# as the direct assignment would produce.

df = df.assign(**{'Total Price': df['UnitPrice'] * df['Quantity']})

In [81]:
# Build the per-customer summary dataset from the transaction rows, with the
# observation window ending on 2011-12-09.

clv = lifetimes.utils.summary_data_from_transaction_data(
    df,
    customer_id_col='CustomerID',
    datetime_col='InvoiceDate',
    monetary_value_col='Total Price',
    observation_period_end='2011-12-09',
)

# Keep only customers whose `frequency` is greater than 1.
# NOTE(review): presumably `frequency` counts repeat purchases, so this means
# "shopped more than 2 times" -- confirm against the lifetimes documentation.

clv = clv.loc[clv['frequency'].gt(1)]

In [82]:
# Export the processed data
# NOTE(review): to_csv writes the frame's index as the first column by
# default -- presumably the customer id here; confirm.

clv.to_csv('../data/processed/rfm_data.csv')

# 2. Modelado y Segmentación

In [83]:
import pandas as pd
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
import dill

In [84]:
# Read the RFM file written by the data-preparation section

clv = pd.read_csv('../data/processed/rfm_data.csv')

<font size=3><b>Predicting Number of Purchases</b></font>

In [85]:
# BG/NBD model for predicting the number of purchases, fitted on the
# frequency, recency and T columns of the summary table.

bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(frequency=clv['frequency'], recency=clv['recency'], T=clv['T'])

<lifetimes.BetaGeoFitter: fitted with 1738 subjects, a: 0.00, alpha: 112.07, b: 0.00, r: 2.38>

In [86]:
# Save the fitted BG/NBD model to disk with dill

with open('../models/bgf_model.pkl', 'wb') as file:
    dill.dump(bgf, file)

In [87]:
# Load the BG/NBD model back from disk with dill
# NOTE: like pickle, dill can execute arbitrary code while loading -- only
# open model files from a trusted source.

with open('../models/bgf_model.pkl', 'rb') as file:
    bgf_loaded = dill.load(file)

In [88]:
# Disabled alternative: load the saved model with pickle and show its summary.
#import pickle
#pickle.load(open('../models/bgf_model.pkl', 'rb')).summary

In [89]:
# Expected number of purchases in 6 months, predicted with the reloaded BG/NBD model

t = 180 # forecast horizon: 180 days, i.e. six 30-day periods
clv['expected_purc_6_months'] = bgf_loaded.conditional_expected_number_of_purchases_up_to_time(t, clv['frequency'], clv['recency'], clv['T'])

<font size=3><b>Predicting the most likely value per transaction</b></font>

In [90]:
# Gamma-Gamma model, fitted on each customer's frequency and monetary value.

ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(frequency=clv["frequency"], monetary_value=clv["monetary_value"])

<lifetimes.GammaGammaFitter: fitted with 1738 subjects, p: 3.80, q: 0.35, v: 3.73>

In [91]:
# Save the fitted Gamma-Gamma model to disk with dill

with open('../models/ggf_model.pkl', 'wb') as file:
    dill.dump(ggf, file)

In [92]:
# Load the Gamma-Gamma model back from disk with dill
# NOTE: like pickle, dill can execute arbitrary code while loading -- only
# open model files from a trusted source.

with open('../models/ggf_model.pkl', 'rb') as file:
    ggf_loaded = dill.load(file)

In [93]:
# Customer Lifetime Value over the next 6 months.
#
# Uses the models reloaded from disk (ggf_loaded / bgf_loaded), matching the
# expected-purchases cell, so that the persisted artefacts are the ones
# actually exercised rather than the in-memory fitters.

clv['6_Months_CLV']=ggf_loaded.customer_lifetime_value(bgf_loaded,
                                   clv["frequency"],
                                   clv["recency"],
                                   clv["T"],
                                   clv["monetary_value"],
                                   time=6,              # horizon, in months
                                   freq='D',            # time unit of T (days)
                                   discount_rate=0.01)  # monthly discount rate

<font size=3><b>Segmenting Customers by CLV</b></font>

In [94]:
# Split customers into four quantile bins of 6-month CLV; the labels are
# listed from the lowest-value bin to the highest.
clv['Segment'] = pd.qcut(
    clv['6_Months_CLV'],
    q=4,
    labels=['Hibernating', 'Need Attention', 'Loyal Customers', 'Champions'],
)

In [95]:
# Export the segmented data
# NOTE(review): `clv` was read with read_csv without index_col, so this
# default to_csv call also writes the positional index as an unnamed first
# column -- pass index=False if downstream consumers do not expect it.

clv.to_csv('../data/segments/segmentos.csv')