In [1]:
import math
import os
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype

import warnings
# Suppress every Python warning for the rest of the session. Both calls
# register a catch-all "ignore" filter, so the second one is redundant.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Make SimHei the preferred sans-serif font for matplotlib text.
# NOTE(review): SimHei is a CJK typeface, presumably chosen so Chinese plot
# labels render - no such labels are visible in this part of the notebook.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw negative numbers with the ASCII hyphen instead of the Unicode minus
# sign, which fonts such as SimHei may not contain.
plt.rcParams['axes.unicode_minus'] = False

# Show floats with thousands separators and two decimals (e.g. 3,345.93).
pd.set_option('display.float_format',lambda x: f"{x:,.2f}")
# None removes the row/column display limits, so frames are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Cell contents longer than 100 characters are shortened when displayed.
pd.set_option('max_colwidth', 100)

from IPython.display import display, HTML
# display(HTML(data.to_html(index = False)))

In [2]:
# Data source: https://www.kaggle.com/datasets/davinwijaya/customer-retention
# Outcome (y): the `conversion` column
# Treatment: the `offer` column

In [3]:
# Load the raw customer-retention CSV; the path is relative to the directory
# the notebook kernel is running in.
data = pd.read_csv('../data/customer_retention/data.csv')

In [4]:
# include='all' summarises every column: count/unique/top/freq for the text
# columns and mean/std/quantiles for the numeric ones.
data.describe(include = 'all')

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
count,64000.0,64000.0,64000.0,64000.0,64000,64000.0,64000,64000,64000.0
unique,,,,,3,,3,3,
top,,,,,Surburban,,Web,Buy One Get One,
freq,,,,,28776,,28217,21387,
mean,5.76,242.09,0.55,0.55,,0.5,,,0.15
std,3.51,256.16,0.5,0.5,,0.5,,,0.35
min,1.0,29.99,0.0,0.0,,0.0,,,0.0
25%,2.0,64.66,0.0,0.0,,0.0,,,0.0
50%,6.0,158.11,1.0,1.0,,1.0,,,0.0
75%,9.0,325.66,1.0,1.0,,1.0,,,0.0


In [5]:
def convert_to_num(data):
    """Return a copy of ``data`` with its text columns replaced by integer codes.

    Every column whose dtype is ``object`` or one of pandas' string dtypes is
    encoded; all other columns are copied through unchanged. Within a column,
    codes follow the order of first appearance (the order reported by
    ``Series.unique()``): the first distinct label becomes 0, the next 1, and
    so on. The label-to-code mapping of each encoded column is printed as
    ``<column> : {label: code, ...}`` for reference.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which the text columns hold integer codes.
    """
    data_copy = data.copy()
    # Testing only ``dtype == "object"`` misses columns stored with a dedicated
    # string dtype, which would then be returned un-encoded, so accept both.
    text_cols = [
        col for col in data.columns
        if pd.api.types.is_object_dtype(data[col].dtype)
        or pd.api.types.is_string_dtype(data[col].dtype)
    ]
    for col in text_cols:
        labels = data[col].unique().tolist()
        # Build the lookup once and reuse it for both the encoding and the
        # printed report, instead of a linear ``list.index`` scan per row.
        mapping = {label: code for code, label in enumerate(labels)}
        data_copy[col] = data[col].map(mapping)
        print('{} : {}'.format(col, mapping))
    return data_copy

In [6]:
# Integer-encode the text columns of `data`; the mappings printed below record
# which code stands for which label.
df = convert_to_num(data)

zip_code : {'Surburban': 0, 'Rural': 1, 'Urban': 2}
channel : {'Phone': 0, 'Web': 1, 'Multichannel': 2}
offer : {'Buy One Get One': 0, 'No Offer': 1, 'Discount': 2}


In [7]:
# Relabel the outcome and treatment columns in a single rename pass.
df = df.rename(columns={
    'conversion': 'target',
    'offer': 'treatment',
})

In [8]:
# Re-run the summary on the encoded frame: every column is numeric now, so the
# former text columns report mean/std/quantiles of their integer codes.
df.describe(include = 'all')

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
count,64000.0,64000.0,64000.0,64000.0,64000.0,64000.0,64000.0,64000.0,64000.0
mean,5.76,242.09,0.55,0.55,0.95,0.5,0.68,1.0,0.15
std,3.51,256.16,0.5,0.5,0.92,0.5,0.68,0.82,0.35
min,1.0,29.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,64.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,158.11,1.0,1.0,1.0,1.0,1.0,1.0,0.0
75%,9.0,325.66,1.0,1.0,2.0,1.0,1.0,2.0,0.0
max,12.0,3345.93,1.0,1.0,2.0,1.0,2.0,2.0,1.0


In [9]:
# Save the encoded frame next to the raw file; index=False leaves the row
# index out of the CSV.
df.to_csv('../data/customer_retention/data_treated.csv', index = False)