## About

This notebook demonstrates how the training and validation datasets were derived from `marketing_campaign.csv`, and how an additional synthetic dataset was generated.

### Downloading the data

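Open a Linux terminal in this notebook's directory and download the file into the `data/` folder, which is where the cells below read it from:

```bash
$ wget -P data https://raw.githubusercontent.com/Demiga-g/iFood-Marketing-Analysis/main/marketing_campaign.csv
```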

In [1]:
import pandas as pd

# Load the raw campaign file (fields are separated by ';') and preview the first rows
df = pd.read_csv('./data/marketing_campaign.csv', sep=';')
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [2]:
# Check data information: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [3]:
# Parse the `Dt_Customer` strings into datetimes, store them back on the
# frame and confirm the resulting dtype
enrollment_dates = pd.to_datetime(df['Dt_Customer'])
df['Dt_Customer'] = enrollment_dates
df['Dt_Customer'].dtype

dtype('<M8[ns]')

In [4]:
# Find the first and last calendar year covered by `Dt_Customer`
enrollment_dates = df["Dt_Customer"]
start_period, end_period = enrollment_dates.min().year, enrollment_dates.max().year

(start_period, end_period)

(2012, 2014)

In [5]:
# Training and Validation Data

# Rows whose `Dt_Customer` year is earlier than 2014 form the training set;
# the index is renumbered from 0 and the old index is discarded
is_before_2014 = df['Dt_Customer'].dt.year < 2014
training_data = df.loc[is_before_2014].reset_index(drop=True)

# Check shape of training data
training_data.shape

(1683, 29)

In [6]:
# Rows whose `Dt_Customer` year is 2014 or later form the validation set;
# the index is renumbered from 0 and the old index is discarded
is_2014_onwards = df['Dt_Customer'].dt.year >= 2014
validation_data = df.loc[is_2014_onwards].reset_index(drop=True)

# Check shape of validation data
validation_data.shape

(557, 29)

In [7]:
# NOTE(review): both exports below are commented out, so running this
# notebook does not write the training/validation CSV files. Uncomment the
# `to_csv` lines to save them.

# # Save training data to folder
# training_data.to_csv('./data/training_data.csv', index=False)

# # Save validation data to folder
# validation_data.to_csv('./data/validation_data.csv', index=False)

## Generating synthetic data

In [8]:
import numpy as np
from datetime import datetime, timedelta

# Seed a dedicated random generator so that "Restart & Run All" always
# produces the same synthetic dataset
SEED = 42
rng = np.random.default_rng(SEED)

# Define the number of instances
num_instances = 6062


def _with_missing(values, n_missing):
    """Return a float copy of `values` with exactly `n_missing` NaN entries.

    Positions are drawn without replacement: drawing with replacement can
    pick the same position twice and silently yield fewer missing values
    than requested.
    """
    result = np.asarray(values).astype('float')
    missing_idx = rng.choice(len(result), size=n_missing, replace=False)
    result[missing_idx] = np.nan
    return result


# Generate the ID column with unique values
ids = rng.choice(np.arange(2178, 109327), size=num_instances, replace=False)

# Generate the Year_Birth column with values ranging from 1914 to 1996 (30 missing)
year_birth = _with_missing(rng.integers(1914, 1997, size=num_instances), 30)

# Generate the Income column with values ranging from 1873 to 8870588 (80 missing)
income = _with_missing(rng.integers(1873, 8870589, size=num_instances), 80)

# Generate the Kidhome column with values ranging from 0 to 6 (5 missing)
kidhome = _with_missing(rng.integers(0, 7, size=num_instances), 5)

# Generate the Teenhome column with values ranging from 0 to 3
teenhome = rng.integers(0, 4, size=num_instances)

# Generate the Recency column with values ranging from 0 to 80
recency = rng.integers(0, 81, size=num_instances)

# Generate the MntWines column with values ranging from 0 to 2508 (33 missing)
mntwines = _with_missing(rng.integers(0, 2509, size=num_instances), 33)

# Generate the MntFruits column with values ranging from 0 to 240
mntfruits = rng.integers(0, 241, size=num_instances)

# Generate the MntMeatProducts column with values ranging from 0 to 2889 (18 missing)
mntmeatproducts = _with_missing(rng.integers(0, 2890, size=num_instances), 18)

# Generate the MntFishProducts column with values ranging from 0 to 400
mntfishproducts = rng.integers(0, 401, size=num_instances)

# Generate the MntSweetProducts column with values ranging from 0 to 400 (21 missing)
mntsweetproducts = _with_missing(rng.integers(0, 401, size=num_instances), 21)

# Generate the MntGoldProds column with values ranging from 0 to 500 (5 missing)
mntgoldprods = _with_missing(rng.integers(0, 501, size=num_instances), 5)

# Generate the NumDealsPurchases column with values ranging from 0 to 38
numdealspurchases = rng.integers(0, 39, size=num_instances)

# Generate the NumWebPurchases column with values ranging from 0 to 50
numwebpurchases = rng.integers(0, 51, size=num_instances)

# Generate the NumCatalogPurchases column with values ranging from 0 to 20
numcatalogpurchases = rng.integers(0, 21, size=num_instances)

# Generate the NumStorePurchases column with values ranging from 0 to 25
numstorepurchases = rng.integers(0, 26, size=num_instances)

# Generate the NumWebVisitsMonth column with values ranging from 0 to 17
numwebvisitsmonth = rng.integers(0, 18, size=num_instances)

# Generate the Complain column with values of either 0 or 1
complain = rng.choice([0, 1], size=num_instances, p=[0.88, 0.12])

# Generate the AcceptedCmp columns with values of either 0 or 1
acceptedcmp3 = rng.choice([0, 1], size=num_instances, p=[0.9272, 0.0728])
acceptedcmp4 = rng.choice([0, 1], size=num_instances, p=[0.9254, 0.0746])
acceptedcmp5 = rng.choice([0, 1], size=num_instances, p=[0.9272, 0.0728])
acceptedcmp1 = rng.choice([0, 1], size=num_instances, p=[0.9357, 0.0643])
acceptedcmp2 = rng.choice([0, 1], size=num_instances, p=[0.9866, 0.0134])

# Generate the Z_CostContact and Z_Revenue columns with constant values of 1
z_costcontact = np.ones(num_instances, dtype='int')
z_revenue = np.ones(num_instances, dtype='int')

# Generate the Response column with values of either 0 or 1
response = rng.choice([0, 1], size=num_instances, p=[0.55, 0.45])

# Generate the Education column with values 'Graduation', 'PhD', 'Master', 'Basic', '2n Cycle'
education = rng.choice(['Graduation', 'PhD', 'Master', 'Basic', '2n Cycle'], size=num_instances)

# Generate the Marital_Status column with values 'Single', 'Together', 'Married', 'Divorced', 'Widow', 'Alone', 'Absurd', 'YOLO'
marital_status = rng.choice(['Single', 'Together', 'Married', 'Divorced', 'Widow', 'Alone', 'Absurd', 'YOLO'], size=num_instances)

# Generate the Dt_Customer column with values between January 04, 2012, and June 07, 2014
start_date = datetime(2012, 1, 4)
end_date = datetime(2014, 6, 7)
# `+ 1` makes the upper bound inclusive so that `end_date` itself can be drawn
day_offsets = rng.integers(0, (end_date - start_date).days + 1, size=num_instances)
dt_customer = (
    np.datetime64(start_date, 'D') + day_offsets.astype('timedelta64[D]')
).astype('datetime64[ns]')
# Blank out exactly 10 distinct dates; they are rendered as the string 'NaT' below
dt_customer[rng.choice(num_instances, size=10, replace=False)] = np.datetime64('NaT')
dt_customer = np.datetime_as_string(dt_customer, unit='D')

In [9]:
# Pair every output column with its generated array, listed in the order the
# columns should appear in the resulting frame
column_arrays = [
    ('ID', ids),
    ('Year_Birth', year_birth),
    ('Income', income),
    ('Kidhome', kidhome),
    ('Teenhome', teenhome),
    ('Recency', recency),
    ('MntWines', mntwines),
    ('MntFruits', mntfruits),
    ('MntMeatProducts', mntmeatproducts),
    ('MntFishProducts', mntfishproducts),
    ('MntSweetProducts', mntsweetproducts),
    ('MntGoldProds', mntgoldprods),
    ('NumDealsPurchases', numdealspurchases),
    ('NumWebPurchases', numwebpurchases),
    ('NumCatalogPurchases', numcatalogpurchases),
    ('NumStorePurchases', numstorepurchases),
    ('NumWebVisitsMonth', numwebvisitsmonth),
    ('Complain', complain),
    ('AcceptedCmp3', acceptedcmp3),
    ('AcceptedCmp4', acceptedcmp4),
    ('AcceptedCmp5', acceptedcmp5),
    ('AcceptedCmp1', acceptedcmp1),
    ('AcceptedCmp2', acceptedcmp2),
    ('Z_CostContact', z_costcontact),
    ('Z_Revenue', z_revenue),
    ('Response', response),
    ('Education', education),
    ('Marital_Status', marital_status),
    ('Dt_Customer', dt_customer),
]
data = dict(column_arrays)

# Build the synthetic frame. This rebinds `df`, which earlier in the notebook
# held the data read from marketing_campaign.csv.
df = pd.DataFrame(data)

In [12]:
# Write the current `df` to ./data/synthetic-data.csv without the index column
df.to_csv('./data/synthetic-data.csv', index=False)