![AuroraAI](images/auroraai-small.png)

# Data preparation script for marketing campaign data

This script prepares the marketing campaign data for clustercards generation.

In [None]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from datetime import datetime
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from yamlconfig import read_config

## Read and define variables

In [None]:
# Load the project configuration and keep its 'mc' section, which supplies
# the data locations used below.
config = read_config()
c = config['mc']
print('Settings:', c, sep='\n')

In [None]:
# Data directory and file names are taken from the 'mc' configuration section.
DATADIR, DATAFILE, METAFILE = (
    c[key] for key in ('datadir', 'datafile', 'metafile')
)

# Position of the column in the data file that is used as the row index.
INDEX_COL = 0
# Field separator of the data file (switch to '\t' for tab-separated input).
CSV_SEP = ';'

In [None]:
# Build the full paths of the data and metadata files and fail early, with a
# message that names the offending path, if either file is missing.
datafilename = os.path.join(DATADIR, DATAFILE)
metafilename = os.path.join(DATADIR, METAFILE)
for required_path in (datafilename, metafilename):
    # Raise explicitly rather than assert: assert statements are stripped
    # when Python runs with -O, and a bare "File missing" does not tell the
    # user which of the two files could not be found.
    if not os.path.isfile(required_path):
        raise FileNotFoundError("File missing: {}".format(required_path))

## Read data

### Metadata for variables

In [None]:
# Variable metadata. The row labelled 'Taustamuuttuja' (Finnish for
# "background variable") marks, with a value greater than zero, the columns
# that are treated as background variables.
df_vars = pd.read_excel(metafilename, index_col=0)
bg_flags = df_vars.loc['Taustamuuttuja'] > 0
# Names of the flagged columns, as a NumPy array.
background = bg_flags.index[bg_flags.to_numpy()].values

### Actual data variables

In [None]:
# Load the campaign data; the column at position INDEX_COL becomes the index.
df = pd.read_csv(
    datafilename,
    index_col=INDEX_COL,
    sep=CSV_SEP,
)
# Last expression of the cell: displays the table in the notebook.
df

## Process data

### Combine variables

In [None]:
# Inspect the distinct education categories and plot how often each occurs.
# The trailing ';' keeps the notebook from echoing the plot object.
education = df['Education']
print(education.unique())
education.hist();

In [None]:
# Named constant replacing the literal 2022 in the age computation; change it
# here to compute ages relative to another year.
REFERENCE_YEAR = 2022
# Education categories that are encoded as 1 in 'Education_78'.
# NOTE(review): the "78" suffix presumably refers to ISCED levels 7-8 — confirm.
HIGHER_EDUCATION = ['PhD', 'Master', '2n Cycle']
# Marital statuses that are encoded as 1 in 'Married'.
PARTNERED = ['Together', 'Married']

# Columns are written with bracket syntax: attribute-style assignment
# (df.col = ...) only updates a column that already exists and does not
# create a column when the name is new.
# Use format='%d-%m-%Y' instead if the dates in the file are day-first.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%Y-%m-%d')
df['Age'] = REFERENCE_YEAR - df['Year_Birth']
df['Education_78'] = df['Education'].isin(HIGHER_EDUCATION).astype(int)
df['Married'] = df['Marital_Status'].isin(PARTNERED).astype(int)
# Last expression of the cell: displays the table in the notebook.
df

In [None]:
# Distribution of the binary education indicator created above.
df['Education_78'].hist();

### NaNs

In [None]:
# Replace missing incomes with the mean of the incomes that are present.
mean_income = df['Income'].mean()
df['Income'] = df['Income'].fillna(value=mean_income)

### Background variables

In [None]:
# Move the background variables into their own frame and remove them from
# the main data set.
df_bg = df.loc[:, background]
df = df.drop(columns=background)
print(len(background), 'background variables removed:', background)
print('Data has', len(df), 'rows,', len(df.columns), 'columns')

In [None]:
# Verify that no missing values remain in the main data set. An explicit
# exception is raised instead of using assert so that the check still runs
# under `python -O`, and the message lists the affected columns with their
# null counts so the problem is easy to locate.
null_counts = df.isnull().sum()
if null_counts.sum() != 0:
    raise ValueError(
        "NULLs exist in columns: {}".format(
            null_counts[null_counts > 0].to_dict()
        )
    )

## Output

Save the actual and the background datasets as separate, date-stamped CSV files.

In [None]:
# Date stamp (YYYY-MM-DD) taken from the current local time; it is embedded
# in the names of the output files.
now = datetime.now()
todaystr = f"{now:%Y-%m-%d}"

In [None]:
# Write the prepared data set to a date-stamped CSV file in DATADIR.
# index=False: the DataFrame index (read from column INDEX_COL of the input)
# is not written to the output file.
outfile = f"{DATADIR}/mc-data-{todaystr}.csv"
print(outfile)
df.to_csv(outfile, index=False)

In [None]:
# Write the background variables to their own date-stamped CSV file in
# DATADIR. index=False: the DataFrame index is not written to the file.
outfile = f"{DATADIR}/mc-bg-{todaystr}.csv"
print(outfile)
df_bg.to_csv(outfile, index=False)