![AuroraAI](images/auroraai-small.png)

# Data preparation script for AuroraAI MunRipari data

This script prepares the MunRipari data for clustercards generation.

In [None]:
%matplotlib inline

import os
from collections import defaultdict
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from yamlconfig import read_config

pd.set_option('display.max_columns', None)
sns.set()

## Read and define variables

In [None]:
# Load the project configuration and keep its 'munripari' section in `c`,
# then echo that section so the active settings are visible in the output.
config = read_config()
c = config['munripari']
print('Settings:', c, sep='\n')

In [None]:
# Input/output locations and naming, read from the 'munripari' config section.
DATADIR, DATAFILE, METAFILE, OUTPREFIX = (
    c[key] for key in ('datadir', 'datafile', 'metafile', 'outprefix')
)

# CSV parsing options. Commented-out alternative settings:
#INDEX_COL = None
#CSV_SEP = ';'
INDEX_COL, CSV_SEP = 0, ','

# A column is dropped when its NaN count exceeds this fraction of the rows.
NAN_LIMIT = 0.25

# NOTE(review): not used in the visible cells; presumably a plot marker size.
POINTSIZE = 5

In [None]:
# Full paths of the data file and of the variable-metadata file.
datafilename = "{}/{}".format(DATADIR, DATAFILE)
metafilename = "{}/{}".format(DATADIR, METAFILE)

# Fail early and name the offending path. An explicit raise is used instead
# of `assert` so the check is not skipped when Python runs with -O.
for required_path in (datafilename, metafilename):
    if not os.path.isfile(required_path):
        raise FileNotFoundError("File missing: {}".format(required_path))

## Read data

### Read metadata for variables

In [None]:
# Load the variable metadata sheet, indexed by the "Muuttuja" column.
df_labels = pd.read_excel(metafilename, index_col="Muuttuja")

# Empty cells become '0' so that the columns can be cast to int below.
df_labels.replace(np.nan, '0', inplace=True)

# Every column except "Kuvaus" (presumably the description text) is cast to
# int. The loop variable is deliberately not called `c`: that name holds the
# config section read earlier, and overwriting it breaks re-running the cell
# that reads DATADIR etc. from it.
for col in df_labels.columns:
    if col == "Kuvaus":
        continue
    df_labels[col] = df_labels[col].astype(int)

df_labels.info()
df_labels.sample(5)

In [None]:
# Variables whose 'Taustamuuttuja' value is positive form the list of
# background variables (taken from the metadata index).
is_background = df_labels['Taustamuuttuja'] > 0
background = list(is_background.index[is_background.to_numpy()].values)
print(background)

### Read actual data from CSV

In [None]:
# Parse the data file; a lone space is additionally treated as a missing value.
csv_options = dict(sep=CSV_SEP, index_col=INDEX_COL, na_values=" ")
df = pd.read_csv(datafilename, **csv_options)

# Quick sanity check: column overview and five random rows.
df.info()
df.sample(5)

In [None]:
# Show pandas' default summary statistics for the loaded data.
df.describe()

### Plot some variables

In [None]:
def plot_hist(var):
    """Draw a bar chart of the value counts of column `var` of the global `df`.

    Missing values are counted as a category of their own.
    """
    df[var].value_counts(dropna=False).plot.bar(title=var, grid=True, rot=20)


plot_hist('Yksikkö')

In [None]:
def plot_hist2(var, **kwargs):
    """Plot the value counts of `df[var]` as bars and save them as '<var>.png'.

    Missing values are counted as a category of their own. The image is
    written relative to the current working directory. Extra keyword
    arguments are forwarded to `Axes.set_xticklabels`.
    """
    counts = df[var].value_counts(dropna=False)
    positions = range(len(counts))
    labels = list(counts.index)

    fig, ax = plt.subplots(1, 1, figsize=(8, 5))
    ax.bar(positions, counts)
    ax.set_title(var)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, **kwargs)
    fig.savefig(f"{var}.png", bbox_inches='tight')


plot_hist2('Yksikkö', rotation=45, ha='right', rotation_mode='anchor')

In [None]:
# Bar chart of the 'Alue' column; plot_hist2 also saves it as 'Alue.png'.
plot_hist2('Alue')

## Process data

### Filtering

In [None]:
# Placeholder: no filtering is applied in this cell.
pass

#### NaNs

In [None]:
# Count missing values per column and select the columns whose count exceeds
# the NAN_LIMIT fraction of the number of rows.
nan_threshold = NAN_LIMIT * len(df)
ser_nans = df.isna().sum()
ser_too_many_nans = ser_nans[ser_nans > nan_threshold]
too_many_nans = list(ser_too_many_nans.index.values)

print(f'Removing variables with more than {nan_threshold} NaNs:')
print(ser_too_many_nans, end='\n\n')

# Drop the selected columns and show a short summary of what is left.
df = df.drop(columns=ser_too_many_nans.index)
df.info(verbose=False)

In [None]:
# Display the data frame after the NaN-heavy columns have been dropped.
df

### Background variables

In [None]:
# Columns dropped for having too many NaNs can no longer be used as
# background variables, so take them off the list.
for dropped in too_many_nans:
    if dropped not in background:
        continue
    print('Removing', dropped, 'from background variables')
    background.remove(dropped)
print(background)

In [None]:
# Split the background columns off into their own frame; the remaining
# columns stay in `df` as the actual data.
df_bg = df.loc[:, background]
df = df.drop(columns=background)

print(len(background), 'background variables removed:', background)
print('Actual data has', df.shape[0], 'rows,', df.shape[1], 'columns')
print('Background data has', df_bg.shape[0], 'rows,', df_bg.shape[1], 'columns')

## Save data as a CSV file

In [None]:
# Write the actual data to a date-stamped CSV file in the data directory.
# The data frame index is not written (index=False).
now = datetime.now()
todaystr = f"{now:%Y-%m-%d}"
outfile = f"{DATADIR}/{OUTPREFIX}-data-{todaystr}.csv"
print(outfile)
df.to_csv(outfile, index=False)

In [None]:
# Write the background variables to a date-stamped CSV file in the data
# directory. The data frame index is not written (index=False).
now = datetime.now()
todaystr = f"{now:%Y-%m-%d}"
outfile = f"{DATADIR}/{OUTPREFIX}-bg-{todaystr}.csv"
print(outfile)
df_bg.to_csv(outfile, index=False)